diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..3e4e96171cb122fd7de3ef9c6ba4e92c40bc1414 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,17 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +torchinductor/fxgraph/22/f22et4hxfdbezzlil53lu2pcyq6hgd3sgpjtfwxqwvo3nhcfcvx3/iqgyrwyxmlbu22glkzz24rsbjiieww3sllxux2ttk4c6gtoiuba filter=lfs diff=lfs merge=lfs -text +torchinductor/fxgraph/6v/f6v2dym5xl4l4b2xlv35ic4ajld4mxcvhsvdsiwx2uug77q36cad/nhugjvtrt6cm53zizhnw673hj52m5m3kegaqvkjzkf4qh6d6rhm filter=lfs diff=lfs merge=lfs -text +torchinductor/fxgraph/ah/fahbtdmoejcqs352pnbnedqns63nbnu6hdbrwzvf6chptnsannjh/fhpbiokcxxh7ksbfgiljcvh7erywotuv4ddvlfcb4fk2ef7dd5c filter=lfs diff=lfs merge=lfs -text +torchinductor/fxgraph/bn/fbnlruhvmagcngqd5is2xjbucjaq7uf3sgsbdahfi6ovtehbhzyo/62gizmmmqz43cclymnr7ftyo5qt7ux4og6bc2xazrwstkjpsy2e filter=lfs diff=lfs merge=lfs -text +torchinductor/fxgraph/e2/fe2tjoiexjbavh5sakfaxvga43vsvwn5ev5bzhfjg76jvmtjqtbn/ejg3u4qymaxsvvl2vdequli7pwsrdjf5zdgqjkgbrxdsvgfv3h4 filter=lfs diff=lfs merge=lfs -text +torchinductor/fxgraph/fr/ffrx7clryowwzulnhruopihutvaxlycymqopsyoha6yecifyw2m2/g3wh462wylribiwz4th3gnlt5rtnrcb5bkad6w3yucxodv3q5ks filter=lfs diff=lfs merge=lfs -text +torchinductor/fxgraph/k6/fk6cfyjfeiu7xe6ebkapsnixuplqczgfc5534mitqsfkssbzjyak/4xid4w6sg2yg7xaseouf2vwhp2fyff56a2t6z6ownb3yw3g25rk filter=lfs diff=lfs merge=lfs -text +torchinductor/fxgraph/kj/fkjh2kykxecmnv6oe3zzwtjpek77nmrm35vgv2daxfgkim6xfk4u/gigbnwpmixz5epksvvrh4mtg3nxlpy724eojb3ajzvtepgvx7y4 filter=lfs diff=lfs merge=lfs -text +torchinductor/fxgraph/n6/fn6x7m44e35jdmh6iqj3eqiyrz7tbhzd3rqartt67myyrnickjmp/um3sgirsxogup4murdiaoy7dxu4ogolqsa343kh56kq24zd53fb filter=lfs diff=lfs merge=lfs -text +torchinductor/fxgraph/te/fte5y7bccssideiluepvpscj6srf7orxnfgql6to32ni27zf2uv2/vmrpdvw3meiqsf22oras6imorrybogkgp6jjr3ddtreaiuutais filter=lfs diff=lfs merge=lfs -text +torchinductor/fxgraph/tz/ftzd5ordyehsowurkwjjpkso24gayhyplcd6wz7xdv53fad276l6/36luuy7klcmb7554z63umrezysn6xbat5wxfaadxh4clxhcn2j7 filter=lfs diff=lfs merge=lfs -text +torchinductor/fxgraph/u4/fu47dchf76mmiajgnawm3xgek4ysnnmqaavupgy4cddyxygid6iq/725pjrxppjygb6kbca6zklwmn7iunv65thw23b5s4im6zi27j3i filter=lfs diff=lfs merge=lfs -text +torchinductor/fxgraph/uy/fuygegwmldon4qz3wvjs3cld4hnjz6yxh6aa2cmfsal4u3xxws43/l4w2mroymid3qdffvzt4wffavpm5it6rzi6lmbvxzzezfkbavuo filter=lfs diff=lfs merge=lfs -text +torchinductor/fxgraph/w5/fw5vzdkweh3kv3fm3mnal4wu63gxhw2anwx2pzuved4acfz4fdzm/n5dfreyro3slkufuydw2d54nm7bfiskxjjasoyg2z5yept5c3rf filter=lfs diff=lfs merge=lfs -text diff --git a/meta.json b/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..e7ad09fa1fd47d6e160658e55a91e1da64044e8a --- /dev/null +++ b/meta.json @@ -0,0 +1,44 @@ +{ + "cache_layout_version": 1, + "created_at": "2026-01-23T07:13:39Z", + "model_path": "/root/.cache/huggingface/hub/models--black-forest-labs--FLUX.2-klein-9B/snapshots/cd1bba5810fe2aba6666d9cf7352e25436426039", + "compile_command": [ + "/usr/bin/python", + "/app/tensorrt_llm/visual_gen/examples/flux2_klein_9b.py", + "--model_path", + "/root/.cache/huggingface/hub/models--black-forest-labs--FLUX.2-klein-9B/snapshots/cd1bba5810fe2aba6666d9cf7352e25436426039", + "--height", + "512", + "--width", + "1024", + "--num_inference_steps", + "4", + "--num_images", + "6", + "--linear_type", + "te-fp8-per-tensor", + "--fallback_linear_type", + "default", + "--torch_compile_mode", + "default", + "--offload_text_encoder" + ], + "height": 512, + "width": 1024, + "num_inference_steps": 4, + "num_images": 6, + "linear_type": "te-fp8-per-tensor", + "fallback_linear_type": "default", + "torch_compile_mode": "default", + "offload_text_encoder": true, + "offload_vae": false, + "disable_cuda_graph": false, + "disable_teacache": false, + "torch_version": "2.10.0a0+b4e4ee81d3.nv25.12", + "cuda_version": "13.1", + "device_name": "NVIDIA GeForce RTX 4090", + "device_capability": [ + 8, + 9 + ] +} \ No newline at end of file diff --git a/torchinductor/2h/a581feca05a976cd76073f2f954a7641097b9c5775b12cf6831b3149d528a8b4.best_config b/torchinductor/2h/a581feca05a976cd76073f2f954a7641097b9c5775b12cf6831b3149d528a8b4.best_config new file mode 100644 index 0000000000000000000000000000000000000000..d289cec8e8a34a7e59e27d318f3de362d687b99b --- /dev/null +++ b/torchinductor/2h/a581feca05a976cd76073f2f954a7641097b9c5775b12cf6831b3149d528a8b4.best_config @@ -0,0 +1 @@ +{"XBLOCK": 64, "YBLOCK": 64, "num_warps": 8, "num_stages": 1, "configs_hash": "1ce421918d79ed0f7edb09d0ee64f016daf650a007a21866fe52d592be55380c", "found_by_coordesc": false, "time_taken_ms": 143, "triton_cache_hash": "RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA"} \ No newline at end of file diff --git a/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py b/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py new file mode 100644 index 0000000000000000000000000000000000000000..8fbb9cd5967793d73bf715bc65ea3cb93bdb0d48 --- /dev/null +++ b/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py @@ -0,0 +1,70 @@ + +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'y': 131072, 'x': 128}, tile_hint=TileHint.DEFAULT, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*fp32', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*fp32', 'in_ptr5': '*bf16', 'out_ptr0': '*bf16', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid2DWithYZOverflow', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__fused_rms_norm_cat_view_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 6, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'y': 589824, 'x': 75497984}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused__fused_rms_norm_cat_view_2(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): + ynumel = 73728 + xnumel = 128 + yoffset = (tl.program_id(1) + tl.program_id(2) * tl.num_programs(1)) * YBLOCK + yindex = yoffset + tl.arange(0, YBLOCK)[:, None] + ymask = yindex < ynumel + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[None, :] + xmask = xindex < xnumel + y1 = yindex // 32 + x2 = xindex + y0 = (yindex % 32) + y3 = yindex + tmp0 = y1 + tmp1 = tl.full([1, 1], 0, tl.int64) + tmp2 = tmp0 >= tmp1 + tmp3 = tl.full([1, 1], 256, tl.int64) + tmp4 = tmp0 < tmp3 + tmp5 = tl.load(in_ptr0 + (x2 + 128*y0 + 12288*(y1)), tmp4 & xmask & ymask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp6 = tmp5.to(tl.float32) + tmp7 = tl.load(in_ptr1 + (tl.broadcast_to(y0 + 32*(y1), [YBLOCK, XBLOCK])), tmp4 & xmask & ymask, eviction_policy='evict_last', other=0.0) + tmp8 = 128.0 + tmp9 = (tmp7 / tmp8) + tmp10 = 1e-06 + tmp11 = tmp9 + tmp10 + tmp12 = libdevice.rsqrt(tmp11) + tmp13 = tmp6 * tmp12 + tmp14 = tl.load(in_ptr2 + (tl.broadcast_to(x2, [YBLOCK, XBLOCK])), tmp4 & xmask & ymask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp15 = tmp14.to(tl.float32) + tmp16 = tmp13 * tmp15 + tmp17 = tmp16.to(tl.float32) + tmp18 = tl.full(tmp17.shape, 0.0, tmp17.dtype) + tmp19 = tl.where(tmp4, tmp17, tmp18) + tmp20 = tmp0 >= tmp3 + tmp21 = tl.full([1, 1], 2304, tl.int64) + tmp22 = tmp0 < tmp21 + tmp23 = tl.load(in_ptr3 + (x2 + 128*y0 + 12288*((-256) + y1)), tmp20 & xmask & ymask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp24 = tmp23.to(tl.float32) + tmp25 = tl.load(in_ptr4 + (tl.broadcast_to(y0 + 32*((-256) + y1), [YBLOCK, XBLOCK])), tmp20 & xmask & ymask, eviction_policy='evict_last', other=0.0) + tmp26 = 128.0 + tmp27 = (tmp25 / tmp26) + tmp28 = 1e-06 + tmp29 = tmp27 + tmp28 + tmp30 = libdevice.rsqrt(tmp29) + tmp31 = tmp24 * tmp30 + tmp32 = tl.load(in_ptr5 + (tl.broadcast_to(x2, [YBLOCK, XBLOCK])), tmp20 & xmask & ymask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp33 = tmp32.to(tl.float32) + tmp34 = tmp31 * tmp33 + tmp35 = tmp34.to(tl.float32) + tmp36 = tl.full(tmp35.shape, 0.0, tmp35.dtype) + tmp37 = tl.where(tmp20, tmp35, tmp36) + tmp38 = tl.where(tmp4, tmp19, tmp37) + tl.store(out_ptr0 + (x2 + 128*y3), tmp38, xmask & ymask) diff --git a/torchinductor/2o/c2oduffhka4c52657rppatcdtgtnibm42qywfo2spmul2dpsj6jj.py b/torchinductor/2o/c2oduffhka4c52657rppatcdtgtnibm42qywfo2spmul2dpsj6jj.py new file mode 100644 index 0000000000000000000000000000000000000000..10b6da72848e57b5999f4a1a9747babb3ae470d2 --- /dev/null +++ b/torchinductor/2o/c2oduffhka4c52657rppatcdtgtnibm42qywfo2spmul2dpsj6jj.py @@ -0,0 +1,297 @@ +# AOT ID: ['0_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels +import triton +import triton.language as tl +from torch._inductor.runtime.triton_heuristics import start_graph, end_graph +from torch._C import _cuda_getCurrentRawStream as get_raw_stream + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py +# Topologically Sorted Source Nodes: [norm_hidden_states, add, mul, norm_hidden_states_1], Original ATen: [aten.native_layer_norm, aten.add, aten.mul] +# Source node to ATen node mapping: +# add => add_1 +# mul => mul_1 +# norm_hidden_states => add, convert_element_type, convert_element_type_1, mul, rsqrt, sub, var_mean +# norm_hidden_states_1 => add_2 +# Graph fragment: +# %arg0_1 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0" = PlaceHolder[target=arg0_1] +# %arg1_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg1_1] +# %getitem_1 : Tensor "f32[1, 2048, 1][2048, 1, 2048]cuda:0" = PlaceHolder[target=getitem_1] +# %buf1 : Tensor "f32[1, 2048, 1][2048, 1, 2048]cuda:0" = PlaceHolder[target=buf1] +# %arg2_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg2_1] +# %convert_element_type : Tensor "f32[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%arg0_1, torch.float32), kwargs = {}) +# %var_mean : [num_users=2] = call_function[target=torch.ops.aten.var_mean.correction](args = (%convert_element_type, [2]), kwargs = {correction: 0, keepdim: True}) +# %add_1 : Tensor "bf16[1, 1, 4096][4096, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg1_1, 1), kwargs = {}) +# %sub : Tensor "f32[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%convert_element_type, %getitem_1), kwargs = {}) +# %add : Tensor "f32[1, 2048, 1][2048, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%getitem, 1e-06), kwargs = {}) +# %rsqrt : Tensor "f32[1, 2048, 1][2048, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add,), kwargs = {}) +# %mul : Tensor "f32[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub, %rsqrt), kwargs = {}) +# %convert_element_type_1 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul, torch.bfloat16), kwargs = {}) +# %mul_1 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_1, %convert_element_type_1), kwargs = {}) +# %add_2 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_1, %arg2_1), kwargs = {}) +# return %getitem_1,%buf1,%add_2 +triton_red_fused_add_mul_native_layer_norm_0 = async_compile.triton('triton_red_fused_add_mul_native_layer_norm_0', ''' +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.reduction( + size_hints={'x': 2048, 'r0_': 4096}, + reduction_hint=ReductionHint.INNER, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr2': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_add_mul_native_layer_norm_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 4, 'num_store': 1, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'add_persistent_rblock': True, 'tiling_scores': {'x': 0, 'r0_': 50348032}} +) +@triton.jit +def triton_red_fused_add_mul_native_layer_norm_0(in_ptr0, in_ptr1, in_ptr2, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): + xnumel = 2048 + r0_numel = 4096 + rnumel = r0_numel + RBLOCK: tl.constexpr = R0_BLOCK + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None] + xmask = xindex < xnumel + r0_base = tl.arange(0, R0_BLOCK)[None, :] + rbase = r0_base + x0 = xindex + tmp3_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + tmp3_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + tmp3_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_1 = r0_index + tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp1 = tmp0.to(tl.float32) + tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK]) + tmp3_mean_next, tmp3_m2_next, tmp3_weight_next = triton_helpers.welford_reduce( + tmp2, tmp3_mean, tmp3_m2, tmp3_weight, roffset == 0 + ) + tmp3_mean = tl.where(r0_mask & xmask, tmp3_mean_next, tmp3_mean) + tmp3_m2 = tl.where(r0_mask & xmask, tmp3_m2_next, tmp3_m2) + tmp3_weight = tl.where(r0_mask & xmask, tmp3_weight_next, tmp3_weight) + tmp4, tmp5, tmp6 = triton_helpers.welford(tmp3_mean, tmp3_m2, tmp3_weight, 1) + tmp3 = tmp4[:, None] + tmp7 = tmp5[:, None] + tmp8 = tmp6[:, None] + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_1 = r0_index + tmp9 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp12 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32) + tmp23 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp10 = 1.0 + tmp11 = tmp9 + tmp10 + tmp13 = tmp12.to(tl.float32) + tmp14 = tmp13 - tmp3 + tmp15 = 4096.0 + tmp16 = (tmp7 / tmp15) + tmp17 = 1e-06 + tmp18 = tmp16 + tmp17 + tmp19 = libdevice.rsqrt(tmp18) + tmp20 = tmp14 * tmp19 + tmp21 = tmp20.to(tl.float32) + tmp22 = tmp11 * tmp21 + tmp24 = tmp22 + tmp23 + tl.store(out_ptr2 + (r0_1 + 4096*x0), tmp24, r0_mask & xmask) +''', device_str='cuda') + + +# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py +# Topologically Sorted Source Nodes: [norm_encoder_hidden_states, add_2, mul_1, norm_encoder_hidden_states_1], Original ATen: [aten.native_layer_norm, aten.add, aten.mul] +# Source node to ATen node mapping: +# add_2 => add_4 +# mul_1 => mul_3 +# norm_encoder_hidden_states => add_3, convert_element_type_2, convert_element_type_3, mul_2, rsqrt_1, sub_1, var_mean_1 +# norm_encoder_hidden_states_1 => add_5 +# Graph fragment: +# %arg3_1 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0" = PlaceHolder[target=arg3_1] +# %arg4_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg4_1] +# %getitem_3 : Tensor "f32[1, 256, 1][256, 1, 256]cuda:0" = PlaceHolder[target=getitem_3] +# %buf4 : Tensor "f32[1, 256, 1][256, 1, 256]cuda:0" = PlaceHolder[target=buf4] +# %arg5_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg5_1] +# %convert_element_type_2 : Tensor "f32[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%arg3_1, torch.float32), kwargs = {}) +# %var_mean_1 : [num_users=2] = call_function[target=torch.ops.aten.var_mean.correction](args = (%convert_element_type_2, [2]), kwargs = {correction: 0, keepdim: True}) +# %add_4 : Tensor "bf16[1, 1, 4096][4096, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg4_1, 1), kwargs = {}) +# %sub_1 : Tensor "f32[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%convert_element_type_2, %getitem_3), kwargs = {}) +# %add_3 : Tensor "f32[1, 256, 1][256, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%getitem_2, 1e-06), kwargs = {}) +# %rsqrt_1 : Tensor "f32[1, 256, 1][256, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add_3,), kwargs = {}) +# %mul_2 : Tensor "f32[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_1, %rsqrt_1), kwargs = {}) +# %convert_element_type_3 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_2, torch.bfloat16), kwargs = {}) +# %mul_3 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_4, %convert_element_type_3), kwargs = {}) +# %add_5 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_3, %arg5_1), kwargs = {}) +# return %getitem_3,%buf4,%add_5 +triton_red_fused_add_mul_native_layer_norm_1 = async_compile.triton('triton_red_fused_add_mul_native_layer_norm_1', ''' +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.reduction( + size_hints={'x': 256, 'r0_': 4096}, + reduction_hint=ReductionHint.INNER, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr2': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_add_mul_native_layer_norm_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 4, 'num_store': 1, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'add_persistent_rblock': True, 'tiling_scores': {'x': 0, 'r0_': 6307840}} +) +@triton.jit +def triton_red_fused_add_mul_native_layer_norm_1(in_ptr0, in_ptr1, in_ptr2, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): + xnumel = 256 + r0_numel = 4096 + rnumel = r0_numel + RBLOCK: tl.constexpr = R0_BLOCK + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None] + xmask = xindex < xnumel + r0_base = tl.arange(0, R0_BLOCK)[None, :] + rbase = r0_base + x0 = xindex + tmp3_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + tmp3_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + tmp3_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_1 = r0_index + tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp1 = tmp0.to(tl.float32) + tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK]) + tmp3_mean_next, tmp3_m2_next, tmp3_weight_next = triton_helpers.welford_reduce( + tmp2, tmp3_mean, tmp3_m2, tmp3_weight, roffset == 0 + ) + tmp3_mean = tl.where(r0_mask & xmask, tmp3_mean_next, tmp3_mean) + tmp3_m2 = tl.where(r0_mask & xmask, tmp3_m2_next, tmp3_m2) + tmp3_weight = tl.where(r0_mask & xmask, tmp3_weight_next, tmp3_weight) + tmp4, tmp5, tmp6 = triton_helpers.welford(tmp3_mean, tmp3_m2, tmp3_weight, 1) + tmp3 = tmp4[:, None] + tmp7 = tmp5[:, None] + tmp8 = tmp6[:, None] + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_1 = r0_index + tmp9 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp12 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32) + tmp23 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp10 = 1.0 + tmp11 = tmp9 + tmp10 + tmp13 = tmp12.to(tl.float32) + tmp14 = tmp13 - tmp3 + tmp15 = 4096.0 + tmp16 = (tmp7 / tmp15) + tmp17 = 1e-06 + tmp18 = tmp16 + tmp17 + tmp19 = libdevice.rsqrt(tmp18) + tmp20 = tmp14 * tmp19 + tmp21 = tmp20.to(tl.float32) + tmp22 = tmp11 * tmp21 + tmp24 = tmp22 + tmp23 + tl.store(out_ptr2 + (r0_1 + 4096*x0), tmp24, r0_mask & xmask) +''', device_str='cuda') + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1 = args + args.clear() + assert_size_stride(arg0_1, (1, 2048, 4096), (8388608, 4096, 1)) + assert_size_stride(arg1_1, (1, 1, 4096), (24576, 24576, 1)) + assert_size_stride(arg2_1, (1, 1, 4096), (24576, 24576, 1)) + assert_size_stride(arg3_1, (1, 256, 4096), (1048576, 4096, 1)) + assert_size_stride(arg4_1, (1, 1, 4096), (24576, 24576, 1)) + assert_size_stride(arg5_1, (1, 1, 4096), (24576, 24576, 1)) + with torch.cuda._DeviceGuard(0): + torch.cuda.set_device(0) + buf6 = empty_strided_cuda((1, 2048, 4096), (8388608, 4096, 1), torch.bfloat16) + # Topologically Sorted Source Nodes: [norm_hidden_states, add, mul, norm_hidden_states_1], Original ATen: [aten.native_layer_norm, aten.add, aten.mul] + stream0 = get_raw_stream(0) + triton_red_fused_add_mul_native_layer_norm_0.run(arg0_1, arg1_1, arg2_1, buf6, 2048, 4096, stream=stream0) + del arg0_1 + del arg1_1 + del arg2_1 + buf7 = empty_strided_cuda((1, 256, 4096), (1048576, 4096, 1), torch.bfloat16) + # Topologically Sorted Source Nodes: [norm_encoder_hidden_states, add_2, mul_1, norm_encoder_hidden_states_1], Original ATen: [aten.native_layer_norm, aten.add, aten.mul] + stream0 = get_raw_stream(0) + triton_red_fused_add_mul_native_layer_norm_1.run(arg3_1, arg4_1, arg5_1, buf7, 256, 4096, stream=stream0) + del arg3_1 + del arg4_1 + del arg5_1 + return (buf6, buf7, ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = rand_strided((1, 2048, 4096), (8388608, 4096, 1), device='cuda:0', dtype=torch.bfloat16) + arg1_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16) + arg2_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16) + arg3_1 = rand_strided((1, 256, 4096), (1048576, 4096, 1), device='cuda:0', dtype=torch.bfloat16) + arg4_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16) + arg5_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16) + fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py b/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py new file mode 100644 index 0000000000000000000000000000000000000000..4e42f89ff78e62b3218fe7765bb4da20c0715404 --- /dev/null +++ b/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py @@ -0,0 +1,45 @@ + +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 67108864}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_cat_mul_silu_split_view_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 3, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 377487360}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused_cat_mul_silu_split_view_0(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr): + xnumel = 37748736 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x0 = (xindex % 16384) + x1 = xindex // 16384 + x2 = xindex + tmp0 = x0 + tmp1 = tl.full([1], 0, tl.int64) + tmp2 = tmp0 >= tmp1 + tmp3 = tl.full([1], 4096, tl.int64) + tmp4 = tmp0 < tmp3 + tmp5 = tl.load(in_ptr0 + (4096*x1 + (x0)), tmp4, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp6 = tmp0 >= tmp3 + tmp7 = tl.full([1], 16384, tl.int64) + tmp8 = tmp0 < tmp7 + tmp9 = tl.load(in_ptr1 + (36864*x1 + ((-4096) + x0)), tmp6, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp10 = tmp9.to(tl.float32) + tmp11 = tl.sigmoid(tmp10) + tmp12 = tmp10 * tmp11 + tmp13 = tmp12.to(tl.float32) + tmp14 = tl.load(in_ptr1 + (12288 + 36864*x1 + ((-4096) + x0)), tmp6, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp15 = tmp13 * tmp14 + tmp16 = tl.full(tmp15.shape, 0.0, tmp15.dtype) + tmp17 = tl.where(tmp6, tmp15, tmp16) + tmp18 = tl.where(tmp4, tmp5, tmp17) + tl.store(out_ptr0 + (x2), tmp18, None) diff --git a/torchinductor/3i/cf1587a2fd240ce39177274973308f6fd100d746bf6716a8d96ed4fd12c89d55.best_config b/torchinductor/3i/cf1587a2fd240ce39177274973308f6fd100d746bf6716a8d96ed4fd12c89d55.best_config new file mode 100644 index 0000000000000000000000000000000000000000..a8b7d3123d4d086103b46178b97f09b296ed11b8 --- /dev/null +++ b/torchinductor/3i/cf1587a2fd240ce39177274973308f6fd100d746bf6716a8d96ed4fd12c89d55.best_config @@ -0,0 +1 @@ +{"XBLOCK": 512, "num_warps": 8, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 81, "triton_cache_hash": "PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ"} \ No newline at end of file diff --git a/torchinductor/3v/4a00da1b5d4ce251d2cb392c24118fc2e6c3818f25b8457665f0d53e12234277.best_config b/torchinductor/3v/4a00da1b5d4ce251d2cb392c24118fc2e6c3818f25b8457665f0d53e12234277.best_config new file mode 100644 index 0000000000000000000000000000000000000000..67290006b21855bf0166f929dde8fe79bbdc46d9 --- /dev/null +++ b/torchinductor/3v/4a00da1b5d4ce251d2cb392c24118fc2e6c3818f25b8457665f0d53e12234277.best_config @@ -0,0 +1 @@ +{"XBLOCK": 1024, "num_warps": 4, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 43, "triton_cache_hash": "SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A"} \ No newline at end of file diff --git a/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py b/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py new file mode 100644 index 0000000000000000000000000000000000000000..65ddf1168e44a098e3e59f5d2bb8c3ca7d867482 --- /dev/null +++ b/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py @@ -0,0 +1,28 @@ + +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 16777216}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'ks0': 'i64', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 37748736}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1(in_ptr0, out_ptr0, ks0, xnumel, XBLOCK : tl.constexpr): + xnumel = 9437184 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x0 = (xindex % 128) + x1 = ((xindex // 128) % 2304) + x2 = xindex // 294912 + x3 = xindex + tmp0 = tl.load(in_ptr0 + (x0 + 128*x2 + ks0*x1), None).to(tl.float32) + tl.store(out_ptr0 + (x3), tmp0, None) diff --git a/torchinductor/4y/c4ykjyk6fv6enet6mgkj5bsan42tc6rsdfs7aaskpjgv5rzw7tbr.py b/torchinductor/4y/c4ykjyk6fv6enet6mgkj5bsan42tc6rsdfs7aaskpjgv5rzw7tbr.py new file mode 100644 index 0000000000000000000000000000000000000000..2a225ba4fb5bfa2323ec7e00316fe01a313d660c --- /dev/null +++ b/torchinductor/4y/c4ykjyk6fv6enet6mgkj5bsan42tc6rsdfs7aaskpjgv5rzw7tbr.py @@ -0,0 +1,357 @@ +# AOT ID: ['25_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels +import triton +import triton.language as tl +from torch._inductor.runtime.triton_heuristics import start_graph, end_graph +from torch._C import _cuda_getCurrentRawStream as get_raw_stream + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py +# Topologically Sorted Source Nodes: [split, chunk, query_1, query_2, reshape, unbind, key_1, key_2, reshape_1, unbind_1, float_1, cos, mul, neg, stack, x_rotated, float_2, sin, mul_1, add, out, float_3, cos_2, mul_2, neg_1, stack_1, x_rotated_1, float_4, sin_2, mul_3, add_1, out_1], Original ATen: [aten.split_with_sizes, aten.split, aten.view, aten._fused_rms_norm, aten.unbind, aten._to_copy, aten.unsqueeze, aten.mul, aten.neg, aten.stack, aten.add] +# Source node to ATen node mapping: +# add => add_2 +# add_1 => add_3 +# chunk => split +# cos => unsqueeze, unsqueeze_1 +# cos_2 => unsqueeze_6, unsqueeze_7 +# float_1 => convert_element_type_4 +# float_2 => convert_element_type_5 +# float_3 => convert_element_type_7 +# float_4 => convert_element_type_8 +# key_1 => view_1 +# key_2 => add_1, convert_element_type_2, convert_element_type_3, mean_1, mul_2, mul_3, pow_2, rsqrt_1 +# mul => mul_4 +# mul_1 => mul_5 +# mul_2 => mul_6 +# mul_3 => mul_7 +# neg => neg +# neg_1 => neg_1 +# out => convert_element_type_6 +# out_1 => convert_element_type_9 +# query_1 => view +# query_2 => add, convert_element_type, convert_element_type_1, mean, mul, mul_1, pow_1, rsqrt +# reshape => view_3 +# reshape_1 => view_5 +# sin => unsqueeze_2, unsqueeze_3 +# sin_2 => unsqueeze_8, unsqueeze_9 +# split => split_with_sizes +# stack => cat, unsqueeze_4, unsqueeze_5 +# stack_1 => cat_1, unsqueeze_10, unsqueeze_11 +# unbind => unbind +# unbind_1 => unbind_1 +# x_rotated => view_4 +# x_rotated_1 => view_6 +# Graph fragment: +# %arg0_1 : Tensor "bf16[1, 2304, 36864][84934656, 36864, 1]cuda:0" = PlaceHolder[target=arg0_1] +# %buf0 : Tensor "f32[1, 2304, 32, 1][73728, 32, 1, 73728]cuda:0" = PlaceHolder[target=buf0] +# %arg1_1 : Tensor "bf16[128][1]cuda:0" = PlaceHolder[target=arg1_1] +# %arg3_1 : Tensor "f32[2304, 128][128, 1]cuda:0" = PlaceHolder[target=arg3_1] +# %cat : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0" = PlaceHolder[target=cat] +# %arg4_1 : Tensor "f32[2304, 128][128, 1]cuda:0" = PlaceHolder[target=arg4_1] +# %buf1 : Tensor "f32[1, 2304, 32, 1][73728, 32, 1, 73728]cuda:0" = PlaceHolder[target=buf1] +# %arg2_1 : Tensor "bf16[128][1]cuda:0" = PlaceHolder[target=arg2_1] +# %cat_1 : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0" = PlaceHolder[target=cat_1] +# %split_with_sizes : [num_users=2] = call_function[target=torch.ops.aten.split_with_sizes.default](args = (%arg0_1, [12288, 24576], -1), kwargs = {}) +# %split : [num_users=3] = call_function[target=torch.ops.aten.split.Tensor](args = (%getitem, 4096, -1), kwargs = {}) +# %view : Tensor "bf16[1, 2304, 32, 128][84934656, 36864, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%getitem_2, [1, 2304, 32, 128]), kwargs = {}) +# %convert_element_type : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view, torch.float32), kwargs = {}) +# %pow_1 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.pow.Tensor_Scalar](args = (%convert_element_type, 2), kwargs = {}) +# %mean : Tensor "f32[1, 2304, 32, 1][73728, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mean.dim](args = (%pow_1, [3], True), kwargs = {}) +# %add : Tensor "f32[1, 2304, 32, 1][73728, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Scalar](args = (%mean, 1e-06), kwargs = {}) +# %rsqrt : Tensor "f32[1, 2304, 32, 1][73728, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add,), kwargs = {}) +# %mul : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type, %rsqrt), kwargs = {}) +# %mul_1 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul, %arg1_1), kwargs = {}) +# %convert_element_type_1 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_1, torch.bfloat16), kwargs = {}) +# %view_3 : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%convert_element_type_1, [1, 2304, 32, -1, 2]), kwargs = {}) +# %unbind : [num_users=2] = call_function[target=torch.ops.aten.unbind.int](args = (%view_3, -1), kwargs = {}) +# %view_1 : Tensor "bf16[1, 2304, 32, 128][84934656, 36864, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%getitem_3, [1, 2304, 32, 128]), kwargs = {}) +# %convert_element_type_2 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_1, torch.float32), kwargs = {}) +# %pow_2 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.pow.Tensor_Scalar](args = (%convert_element_type_2, 2), kwargs = {}) +# %mean_1 : Tensor "f32[1, 2304, 32, 1][73728, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mean.dim](args = (%pow_2, [3], True), kwargs = {}) +# %add_1 : Tensor "f32[1, 2304, 32, 1][73728, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Scalar](args = (%mean_1, 1e-06), kwargs = {}) +# %rsqrt_1 : Tensor "f32[1, 2304, 32, 1][73728, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add_1,), kwargs = {}) +# %mul_2 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_2, %rsqrt_1), kwargs = {}) +# %mul_3 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_2, %arg2_1), kwargs = {}) +# %convert_element_type_3 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_3, torch.bfloat16), kwargs = {}) +# %view_5 : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%convert_element_type_3, [1, 2304, 32, -1, 2]), kwargs = {}) +# %unbind_1 : [num_users=2] = call_function[target=torch.ops.aten.unbind.int](args = (%view_5, -1), kwargs = {}) +# %convert_element_type_4 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%convert_element_type_1, torch.float32), kwargs = {}) +# %unsqueeze : Tensor "f32[1, 2304, 128][294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%arg3_1, 0), kwargs = {}) +# %unsqueeze_1 : Tensor "f32[1, 2304, 1, 128][294912, 128, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze, 2), kwargs = {}) +# %mul_4 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_4, %unsqueeze_1), kwargs = {}) +# %neg : Tensor "bf16[1, 2304, 32, 64][4718592, 2048, 64, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.neg.default](args = (%getitem_6,), kwargs = {}) +# %unsqueeze_4 : Tensor "bf16[1, 2304, 32, 64, 1][4718592, 2048, 64, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%neg, 4), kwargs = {}) +# %unsqueeze_5 : Tensor "bf16[1, 2304, 32, 64, 1][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%getitem_5, 4), kwargs = {}) +# %cat : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%unsqueeze_4, %unsqueeze_5], -1), kwargs = {}) +# %view_4 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat, [1, 2304, 32, 128]), kwargs = {}) +# %convert_element_type_5 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_4, torch.float32), kwargs = {}) +# %unsqueeze_2 : Tensor "f32[1, 2304, 128][294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%arg4_1, 0), kwargs = {}) +# %unsqueeze_3 : Tensor "f32[1, 2304, 1, 128][294912, 128, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze_2, 2), kwargs = {}) +# %mul_5 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_5, %unsqueeze_3), kwargs = {}) +# %add_2 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_4, %mul_5), kwargs = {}) +# %convert_element_type_6 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%add_2, torch.bfloat16), kwargs = {}) +# %convert_element_type_7 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%convert_element_type_3, torch.float32), kwargs = {}) +# %unsqueeze_6 : Tensor "f32[1, 2304, 128][294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%arg3_1, 0), kwargs = {}) +# %unsqueeze_7 : Tensor "f32[1, 2304, 1, 128][294912, 128, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze_6, 2), kwargs = {}) +# %mul_6 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_7, %unsqueeze_7), kwargs = {}) +# %neg_1 : Tensor "bf16[1, 2304, 32, 64][4718592, 2048, 64, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.neg.default](args = (%getitem_8,), kwargs = {}) +# %unsqueeze_10 : Tensor "bf16[1, 2304, 32, 64, 1][4718592, 2048, 64, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%neg_1, 4), kwargs = {}) +# %unsqueeze_11 : Tensor "bf16[1, 2304, 32, 64, 1][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%getitem_7, 4), kwargs = {}) +# %cat_1 : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%unsqueeze_10, %unsqueeze_11], -1), kwargs = {}) +# %view_6 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat_1, [1, 2304, 32, 128]), kwargs = {}) +# %convert_element_type_8 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_6, torch.float32), kwargs = {}) +# %unsqueeze_8 : Tensor "f32[1, 2304, 128][294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%arg4_1, 0), kwargs = {}) +# %unsqueeze_9 : Tensor "f32[1, 2304, 1, 128][294912, 128, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze_8, 2), kwargs = {}) +# %mul_7 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_8, %unsqueeze_9), kwargs = {}) +# %add_3 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_6, %mul_7), kwargs = {}) +# %convert_element_type_9 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%add_3, torch.bfloat16), kwargs = {}) +# return %buf1,%buf0,%cat,%convert_element_type_6,%cat_1,%convert_element_type_9 +triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 = async_compile.triton('triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0', ''' +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.reduction( + size_hints={'x': 131072, 'r0_': 128}, + reduction_hint=ReductionHint.DEFAULT, + filename=__file__, + triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_out_ptr1': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0', 'mutated_arg_names': ['in_out_ptr0', 'in_out_ptr1'], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 16, 'num_store': 2, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 0, 'r0_': 115606016}} +) +@triton.jit +def triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(in_out_ptr0, in_out_ptr1, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): + xnumel = 73728 + r0_numel = 128 + rnumel = r0_numel + RBLOCK: tl.constexpr = R0_BLOCK + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None] + xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) + r0_base = tl.arange(0, R0_BLOCK)[None, :] + rbase = r0_base + x0 = (xindex % 32) + x1 = xindex // 32 + _tmp4 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) + x5 = xindex + _tmp10 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_2 = r0_index + tmp0 = tl.load(in_ptr0 + (4096 + r0_2 + 128*x0 + 36864*x1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp6 = tl.load(in_ptr0 + (r0_2 + 128*x0 + 36864*x1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp1 = tmp0.to(tl.float32) + tmp2 = tmp1 * tmp1 + tmp3 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK]) + tmp5 = _tmp4 + tmp3 + _tmp4 = tl.where(r0_mask, tmp5, _tmp4) + tmp7 = tmp6.to(tl.float32) + tmp8 = tmp7 * tmp7 + tmp9 = tl.broadcast_to(tmp8, [XBLOCK, R0_BLOCK]) + tmp11 = _tmp10 + tmp9 + _tmp10 = tl.where(r0_mask, tmp11, _tmp10) + tmp4 = tl.sum(_tmp4, 1)[:, None] + tmp10 = tl.sum(_tmp10, 1)[:, None] + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_3 = (r0_index % 2) + r0_4 = r0_index // 2 + r0_2 = r0_index + tmp50 = tl.load(in_ptr0 + (r0_2 + 128*x0 + 36864*x1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp58 = tl.load(in_ptr1 + (r0_2), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp63 = tl.load(in_ptr2 + (r0_2 + 128*x1), r0_mask, eviction_policy='evict_last', other=0.0) + tmp66 = tl.load(in_ptr3 + (r0_2 + 128*x1), r0_mask, eviction_policy='evict_last', other=0.0) + tmp96 = tl.load(in_ptr0 + (4096 + r0_2 + 128*x0 + 36864*x1), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) + tmp102 = tl.load(in_ptr4 + (r0_2), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp12 = r0_3 + tmp13 = tl.full([1, 1], 0, tl.int64) + tmp14 = tmp12 >= tmp13 + tmp15 = tl.full([1, 1], 1, tl.int64) + tmp16 = tmp12 < tmp15 + tmp17 = tl.load(in_ptr0 + (1 + 2*r0_4 + 128*x0 + 36864*x1), r0_mask & tmp16, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp18 = tmp17.to(tl.float32) + tmp19 = 128.0 + tmp20 = (tmp10 / tmp19) + tmp21 = 1e-06 + tmp22 = tmp20 + tmp21 + tmp23 = libdevice.rsqrt(tmp22) + tmp24 = tmp18 * tmp23 + tmp25 = tl.load(in_ptr1 + (tl.broadcast_to(1 + 2*r0_4, [XBLOCK, R0_BLOCK])), r0_mask & tmp16, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp26 = tmp25.to(tl.float32) + tmp27 = tmp24 * tmp26 + tmp28 = tmp27.to(tl.float32) + tmp29 = -tmp28 + tmp30 = tl.full(tmp29.shape, 0.0, tmp29.dtype) + tmp31 = tl.where(tmp16, tmp29, tmp30) + tmp32 = tmp12 >= tmp15 + tmp33 = tl.full([1, 1], 2, tl.int64) + tmp34 = tmp12 < tmp33 + tmp35 = tl.load(in_ptr0 + (2*r0_4 + 128*x0 + 36864*x1), r0_mask & tmp32, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp36 = tmp35.to(tl.float32) + tmp37 = 128.0 + tmp38 = (tmp10 / tmp37) + tmp39 = 1e-06 + tmp40 = tmp38 + tmp39 + tmp41 = libdevice.rsqrt(tmp40) + tmp42 = tmp36 * tmp41 + tmp43 = tl.load(in_ptr1 + (tl.broadcast_to(2*r0_4, [XBLOCK, R0_BLOCK])), r0_mask & tmp32, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp44 = tmp43.to(tl.float32) + tmp45 = tmp42 * tmp44 + tmp46 = tmp45.to(tl.float32) + tmp47 = tl.full(tmp46.shape, 0.0, tmp46.dtype) + tmp48 = tl.where(tmp32, tmp46, tmp47) + tmp49 = tl.where(tmp16, tmp31, tmp48) + tmp51 = tmp50.to(tl.float32) + tmp52 = 128.0 + tmp53 = (tmp10 / tmp52) + tmp54 = 1e-06 + tmp55 = tmp53 + tmp54 + tmp56 = libdevice.rsqrt(tmp55) + tmp57 = tmp51 * tmp56 + tmp59 = tmp58.to(tl.float32) + tmp60 = tmp57 * tmp59 + tmp61 = tmp60.to(tl.float32) + tmp62 = tmp61.to(tl.float32) + tmp64 = tmp62 * tmp63 + tmp65 = tmp49.to(tl.float32) + tmp67 = tmp65 * tmp66 + tmp68 = tmp64 + tmp67 + tmp69 = tmp68.to(tl.float32) + tmp70 = tl.load(in_ptr0 + (4097 + 2*r0_4 + 128*x0 + 36864*x1), r0_mask & tmp16, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp71 = tmp70.to(tl.float32) + tmp72 = (tmp4 / tmp19) + tmp73 = tmp72 + tmp21 + tmp74 = libdevice.rsqrt(tmp73) + tmp75 = tmp71 * tmp74 + tmp76 = tl.load(in_ptr4 + (tl.broadcast_to(1 + 2*r0_4, [XBLOCK, R0_BLOCK])), r0_mask & tmp16, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp77 = tmp76.to(tl.float32) + tmp78 = tmp75 * tmp77 + tmp79 = tmp78.to(tl.float32) + tmp80 = -tmp79 + tmp81 = tl.full(tmp80.shape, 0.0, tmp80.dtype) + tmp82 = tl.where(tmp16, tmp80, tmp81) + tmp83 = tl.load(in_ptr0 + (4096 + 2*r0_4 + 128*x0 + 36864*x1), r0_mask & tmp32, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp84 = tmp83.to(tl.float32) + tmp85 = (tmp4 / tmp37) + tmp86 = tmp85 + tmp39 + tmp87 = libdevice.rsqrt(tmp86) + tmp88 = tmp84 * tmp87 + tmp89 = tl.load(in_ptr4 + (tl.broadcast_to(2*r0_4, [XBLOCK, R0_BLOCK])), r0_mask & tmp32, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp90 = tmp89.to(tl.float32) + tmp91 = tmp88 * tmp90 + tmp92 = tmp91.to(tl.float32) + tmp93 = tl.full(tmp92.shape, 0.0, tmp92.dtype) + tmp94 = tl.where(tmp32, tmp92, tmp93) + tmp95 = tl.where(tmp16, tmp82, tmp94) + tmp97 = tmp96.to(tl.float32) + tmp98 = (tmp4 / tmp52) + tmp99 = tmp98 + tmp54 + tmp100 = libdevice.rsqrt(tmp99) + tmp101 = tmp97 * tmp100 + tmp103 = tmp102.to(tl.float32) + tmp104 = tmp101 * tmp103 + tmp105 = tmp104.to(tl.float32) + tmp106 = tmp105.to(tl.float32) + tmp107 = tmp106 * tmp63 + tmp108 = tmp95.to(tl.float32) + tmp109 = tmp108 * tmp66 + tmp110 = tmp107 + tmp109 + tmp111 = tmp110.to(tl.float32) + tl.store(in_out_ptr0 + (r0_2 + 128*x5), tmp69, r0_mask) + tl.store(in_out_ptr1 + (r0_2 + 128*x5), tmp111, r0_mask) +''', device_str='cuda') + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + arg0_1, arg1_1, arg2_1, arg3_1, arg4_1 = args + args.clear() + assert_size_stride(arg0_1, (1, 2304, 36864), (84934656, 36864, 1)) + assert_size_stride(arg1_1, (128, ), (1, )) + assert_size_stride(arg2_1, (128, ), (1, )) + assert_size_stride(arg3_1, (2304, 128), (128, 1)) + assert_size_stride(arg4_1, (2304, 128), (128, 1)) + with torch.cuda._DeviceGuard(0): + torch.cuda.set_device(0) + buf2 = empty_strided_cuda((1, 2304, 32, 64, 2), (9437184, 4096, 128, 2, 1), torch.bfloat16) + buf3 = reinterpret_tensor(buf2, (1, 2304, 32, 128), (9437184, 4096, 128, 1), 0); del buf2 # reuse + buf4 = empty_strided_cuda((1, 2304, 32, 64, 2), (9437184, 4096, 128, 2, 1), torch.bfloat16) + buf5 = reinterpret_tensor(buf4, (1, 2304, 32, 128), (9437184, 4096, 128, 1), 0); del buf4 # reuse + # Topologically Sorted Source Nodes: [split, chunk, query_1, query_2, reshape, unbind, key_1, key_2, reshape_1, unbind_1, float_1, cos, mul, neg, stack, x_rotated, float_2, sin, mul_1, add, out, float_3, cos_2, mul_2, neg_1, stack_1, x_rotated_1, float_4, sin_2, mul_3, add_1, out_1], Original ATen: [aten.split_with_sizes, aten.split, aten.view, aten._fused_rms_norm, aten.unbind, aten._to_copy, aten.unsqueeze, aten.mul, aten.neg, aten.stack, aten.add] + stream0 = get_raw_stream(0) + triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.run(buf3, buf5, arg0_1, arg1_1, arg3_1, arg4_1, arg2_1, 73728, 128, stream=stream0) + del arg1_1 + del arg2_1 + del arg3_1 + del arg4_1 + return (buf3, buf5, reinterpret_tensor(arg0_1, (1, 2304, 32, 128), (84934656, 36864, 128, 1), 8192), reinterpret_tensor(arg0_1, (1, 2304, 24576), (84934656, 36864, 1), 12288), ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = rand_strided((1, 2304, 36864), (84934656, 36864, 1), device='cuda:0', dtype=torch.bfloat16) + arg1_1 = rand_strided((128, ), (1, ), device='cuda:0', dtype=torch.bfloat16) + arg2_1 = rand_strided((128, ), (1, ), device='cuda:0', dtype=torch.bfloat16) + arg3_1 = rand_strided((2304, 128), (128, 1), device='cuda:0', dtype=torch.float32) + arg4_1 = rand_strided((2304, 128), (128, 1), device='cuda:0', dtype=torch.float32) + fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/torchinductor/6k/abd9e26dfce6bf628201c09f1f90f4340fdaab3cc2dd99f7186afe82fe013d1a.best_config b/torchinductor/6k/abd9e26dfce6bf628201c09f1f90f4340fdaab3cc2dd99f7186afe82fe013d1a.best_config new file mode 100644 index 0000000000000000000000000000000000000000..971d380017dc765c34827c697ce9956bb1e05fb0 --- /dev/null +++ b/torchinductor/6k/abd9e26dfce6bf628201c09f1f90f4340fdaab3cc2dd99f7186afe82fe013d1a.best_config @@ -0,0 +1 @@ +{"XBLOCK": 1024, "num_warps": 4, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 35, "triton_cache_hash": "Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q"} \ No newline at end of file diff --git a/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py b/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py new file mode 100644 index 0000000000000000000000000000000000000000..04685fb8a7084ed414e55edb2df1b2916f279909 --- /dev/null +++ b/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py @@ -0,0 +1,30 @@ + +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 1048576}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_mul_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 3, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 8396800}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused_add_mul_0(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr): + xnumel = 1048576 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x2 = xindex + x0 = (xindex % 4096) + tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) + tmp1 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last').to(tl.float32) + tmp2 = tl.load(in_ptr2 + (x2), None).to(tl.float32) + tmp3 = tmp1 * tmp2 + tmp4 = tmp0 + tmp3 + tl.store(out_ptr0 + (x2), tmp4, None) diff --git a/torchinductor/6w/4fb0f9adeff50e9452e8fd238a1808052c095c59a0b2f1d9f3f7d7106bd1ede5.best_config b/torchinductor/6w/4fb0f9adeff50e9452e8fd238a1808052c095c59a0b2f1d9f3f7d7106bd1ede5.best_config new file mode 100644 index 0000000000000000000000000000000000000000..b70899f8b8985c3a02e12ab5d4b5a15f2ca15b04 --- /dev/null +++ b/torchinductor/6w/4fb0f9adeff50e9452e8fd238a1808052c095c59a0b2f1d9f3f7d7106bd1ede5.best_config @@ -0,0 +1 @@ +{"XBLOCK": 1024, "num_warps": 4, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 68, "triton_cache_hash": "6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA"} \ No newline at end of file diff --git a/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py b/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py new file mode 100644 index 0000000000000000000000000000000000000000..cad5a337f915a1c559798a385ba04a0072936100 --- /dev/null +++ b/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py @@ -0,0 +1,33 @@ + +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 33554432}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_split_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 2, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 201326592}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused_mul_silu_split_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): + xnumel = 25165824 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x0 = (xindex % 12288) + x1 = xindex // 12288 + x2 = xindex + tmp0 = tl.load(in_ptr0 + (x0 + 24576*x1), None).to(tl.float32) + tmp5 = tl.load(in_ptr0 + (12288 + x0 + 24576*x1), None).to(tl.float32) + tmp1 = tmp0.to(tl.float32) + tmp2 = tl.sigmoid(tmp1) + tmp3 = tmp1 * tmp2 + tmp4 = tmp3.to(tl.float32) + tmp6 = tmp4 * tmp5 + tl.store(out_ptr0 + (x2), tmp6, None) diff --git a/torchinductor/7f/be95397d0c18f43f4314e0cac66d456d9d3e2b12116963a4bf988016e97f7a5e.best_config b/torchinductor/7f/be95397d0c18f43f4314e0cac66d456d9d3e2b12116963a4bf988016e97f7a5e.best_config new file mode 100644 index 0000000000000000000000000000000000000000..d376bfab6dd203c3be2ed98dc83eac3fdac99cd4 --- /dev/null +++ b/torchinductor/7f/be95397d0c18f43f4314e0cac66d456d9d3e2b12116963a4bf988016e97f7a5e.best_config @@ -0,0 +1 @@ +{"XBLOCK": 1024, "num_warps": 4, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 45, "triton_cache_hash": "EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ"} \ No newline at end of file diff --git a/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py b/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py new file mode 100644 index 0000000000000000000000000000000000000000..a55f1d57e8e29166b759b911bd95c270d13cace7 --- /dev/null +++ b/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py @@ -0,0 +1,30 @@ + +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 8388608}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_mul_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 3, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 67117056}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused_add_mul_1(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr): + xnumel = 8388608 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x2 = xindex + x0 = (xindex % 4096) + tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) + tmp1 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last').to(tl.float32) + tmp2 = tl.load(in_ptr2 + (x2), None).to(tl.float32) + tmp3 = tmp1 * tmp2 + tmp4 = tmp0 + tmp3 + tl.store(out_ptr0 + (x2), tmp4, None) diff --git a/torchinductor/a3/94dc88253134d772dc28ed260760d9a0059b054d472700be3c22dd06b228f22f.best_config b/torchinductor/a3/94dc88253134d772dc28ed260760d9a0059b054d472700be3c22dd06b228f22f.best_config new file mode 100644 index 0000000000000000000000000000000000000000..47c3098c0f7e55d5446b3abf3c618707c676e6b9 --- /dev/null +++ b/torchinductor/a3/94dc88253134d772dc28ed260760d9a0059b054d472700be3c22dd06b228f22f.best_config @@ -0,0 +1 @@ +{"XBLOCK": 1, "R0_BLOCK": 2048, "num_warps": 16, "num_stages": 1, "configs_hash": "ba27f374f6982634f1ab959ad1e63f726920cfc2c7c821f8e68ec55c3d4d94fc", "found_by_coordesc": false, "time_taken_ms": 35, "triton_cache_hash": "H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ"} \ No newline at end of file diff --git a/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py b/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py new file mode 100644 index 0000000000000000000000000000000000000000..f16537278f5cbb5b38ec2e7a1dfef3c34c40ef4c --- /dev/null +++ b/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py @@ -0,0 +1,78 @@ + +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.reduction( + size_hints={'x': 256, 'r0_': 4096}, + reduction_hint=ReductionHint.INNER, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*bf16', 'out_ptr0': '*bf16', 'out_ptr3': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_add_mul_native_layer_norm_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 6, 'num_store': 2, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'add_persistent_rblock': True, 'tiling_scores': {'x': 0, 'r0_': 12607488}} +) +@triton.jit +def triton_red_fused_add_mul_native_layer_norm_0(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr3, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): + xnumel = 256 + r0_numel = 4096 + rnumel = r0_numel + RBLOCK: tl.constexpr = R0_BLOCK + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None] + xmask = xindex < xnumel + r0_base = tl.arange(0, R0_BLOCK)[None, :] + rbase = r0_base + x0 = xindex + tmp7_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + tmp7_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + tmp7_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_1 = r0_index + tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32) + tmp1 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp2 = tl.load(in_ptr2 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32) + tmp3 = tmp1 * tmp2 + tmp4 = tmp0 + tmp3 + tmp5 = tmp4.to(tl.float32) + tmp6 = tl.broadcast_to(tmp5, [XBLOCK, R0_BLOCK]) + tmp7_mean_next, tmp7_m2_next, tmp7_weight_next = triton_helpers.welford_reduce( + tmp6, tmp7_mean, tmp7_m2, tmp7_weight, roffset == 0 + ) + tmp7_mean = tl.where(r0_mask & xmask, tmp7_mean_next, tmp7_mean) + tmp7_m2 = tl.where(r0_mask & xmask, tmp7_m2_next, tmp7_m2) + tmp7_weight = tl.where(r0_mask & xmask, tmp7_weight_next, tmp7_weight) + tl.store(out_ptr0 + (r0_1 + 4096*x0), tmp4, r0_mask & xmask) + tmp8, tmp9, tmp10 = triton_helpers.welford(tmp7_mean, tmp7_m2, tmp7_weight, 1) + tmp7 = tmp8[:, None] + tmp11 = tmp9[:, None] + tmp12 = tmp10[:, None] + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_1 = r0_index + tmp13 = tl.load(out_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32) + tmp23 = tl.load(in_ptr3 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp27 = tl.load(in_ptr4 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp14 = tmp13.to(tl.float32) + tmp15 = tmp14 - tmp7 + tmp16 = 4096.0 + tmp17 = (tmp11 / tmp16) + tmp18 = 1e-06 + tmp19 = tmp17 + tmp18 + tmp20 = libdevice.rsqrt(tmp19) + tmp21 = tmp15 * tmp20 + tmp22 = tmp21.to(tl.float32) + tmp24 = 1.0 + tmp25 = tmp23 + tmp24 + tmp26 = tmp22 * tmp25 + tmp28 = tmp26 + tmp27 + tl.store(out_ptr3 + (r0_1 + 4096*x0), tmp28, r0_mask & xmask) diff --git a/torchinductor/aotautograd/a27rkqg32yfaub3aygtms2gl3oet2qxfcnp4zxa3zy5h6c3risxz/aw5eda3h36wpnnltujgkb4mvobznersd4fuvo2p7vy2quujasos b/torchinductor/aotautograd/a27rkqg32yfaub3aygtms2gl3oet2qxfcnp4zxa3zy5h6c3risxz/aw5eda3h36wpnnltujgkb4mvobznersd4fuvo2p7vy2quujasos new file mode 100644 index 0000000000000000000000000000000000000000..a75c75963a7dd128def3a7cc44c38642f52132b6 Binary files /dev/null and b/torchinductor/aotautograd/a27rkqg32yfaub3aygtms2gl3oet2qxfcnp4zxa3zy5h6c3risxz/aw5eda3h36wpnnltujgkb4mvobznersd4fuvo2p7vy2quujasos differ diff --git a/torchinductor/aotautograd/a3443o3ywoehrda4trn5q47mauudwcinftvd52hitdnfmakyhqc4/lw6yvpbd45y77sg6fh5v4otbinchkwbf7b56u3rh3wgq3x2wkhq b/torchinductor/aotautograd/a3443o3ywoehrda4trn5q47mauudwcinftvd52hitdnfmakyhqc4/lw6yvpbd45y77sg6fh5v4otbinchkwbf7b56u3rh3wgq3x2wkhq new file mode 100644 index 0000000000000000000000000000000000000000..85f549011b728e4dd378ceb11df78ff9df78b4f2 Binary files /dev/null and b/torchinductor/aotautograd/a3443o3ywoehrda4trn5q47mauudwcinftvd52hitdnfmakyhqc4/lw6yvpbd45y77sg6fh5v4otbinchkwbf7b56u3rh3wgq3x2wkhq differ diff --git a/torchinductor/aotautograd/a3554ihbxq57jan4ib74iqo5mnaqevqume4yzewukzkm6ehpsilz/eubahghkef62rmchvnle5v6h3ddip4av5qqjxomdlm7ura45qve b/torchinductor/aotautograd/a3554ihbxq57jan4ib74iqo5mnaqevqume4yzewukzkm6ehpsilz/eubahghkef62rmchvnle5v6h3ddip4av5qqjxomdlm7ura45qve new file mode 100644 index 0000000000000000000000000000000000000000..b344ce096a4759a5fc1c7c9751de14002b25b079 Binary files /dev/null and b/torchinductor/aotautograd/a3554ihbxq57jan4ib74iqo5mnaqevqume4yzewukzkm6ehpsilz/eubahghkef62rmchvnle5v6h3ddip4av5qqjxomdlm7ura45qve differ diff --git a/torchinductor/aotautograd/a3hojixb5fzn7f7jfco3ddoohdsuggk4qbop3lcg7rjy3e7fkgfz/o7wvolbgborwtoofbovayor23y4ubooymfcvv6jeqm2wbx3n2cs b/torchinductor/aotautograd/a3hojixb5fzn7f7jfco3ddoohdsuggk4qbop3lcg7rjy3e7fkgfz/o7wvolbgborwtoofbovayor23y4ubooymfcvv6jeqm2wbx3n2cs new file mode 100644 index 0000000000000000000000000000000000000000..c25ad76a5497cbb250d6f704688a1c333ae61aaf Binary files /dev/null and b/torchinductor/aotautograd/a3hojixb5fzn7f7jfco3ddoohdsuggk4qbop3lcg7rjy3e7fkgfz/o7wvolbgborwtoofbovayor23y4ubooymfcvv6jeqm2wbx3n2cs differ diff --git a/torchinductor/aotautograd/a54twb2qknddjxnxtmkoagy3umo5y3ptsesm2pdhy7nkefklf6wx/emxzj524wmpvifsxw4dsnnkzemqpzfgkenbo5obwmksvlhsr354 b/torchinductor/aotautograd/a54twb2qknddjxnxtmkoagy3umo5y3ptsesm2pdhy7nkefklf6wx/emxzj524wmpvifsxw4dsnnkzemqpzfgkenbo5obwmksvlhsr354 new file mode 100644 index 0000000000000000000000000000000000000000..22b825fa8a9a866f1ef9dd4c024a3c0d4c2f99a3 Binary files /dev/null and b/torchinductor/aotautograd/a54twb2qknddjxnxtmkoagy3umo5y3ptsesm2pdhy7nkefklf6wx/emxzj524wmpvifsxw4dsnnkzemqpzfgkenbo5obwmksvlhsr354 differ diff --git a/torchinductor/aotautograd/a5ksywxhfabbequvxwstheyyj5w3sinuubxcrypqjwbqsyw5la3l/ew4fxjyfoflznyuws2w2ylu4p7owpjuqoshsef75w43w2vvwejd b/torchinductor/aotautograd/a5ksywxhfabbequvxwstheyyj5w3sinuubxcrypqjwbqsyw5la3l/ew4fxjyfoflznyuws2w2ylu4p7owpjuqoshsef75w43w2vvwejd new file mode 100644 index 0000000000000000000000000000000000000000..0cbab3e3bda3aaf0f712cadbf3bee91f8624c693 Binary files /dev/null and b/torchinductor/aotautograd/a5ksywxhfabbequvxwstheyyj5w3sinuubxcrypqjwbqsyw5la3l/ew4fxjyfoflznyuws2w2ylu4p7owpjuqoshsef75w43w2vvwejd differ diff --git a/torchinductor/aotautograd/a7ptufzlocphh5n5o5u63gfzkf74tjb3l5is45u5hqjspv32qda6/an4kgppgf4vt5yfvvghrnmho6jc3qnj4l6c75zrsiotr5d4u5gv b/torchinductor/aotautograd/a7ptufzlocphh5n5o5u63gfzkf74tjb3l5is45u5hqjspv32qda6/an4kgppgf4vt5yfvvghrnmho6jc3qnj4l6c75zrsiotr5d4u5gv new file mode 100644 index 0000000000000000000000000000000000000000..8cb686adafd1e0881524b4544c1117acc700794d Binary files /dev/null and b/torchinductor/aotautograd/a7ptufzlocphh5n5o5u63gfzkf74tjb3l5is45u5hqjspv32qda6/an4kgppgf4vt5yfvvghrnmho6jc3qnj4l6c75zrsiotr5d4u5gv differ diff --git a/torchinductor/aotautograd/aal6kceyfi7eazavxzpgcec5hzt32bkwo7p4doeyc56ubzlwuvx4/nkoni3ckgbheucucq64bmrta4lhz7x237lalaqcrejvdc3supg4 b/torchinductor/aotautograd/aal6kceyfi7eazavxzpgcec5hzt32bkwo7p4doeyc56ubzlwuvx4/nkoni3ckgbheucucq64bmrta4lhz7x237lalaqcrejvdc3supg4 new file mode 100644 index 0000000000000000000000000000000000000000..0a851d7123d5e434f42c8fb9c164ee8c4dcb33ac Binary files /dev/null and b/torchinductor/aotautograd/aal6kceyfi7eazavxzpgcec5hzt32bkwo7p4doeyc56ubzlwuvx4/nkoni3ckgbheucucq64bmrta4lhz7x237lalaqcrejvdc3supg4 differ diff --git a/torchinductor/aotautograd/aan5kpy6i54rnpeu5vlzbx6i6blimsvhducl7futzdjr4xciy472/a35s4usnkzmh6ybhedo3b6zehfepmwdv2gxscayjeeuucr3zat7 b/torchinductor/aotautograd/aan5kpy6i54rnpeu5vlzbx6i6blimsvhducl7futzdjr4xciy472/a35s4usnkzmh6ybhedo3b6zehfepmwdv2gxscayjeeuucr3zat7 new file mode 100644 index 0000000000000000000000000000000000000000..a875cbae71a6435797a6678651cad0a75d4a4615 Binary files /dev/null and b/torchinductor/aotautograd/aan5kpy6i54rnpeu5vlzbx6i6blimsvhducl7futzdjr4xciy472/a35s4usnkzmh6ybhedo3b6zehfepmwdv2gxscayjeeuucr3zat7 differ diff --git a/torchinductor/aotautograd/aesonb7djseswkbtu2qzhvg6ikd5rewxnqlt6pwuytadpxxmjcod/lap2sypphhofd6d5rhojruk2vfyvw2olc7gtulmom4i5y7ix2cp b/torchinductor/aotautograd/aesonb7djseswkbtu2qzhvg6ikd5rewxnqlt6pwuytadpxxmjcod/lap2sypphhofd6d5rhojruk2vfyvw2olc7gtulmom4i5y7ix2cp new file mode 100644 index 0000000000000000000000000000000000000000..c8d86999d1eed7e32dbdbe87172016ef36de3d11 Binary files /dev/null and b/torchinductor/aotautograd/aesonb7djseswkbtu2qzhvg6ikd5rewxnqlt6pwuytadpxxmjcod/lap2sypphhofd6d5rhojruk2vfyvw2olc7gtulmom4i5y7ix2cp differ diff --git a/torchinductor/aotautograd/age65c4dyk2rxcqufpxd6bsafzao7tacrsvejbf3pjbsngnoashv/upzttal3jaj233iyzyps7mjpq75jt6qi6rzramvgyyewfg76h6s b/torchinductor/aotautograd/age65c4dyk2rxcqufpxd6bsafzao7tacrsvejbf3pjbsngnoashv/upzttal3jaj233iyzyps7mjpq75jt6qi6rzramvgyyewfg76h6s new file mode 100644 index 0000000000000000000000000000000000000000..c4d65e6647fc42a8a82752e219d65675a3bc294c Binary files /dev/null and b/torchinductor/aotautograd/age65c4dyk2rxcqufpxd6bsafzao7tacrsvejbf3pjbsngnoashv/upzttal3jaj233iyzyps7mjpq75jt6qi6rzramvgyyewfg76h6s differ diff --git a/torchinductor/aotautograd/ahkpwjcp2qqyj6wu2ckjqlrit2pbb3ig3ddi75hgbkgngvvipwyq/ha76p7wv3nimmrgvx6kdiqikd6adbw7nlnaiars5ey4anx46mwn b/torchinductor/aotautograd/ahkpwjcp2qqyj6wu2ckjqlrit2pbb3ig3ddi75hgbkgngvvipwyq/ha76p7wv3nimmrgvx6kdiqikd6adbw7nlnaiars5ey4anx46mwn new file mode 100644 index 0000000000000000000000000000000000000000..920ab76a22529e945f9034f1743b80ab496da06f Binary files /dev/null and b/torchinductor/aotautograd/ahkpwjcp2qqyj6wu2ckjqlrit2pbb3ig3ddi75hgbkgngvvipwyq/ha76p7wv3nimmrgvx6kdiqikd6adbw7nlnaiars5ey4anx46mwn differ diff --git a/torchinductor/aotautograd/aiojzczi5txclvaydkrk5g3qlf33pdkkhxtefkhfphkpc3o6rr4p/w3n37k3qhqfhuewneurnairyblp3h7nrak6oyp2p3um7uwnfcz5 b/torchinductor/aotautograd/aiojzczi5txclvaydkrk5g3qlf33pdkkhxtefkhfphkpc3o6rr4p/w3n37k3qhqfhuewneurnairyblp3h7nrak6oyp2p3um7uwnfcz5 new file mode 100644 index 0000000000000000000000000000000000000000..31443751b6c080f9631942333d1ddbdce5c45484 Binary files /dev/null and b/torchinductor/aotautograd/aiojzczi5txclvaydkrk5g3qlf33pdkkhxtefkhfphkpc3o6rr4p/w3n37k3qhqfhuewneurnairyblp3h7nrak6oyp2p3um7uwnfcz5 differ diff --git a/torchinductor/aotautograd/ajdkg3gacw25klanvqotc3mkab3mi23jtjpagxrosdmqv3d4yg7v/ejzrqbsrchqzxfppkzo4ep7edhv7lrjjbcdxkxvodbk4vvk3b62 b/torchinductor/aotautograd/ajdkg3gacw25klanvqotc3mkab3mi23jtjpagxrosdmqv3d4yg7v/ejzrqbsrchqzxfppkzo4ep7edhv7lrjjbcdxkxvodbk4vvk3b62 new file mode 100644 index 0000000000000000000000000000000000000000..887f90362faa59ade6fd988b632f6d95c7b035e9 Binary files /dev/null and b/torchinductor/aotautograd/ajdkg3gacw25klanvqotc3mkab3mi23jtjpagxrosdmqv3d4yg7v/ejzrqbsrchqzxfppkzo4ep7edhv7lrjjbcdxkxvodbk4vvk3b62 differ diff --git a/torchinductor/aotautograd/amb262dx57ptj6gg2ch6skr372w6arsr3i7i4ed5pljhiycuxduw/fntav2w4z5lvr443jxseqalau2vuzp7x7ljd3hanoqubtutjkvp b/torchinductor/aotautograd/amb262dx57ptj6gg2ch6skr372w6arsr3i7i4ed5pljhiycuxduw/fntav2w4z5lvr443jxseqalau2vuzp7x7ljd3hanoqubtutjkvp new file mode 100644 index 0000000000000000000000000000000000000000..f4347151aed10d5cb40f2b46b2695063148b8e89 Binary files /dev/null and b/torchinductor/aotautograd/amb262dx57ptj6gg2ch6skr372w6arsr3i7i4ed5pljhiycuxduw/fntav2w4z5lvr443jxseqalau2vuzp7x7ljd3hanoqubtutjkvp differ diff --git a/torchinductor/aotautograd/amjjivi2p6firai3idkjgfxyy6z4prevujsjdno2uuchwvd7xqll/enc6ruqcyggs4mnt54tjdd2lvexcvipd5vhhamxwcj77g5fpyof b/torchinductor/aotautograd/amjjivi2p6firai3idkjgfxyy6z4prevujsjdno2uuchwvd7xqll/enc6ruqcyggs4mnt54tjdd2lvexcvipd5vhhamxwcj77g5fpyof new file mode 100644 index 0000000000000000000000000000000000000000..cc09fbd335287f63644161ab26d5d9b11c1aaf44 Binary files /dev/null and b/torchinductor/aotautograd/amjjivi2p6firai3idkjgfxyy6z4prevujsjdno2uuchwvd7xqll/enc6ruqcyggs4mnt54tjdd2lvexcvipd5vhhamxwcj77g5fpyof differ diff --git a/torchinductor/aotautograd/apfaqlwe555qd2zoz575w5mvoxoiasmcomkv76mhz5zvnm5jok66/epmli5r46rzrqf73pqrnb5tratdg3mbbwdf5vyzqr6ejyhnooye b/torchinductor/aotautograd/apfaqlwe555qd2zoz575w5mvoxoiasmcomkv76mhz5zvnm5jok66/epmli5r46rzrqf73pqrnb5tratdg3mbbwdf5vyzqr6ejyhnooye new file mode 100644 index 0000000000000000000000000000000000000000..d17c574375648b55e711b6e3f3c095ccebe5e310 Binary files /dev/null and b/torchinductor/aotautograd/apfaqlwe555qd2zoz575w5mvoxoiasmcomkv76mhz5zvnm5jok66/epmli5r46rzrqf73pqrnb5tratdg3mbbwdf5vyzqr6ejyhnooye differ diff --git a/torchinductor/aotautograd/asjbg7f735jw54kcldmvv5uost22wzpy3hkxgaihos4rllvagheu/lwqpsnp52rszp2nlwkgi33embno5st2u5bxfm4rpyoy6fql5aor b/torchinductor/aotautograd/asjbg7f735jw54kcldmvv5uost22wzpy3hkxgaihos4rllvagheu/lwqpsnp52rszp2nlwkgi33embno5st2u5bxfm4rpyoy6fql5aor new file mode 100644 index 0000000000000000000000000000000000000000..b0703d77e52c8559c1d465e355c9224829b81166 Binary files /dev/null and b/torchinductor/aotautograd/asjbg7f735jw54kcldmvv5uost22wzpy3hkxgaihos4rllvagheu/lwqpsnp52rszp2nlwkgi33embno5st2u5bxfm4rpyoy6fql5aor differ diff --git a/torchinductor/aotautograd/atc2ggqhejcse5aydwh2wjakijsc2dyhqjxwdqrwpra3mgjwe4st/xwy7lzraqocjillvk4s2yc2qhpkx43s2nbkxmeb2wpph3sgyc7n b/torchinductor/aotautograd/atc2ggqhejcse5aydwh2wjakijsc2dyhqjxwdqrwpra3mgjwe4st/xwy7lzraqocjillvk4s2yc2qhpkx43s2nbkxmeb2wpph3sgyc7n new file mode 100644 index 0000000000000000000000000000000000000000..a8c5090c898a09145b0b84a61f8ba426a01716fe Binary files /dev/null and b/torchinductor/aotautograd/atc2ggqhejcse5aydwh2wjakijsc2dyhqjxwdqrwpra3mgjwe4st/xwy7lzraqocjillvk4s2yc2qhpkx43s2nbkxmeb2wpph3sgyc7n differ diff --git a/torchinductor/aotautograd/atsevoi6zqdcnehuxassvjosi3j5vrk54uisibylfgspeewp6vyx/4sfzv7d6ch2yoi6nnr5ym3i6yibku3vfveyrr6sx6dqbmavxo32 b/torchinductor/aotautograd/atsevoi6zqdcnehuxassvjosi3j5vrk54uisibylfgspeewp6vyx/4sfzv7d6ch2yoi6nnr5ym3i6yibku3vfveyrr6sx6dqbmavxo32 new file mode 100644 index 0000000000000000000000000000000000000000..e6289bfe09c5a26fff81a4e57e466a9bd84c53fb Binary files /dev/null and b/torchinductor/aotautograd/atsevoi6zqdcnehuxassvjosi3j5vrk54uisibylfgspeewp6vyx/4sfzv7d6ch2yoi6nnr5ym3i6yibku3vfveyrr6sx6dqbmavxo32 differ diff --git a/torchinductor/aotautograd/ax7bbwqbruobasu7vagn2oj2owh5vgosxbjelta324rvf4tkesd4/ipnutob47ydixp2zetluyw4apg7fe5sfkkiianwaawh6yq3uang b/torchinductor/aotautograd/ax7bbwqbruobasu7vagn2oj2owh5vgosxbjelta324rvf4tkesd4/ipnutob47ydixp2zetluyw4apg7fe5sfkkiianwaawh6yq3uang new file mode 100644 index 0000000000000000000000000000000000000000..0b86a5c999c016f475cf83332f02f65c0f0ea13b Binary files /dev/null and b/torchinductor/aotautograd/ax7bbwqbruobasu7vagn2oj2owh5vgosxbjelta324rvf4tkesd4/ipnutob47ydixp2zetluyw4apg7fe5sfkkiianwaawh6yq3uang differ diff --git a/torchinductor/aotautograd/ay26zyuzpll2prvy7zzoeydo7r47lrr6s6jcmzi2zmytjxzebmnz/nzx7lukg3r25p6sjlwtqmkf6gmgzuq7iwagwki2x4kvhw5ducr5 b/torchinductor/aotautograd/ay26zyuzpll2prvy7zzoeydo7r47lrr6s6jcmzi2zmytjxzebmnz/nzx7lukg3r25p6sjlwtqmkf6gmgzuq7iwagwki2x4kvhw5ducr5 new file mode 100644 index 0000000000000000000000000000000000000000..cba52096030d21d51f66a850a98a6e47f8d5a7b4 Binary files /dev/null and b/torchinductor/aotautograd/ay26zyuzpll2prvy7zzoeydo7r47lrr6s6jcmzi2zmytjxzebmnz/nzx7lukg3r25p6sjlwtqmkf6gmgzuq7iwagwki2x4kvhw5ducr5 differ diff --git a/torchinductor/aotautograd/ay65riayezoo7bqggl72pzrzdi6lvy5mp23ajx4f453ylzpmve3s/p7clvcke3bsgsaumutstrxc7bkq4tq6yoia7nwigana3n3unini b/torchinductor/aotautograd/ay65riayezoo7bqggl72pzrzdi6lvy5mp23ajx4f453ylzpmve3s/p7clvcke3bsgsaumutstrxc7bkq4tq6yoia7nwigana3n3unini new file mode 100644 index 0000000000000000000000000000000000000000..9f2a4422836d443257787e673b5b9181c112db72 Binary files /dev/null and b/torchinductor/aotautograd/ay65riayezoo7bqggl72pzrzdi6lvy5mp23ajx4f453ylzpmve3s/p7clvcke3bsgsaumutstrxc7bkq4tq6yoia7nwigana3n3unini differ diff --git a/torchinductor/aotautograd/azyih32olvhzuay5zpfypzhk2cdlosvaqxdhcnjzlwfs6k3a2ne6/5sz2kjdze7ixdny7hz24p4uma7uup7chdcpiumqznifqn4mpmqb b/torchinductor/aotautograd/azyih32olvhzuay5zpfypzhk2cdlosvaqxdhcnjzlwfs6k3a2ne6/5sz2kjdze7ixdny7hz24p4uma7uup7chdcpiumqznifqn4mpmqb new file mode 100644 index 0000000000000000000000000000000000000000..87f1a9ed39ece421a9c1c73639f23090ac3c8d71 Binary files /dev/null and b/torchinductor/aotautograd/azyih32olvhzuay5zpfypzhk2cdlosvaqxdhcnjzlwfs6k3a2ne6/5sz2kjdze7ixdny7hz24p4uma7uup7chdcpiumqznifqn4mpmqb differ diff --git a/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py b/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py new file mode 100644 index 0000000000000000000000000000000000000000..87fa8f5c1041dafcba46e5cf4f29b4c2186765e9 --- /dev/null +++ b/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py @@ -0,0 +1,73 @@ + +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.reduction( + size_hints={'x': 256, 'r0_': 4096}, + reduction_hint=ReductionHint.INNER, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr2': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_add_mul_native_layer_norm_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 4, 'num_store': 1, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'add_persistent_rblock': True, 'tiling_scores': {'x': 0, 'r0_': 6307840}} +) +@triton.jit +def triton_red_fused_add_mul_native_layer_norm_1(in_ptr0, in_ptr1, in_ptr2, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): + xnumel = 256 + r0_numel = 4096 + rnumel = r0_numel + RBLOCK: tl.constexpr = R0_BLOCK + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None] + xmask = xindex < xnumel + r0_base = tl.arange(0, R0_BLOCK)[None, :] + rbase = r0_base + x0 = xindex + tmp3_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + tmp3_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + tmp3_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_1 = r0_index + tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp1 = tmp0.to(tl.float32) + tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK]) + tmp3_mean_next, tmp3_m2_next, tmp3_weight_next = triton_helpers.welford_reduce( + tmp2, tmp3_mean, tmp3_m2, tmp3_weight, roffset == 0 + ) + tmp3_mean = tl.where(r0_mask & xmask, tmp3_mean_next, tmp3_mean) + tmp3_m2 = tl.where(r0_mask & xmask, tmp3_m2_next, tmp3_m2) + tmp3_weight = tl.where(r0_mask & xmask, tmp3_weight_next, tmp3_weight) + tmp4, tmp5, tmp6 = triton_helpers.welford(tmp3_mean, tmp3_m2, tmp3_weight, 1) + tmp3 = tmp4[:, None] + tmp7 = tmp5[:, None] + tmp8 = tmp6[:, None] + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_1 = r0_index + tmp9 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp12 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32) + tmp23 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp10 = 1.0 + tmp11 = tmp9 + tmp10 + tmp13 = tmp12.to(tl.float32) + tmp14 = tmp13 - tmp3 + tmp15 = 4096.0 + tmp16 = (tmp7 / tmp15) + tmp17 = 1e-06 + tmp18 = tmp16 + tmp17 + tmp19 = libdevice.rsqrt(tmp18) + tmp20 = tmp14 * tmp19 + tmp21 = tmp20.to(tl.float32) + tmp22 = tmp11 * tmp21 + tmp24 = tmp22 + tmp23 + tl.store(out_ptr2 + (r0_1 + 4096*x0), tmp24, r0_mask & xmask) diff --git a/torchinductor/av/d186a24d3c8af5514b42dea48fc981efd3f5afb7bba6c30406e42c75862888b1.best_config b/torchinductor/av/d186a24d3c8af5514b42dea48fc981efd3f5afb7bba6c30406e42c75862888b1.best_config new file mode 100644 index 0000000000000000000000000000000000000000..7c8326bc2575b9a99082ac834be7ee0544765495 --- /dev/null +++ b/torchinductor/av/d186a24d3c8af5514b42dea48fc981efd3f5afb7bba6c30406e42c75862888b1.best_config @@ -0,0 +1 @@ +{"XBLOCK": 1, "R0_BLOCK": 4096, "num_warps": 16, "num_stages": 1, "configs_hash": "ba27f374f6982634f1ab959ad1e63f726920cfc2c7c821f8e68ec55c3d4d94fc", "found_by_coordesc": false, "time_taken_ms": 33, "triton_cache_hash": "CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA"} \ No newline at end of file diff --git a/torchinductor/ay/cayicsdjyjxzpcmkvjbneubnqkuhs3y37qiwy5qlel3z2loa4qav.py b/torchinductor/ay/cayicsdjyjxzpcmkvjbneubnqkuhs3y37qiwy5qlel3z2loa4qav.py new file mode 100644 index 0000000000000000000000000000000000000000..20443b971a2cf1b0b349bcc41a04f57441227ec2 --- /dev/null +++ b/torchinductor/ay/cayicsdjyjxzpcmkvjbneubnqkuhs3y37qiwy5qlel3z2loa4qav.py @@ -0,0 +1,69 @@ +# AOT ID: ['1_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + arg0_1, arg1_1 = args + args.clear() + assert_size_stride(arg0_1, (4096, 12288), (1, 4096)) + assert_size_stride(arg1_1, (1, 1), (1, 1)) + return (aten.view.dtype(reinterpret_tensor(arg0_1, (12288, 4096), (4096, 1), 0), torch.uint8), reinterpret_tensor(arg1_1, (1, ), (1, ), 0), ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = rand_strided((4096, 12288), (1, 4096), device='cuda:0', dtype=torch.float8_e4m3fn) + arg1_1 = rand_strided((1, 1), (1, 1), device='cuda:0', dtype=torch.float32) + fn = lambda: call([arg0_1, arg1_1]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/torchinductor/bv/7969eba2eb589b95d2894ee75ee67ba01cd2bee09cd64d315c70c0950888c19e.best_config b/torchinductor/bv/7969eba2eb589b95d2894ee75ee67ba01cd2bee09cd64d315c70c0950888c19e.best_config new file mode 100644 index 0000000000000000000000000000000000000000..e1fbfaeab573ca9626bf597a65efd90f0ec324ab --- /dev/null +++ b/torchinductor/bv/7969eba2eb589b95d2894ee75ee67ba01cd2bee09cd64d315c70c0950888c19e.best_config @@ -0,0 +1 @@ +{"XBLOCK": 2, "R0_BLOCK": 128, "num_warps": 2, "num_stages": 1, "configs_hash": "6ffa43f2ca8cb1499f3ff3fbf8c975f2c07eef9b57fcecda113029ab12cbef66", "found_by_coordesc": false, "time_taken_ms": 307, "triton_cache_hash": "AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A"} \ No newline at end of file diff --git a/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py b/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py new file mode 100644 index 0000000000000000000000000000000000000000..08757a34f04fdfbafcdc149b68963d655d5da3c3 --- /dev/null +++ b/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py @@ -0,0 +1,162 @@ + +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.reduction( + size_hints={'x': 131072, 'r0_': 128}, + reduction_hint=ReductionHint.DEFAULT, + filename=__file__, + triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_out_ptr1': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0', 'mutated_arg_names': ['in_out_ptr0', 'in_out_ptr1'], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 16, 'num_store': 2, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 0, 'r0_': 115606016}} +) +@triton.jit +def triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(in_out_ptr0, in_out_ptr1, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): + xnumel = 73728 + r0_numel = 128 + rnumel = r0_numel + RBLOCK: tl.constexpr = R0_BLOCK + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None] + xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) + r0_base = tl.arange(0, R0_BLOCK)[None, :] + rbase = r0_base + x0 = (xindex % 32) + x1 = xindex // 32 + _tmp4 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) + x5 = xindex + _tmp10 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_2 = r0_index + tmp0 = tl.load(in_ptr0 + (4096 + r0_2 + 128*x0 + 36864*x1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp6 = tl.load(in_ptr0 + (r0_2 + 128*x0 + 36864*x1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp1 = tmp0.to(tl.float32) + tmp2 = tmp1 * tmp1 + tmp3 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK]) + tmp5 = _tmp4 + tmp3 + _tmp4 = tl.where(r0_mask, tmp5, _tmp4) + tmp7 = tmp6.to(tl.float32) + tmp8 = tmp7 * tmp7 + tmp9 = tl.broadcast_to(tmp8, [XBLOCK, R0_BLOCK]) + tmp11 = _tmp10 + tmp9 + _tmp10 = tl.where(r0_mask, tmp11, _tmp10) + tmp4 = tl.sum(_tmp4, 1)[:, None] + tmp10 = tl.sum(_tmp10, 1)[:, None] + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_3 = (r0_index % 2) + r0_4 = r0_index // 2 + r0_2 = r0_index + tmp50 = tl.load(in_ptr0 + (r0_2 + 128*x0 + 36864*x1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp58 = tl.load(in_ptr1 + (r0_2), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp63 = tl.load(in_ptr2 + (r0_2 + 128*x1), r0_mask, eviction_policy='evict_last', other=0.0) + tmp66 = tl.load(in_ptr3 + (r0_2 + 128*x1), r0_mask, eviction_policy='evict_last', other=0.0) + tmp96 = tl.load(in_ptr0 + (4096 + r0_2 + 128*x0 + 36864*x1), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) + tmp102 = tl.load(in_ptr4 + (r0_2), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp12 = r0_3 + tmp13 = tl.full([1, 1], 0, tl.int64) + tmp14 = tmp12 >= tmp13 + tmp15 = tl.full([1, 1], 1, tl.int64) + tmp16 = tmp12 < tmp15 + tmp17 = tl.load(in_ptr0 + (1 + 2*r0_4 + 128*x0 + 36864*x1), r0_mask & tmp16, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp18 = tmp17.to(tl.float32) + tmp19 = 128.0 + tmp20 = (tmp10 / tmp19) + tmp21 = 1e-06 + tmp22 = tmp20 + tmp21 + tmp23 = libdevice.rsqrt(tmp22) + tmp24 = tmp18 * tmp23 + tmp25 = tl.load(in_ptr1 + (tl.broadcast_to(1 + 2*r0_4, [XBLOCK, R0_BLOCK])), r0_mask & tmp16, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp26 = tmp25.to(tl.float32) + tmp27 = tmp24 * tmp26 + tmp28 = tmp27.to(tl.float32) + tmp29 = -tmp28 + tmp30 = tl.full(tmp29.shape, 0.0, tmp29.dtype) + tmp31 = tl.where(tmp16, tmp29, tmp30) + tmp32 = tmp12 >= tmp15 + tmp33 = tl.full([1, 1], 2, tl.int64) + tmp34 = tmp12 < tmp33 + tmp35 = tl.load(in_ptr0 + (2*r0_4 + 128*x0 + 36864*x1), r0_mask & tmp32, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp36 = tmp35.to(tl.float32) + tmp37 = 128.0 + tmp38 = (tmp10 / tmp37) + tmp39 = 1e-06 + tmp40 = tmp38 + tmp39 + tmp41 = libdevice.rsqrt(tmp40) + tmp42 = tmp36 * tmp41 + tmp43 = tl.load(in_ptr1 + (tl.broadcast_to(2*r0_4, [XBLOCK, R0_BLOCK])), r0_mask & tmp32, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp44 = tmp43.to(tl.float32) + tmp45 = tmp42 * tmp44 + tmp46 = tmp45.to(tl.float32) + tmp47 = tl.full(tmp46.shape, 0.0, tmp46.dtype) + tmp48 = tl.where(tmp32, tmp46, tmp47) + tmp49 = tl.where(tmp16, tmp31, tmp48) + tmp51 = tmp50.to(tl.float32) + tmp52 = 128.0 + tmp53 = (tmp10 / tmp52) + tmp54 = 1e-06 + tmp55 = tmp53 + tmp54 + tmp56 = libdevice.rsqrt(tmp55) + tmp57 = tmp51 * tmp56 + tmp59 = tmp58.to(tl.float32) + tmp60 = tmp57 * tmp59 + tmp61 = tmp60.to(tl.float32) + tmp62 = tmp61.to(tl.float32) + tmp64 = tmp62 * tmp63 + tmp65 = tmp49.to(tl.float32) + tmp67 = tmp65 * tmp66 + tmp68 = tmp64 + tmp67 + tmp69 = tmp68.to(tl.float32) + tmp70 = tl.load(in_ptr0 + (4097 + 2*r0_4 + 128*x0 + 36864*x1), r0_mask & tmp16, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp71 = tmp70.to(tl.float32) + tmp72 = (tmp4 / tmp19) + tmp73 = tmp72 + tmp21 + tmp74 = libdevice.rsqrt(tmp73) + tmp75 = tmp71 * tmp74 + tmp76 = tl.load(in_ptr4 + (tl.broadcast_to(1 + 2*r0_4, [XBLOCK, R0_BLOCK])), r0_mask & tmp16, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp77 = tmp76.to(tl.float32) + tmp78 = tmp75 * tmp77 + tmp79 = tmp78.to(tl.float32) + tmp80 = -tmp79 + tmp81 = tl.full(tmp80.shape, 0.0, tmp80.dtype) + tmp82 = tl.where(tmp16, tmp80, tmp81) + tmp83 = tl.load(in_ptr0 + (4096 + 2*r0_4 + 128*x0 + 36864*x1), r0_mask & tmp32, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp84 = tmp83.to(tl.float32) + tmp85 = (tmp4 / tmp37) + tmp86 = tmp85 + tmp39 + tmp87 = libdevice.rsqrt(tmp86) + tmp88 = tmp84 * tmp87 + tmp89 = tl.load(in_ptr4 + (tl.broadcast_to(2*r0_4, [XBLOCK, R0_BLOCK])), r0_mask & tmp32, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp90 = tmp89.to(tl.float32) + tmp91 = tmp88 * tmp90 + tmp92 = tmp91.to(tl.float32) + tmp93 = tl.full(tmp92.shape, 0.0, tmp92.dtype) + tmp94 = tl.where(tmp32, tmp92, tmp93) + tmp95 = tl.where(tmp16, tmp82, tmp94) + tmp97 = tmp96.to(tl.float32) + tmp98 = (tmp4 / tmp52) + tmp99 = tmp98 + tmp54 + tmp100 = libdevice.rsqrt(tmp99) + tmp101 = tmp97 * tmp100 + tmp103 = tmp102.to(tl.float32) + tmp104 = tmp101 * tmp103 + tmp105 = tmp104.to(tl.float32) + tmp106 = tmp105.to(tl.float32) + tmp107 = tmp106 * tmp63 + tmp108 = tmp95.to(tl.float32) + tmp109 = tmp108 * tmp66 + tmp110 = tmp107 + tmp109 + tmp111 = tmp110.to(tl.float32) + tl.store(in_out_ptr0 + (r0_2 + 128*x5), tmp69, r0_mask) + tl.store(in_out_ptr1 + (r0_2 + 128*x5), tmp111, r0_mask) diff --git a/torchinductor/cr/ccr2gijy4jp6vvdbewmzgaogxbf5as7ytxtou4zo2yelawomrjjg.py b/torchinductor/cr/ccr2gijy4jp6vvdbewmzgaogxbf5as7ytxtou4zo2yelawomrjjg.py new file mode 100644 index 0000000000000000000000000000000000000000..f4c5bc00599317e3aa2565f0ca8806433600015c --- /dev/null +++ b/torchinductor/cr/ccr2gijy4jp6vvdbewmzgaogxbf5as7ytxtou4zo2yelawomrjjg.py @@ -0,0 +1,131 @@ +# AOT ID: ['21_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels +import triton +import triton.language as tl +from torch._inductor.runtime.triton_heuristics import start_graph, end_graph +from torch._C import _cuda_getCurrentRawStream as get_raw_stream + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py +# Topologically Sorted Source Nodes: [chunk, silu, x], Original ATen: [aten.split, aten.silu, aten.mul] +# Source node to ATen node mapping: +# chunk => split +# silu => convert_element_type, convert_element_type_1, mul_6, sigmoid +# x => mul_10 +# Graph fragment: +# %arg1_1 : Tensor "bf16[1, s67, 24576][24576*s67, 24576, 1]cuda:0" = PlaceHolder[target=arg1_1] +# %split : [num_users=2] = call_function[target=torch.ops.aten.split.Tensor](args = (%arg1_1, 12288, -1), kwargs = {}) +# %convert_element_type : Tensor "f32[1, s67, 12288][12288*s67, 12288, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%getitem, torch.float32), kwargs = {}) +# %sigmoid : Tensor "f32[1, s67, 12288][12288*s67, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type,), kwargs = {}) +# %mul_6 : Tensor "f32[1, s67, 12288][12288*s67, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type, %sigmoid), kwargs = {}) +# %convert_element_type_1 : Tensor "bf16[1, s67, 12288][12288*s67, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_6, torch.bfloat16), kwargs = {}) +# %mul_10 : Tensor "bf16[1, s67, 12288][12288*s67, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_1, %getitem_1), kwargs = {}) +# return %mul_10 +triton_poi_fused_mul_silu_split_0 = async_compile.triton('triton_poi_fused_mul_silu_split_0', ''' +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 4194304}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_split_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 2, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused_mul_silu_split_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x0 = (xindex % 12288) + x1 = xindex // 12288 + x2 = xindex + tmp0 = tl.load(in_ptr0 + (x0 + 24576*x1), None).to(tl.float32) + tmp5 = tl.load(in_ptr0 + (12288 + x0 + 24576*x1), None).to(tl.float32) + tmp1 = tmp0.to(tl.float32) + tmp2 = tl.sigmoid(tmp1) + tmp3 = tmp1 * tmp2 + tmp4 = tmp3.to(tl.float32) + tmp6 = tmp4 * tmp5 + tl.store(out_ptr0 + (x2), tmp6, None) +''', device_str='cuda') + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + arg0_1, arg1_1 = args + args.clear() + s67 = arg0_1 + assert_size_stride(arg1_1, (1, s67, 24576), (24576*s67, 24576, 1)) + with torch.cuda._DeviceGuard(0): + torch.cuda.set_device(0) + buf0 = empty_strided_cuda((1, s67, 12288), (12288*s67, 12288, 1), torch.bfloat16) + # Topologically Sorted Source Nodes: [chunk, silu, x], Original ATen: [aten.split, aten.silu, aten.mul] + triton_poi_fused_mul_silu_split_0_xnumel = 12288*s67 + stream0 = get_raw_stream(0) + triton_poi_fused_mul_silu_split_0.run(arg1_1, buf0, triton_poi_fused_mul_silu_split_0_xnumel, stream=stream0) + del arg1_1 + return (buf0, ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = 256 + arg1_1 = rand_strided((1, 256, 24576), (6291456, 24576, 1), device='cuda:0', dtype=torch.bfloat16) + fn = lambda: call([arg0_1, arg1_1]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/torchinductor/cz/bb6645c6be31f426023ec47eef09e354ad9fa8b2d59e6e45ab49b803eb34d44e.best_config b/torchinductor/cz/bb6645c6be31f426023ec47eef09e354ad9fa8b2d59e6e45ab49b803eb34d44e.best_config new file mode 100644 index 0000000000000000000000000000000000000000..00f2a256b252b5c28ebf908e7e25905ecf7b69a4 --- /dev/null +++ b/torchinductor/cz/bb6645c6be31f426023ec47eef09e354ad9fa8b2d59e6e45ab49b803eb34d44e.best_config @@ -0,0 +1 @@ +{"XBLOCK": 512, "num_warps": 8, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 41, "triton_cache_hash": "SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA"} \ No newline at end of file diff --git a/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py b/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py new file mode 100644 index 0000000000000000000000000000000000000000..037b7f150f116a1a303491003b70151d640b4a75 --- /dev/null +++ b/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py @@ -0,0 +1,25 @@ + +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 8388608}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_clone_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 50331648}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused_clone_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): + xnumel = 8388608 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x0 = xindex + tmp0 = tl.load(in_ptr0 + (x0), None).to(tl.float32) + tl.store(out_ptr0 + (x0), tmp0, None) diff --git a/torchinductor/ei/289dba45a6462a57311a4b0e777a8c6425cf2b7e76724abdb2f263dfa285ce74.best_config b/torchinductor/ei/289dba45a6462a57311a4b0e777a8c6425cf2b7e76724abdb2f263dfa285ce74.best_config new file mode 100644 index 0000000000000000000000000000000000000000..9b2857e928afd2e15293e47e21a2ab312208306e --- /dev/null +++ b/torchinductor/ei/289dba45a6462a57311a4b0e777a8c6425cf2b7e76724abdb2f263dfa285ce74.best_config @@ -0,0 +1 @@ +{"XBLOCK": 1024, "num_warps": 4, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 42, "triton_cache_hash": "7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ"} \ No newline at end of file diff --git a/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py b/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py new file mode 100644 index 0000000000000000000000000000000000000000..ead91a76e47dbac191ed52e5f93858af9c6c6b90 --- /dev/null +++ b/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py @@ -0,0 +1,28 @@ + +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 16777216}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 56623104}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): + xnumel = 9437184 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x0 = (xindex % 128) + x1 = ((xindex // 128) % 2304) + x2 = xindex // 294912 + x3 = xindex + tmp0 = tl.load(in_ptr0 + (x0 + 128*x2 + 4096*x1), None).to(tl.float32) + tl.store(out_ptr0 + (x3), tmp0, None) diff --git a/torchinductor/fxgraph/22/f22et4hxfdbezzlil53lu2pcyq6hgd3sgpjtfwxqwvo3nhcfcvx3/iqgyrwyxmlbu22glkzz24rsbjiieww3sllxux2ttk4c6gtoiuba b/torchinductor/fxgraph/22/f22et4hxfdbezzlil53lu2pcyq6hgd3sgpjtfwxqwvo3nhcfcvx3/iqgyrwyxmlbu22glkzz24rsbjiieww3sllxux2ttk4c6gtoiuba new file mode 100644 index 0000000000000000000000000000000000000000..0e2205f612f4b99296b22b7c125d2ed420482c67 --- /dev/null +++ b/torchinductor/fxgraph/22/f22et4hxfdbezzlil53lu2pcyq6hgd3sgpjtfwxqwvo3nhcfcvx3/iqgyrwyxmlbu22glkzz24rsbjiieww3sllxux2ttk4c6gtoiuba @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:440d88db1762c34d68cb5673ae46414a104b5b725aef4bea735705e34dc8a041 +size 938428 diff --git a/torchinductor/fxgraph/53/f533iu6ruyb2fsdtxzocuyehmvcspxovsu3acdnbfpe47o3slvj3/hnzkkjjtkqsys574iwq2z2eutuefa26qvxdntknyhr2fw5tnbux b/torchinductor/fxgraph/53/f533iu6ruyb2fsdtxzocuyehmvcspxovsu3acdnbfpe47o3slvj3/hnzkkjjtkqsys574iwq2z2eutuefa26qvxdntknyhr2fw5tnbux new file mode 100644 index 0000000000000000000000000000000000000000..40a38fadf357d2b2a42bbe6d87004ac3c842811f Binary files /dev/null and b/torchinductor/fxgraph/53/f533iu6ruyb2fsdtxzocuyehmvcspxovsu3acdnbfpe47o3slvj3/hnzkkjjtkqsys574iwq2z2eutuefa26qvxdntknyhr2fw5tnbux differ diff --git a/torchinductor/fxgraph/6v/f6v2dym5xl4l4b2xlv35ic4ajld4mxcvhsvdsiwx2uug77q36cad/nhugjvtrt6cm53zizhnw673hj52m5m3kegaqvkjzkf4qh6d6rhm b/torchinductor/fxgraph/6v/f6v2dym5xl4l4b2xlv35ic4ajld4mxcvhsvdsiwx2uug77q36cad/nhugjvtrt6cm53zizhnw673hj52m5m3kegaqvkjzkf4qh6d6rhm new file mode 100644 index 0000000000000000000000000000000000000000..d2a337ea87cd19e16d55948c01b6065cb40f47c6 --- /dev/null +++ b/torchinductor/fxgraph/6v/f6v2dym5xl4l4b2xlv35ic4ajld4mxcvhsvdsiwx2uug77q36cad/nhugjvtrt6cm53zizhnw673hj52m5m3kegaqvkjzkf4qh6d6rhm @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69e864d6719f84ceef28c9db6f7f674f74ceb36a21810aa939517903f87e89d9 +size 264608 diff --git a/torchinductor/fxgraph/ae/faeqzqjo2ky5lpgrcqqhwk6jpyjxv7ptgmodvyw35laruvkwlabz/k5csppa65iixuznasdpjggbp2ltfolxpa4c5sryfyz5w6cz56xl b/torchinductor/fxgraph/ae/faeqzqjo2ky5lpgrcqqhwk6jpyjxv7ptgmodvyw35laruvkwlabz/k5csppa65iixuznasdpjggbp2ltfolxpa4c5sryfyz5w6cz56xl new file mode 100644 index 0000000000000000000000000000000000000000..df2581de9c449f550507a397a33b8e167e140dca Binary files /dev/null and b/torchinductor/fxgraph/ae/faeqzqjo2ky5lpgrcqqhwk6jpyjxv7ptgmodvyw35laruvkwlabz/k5csppa65iixuznasdpjggbp2ltfolxpa4c5sryfyz5w6cz56xl differ diff --git a/torchinductor/fxgraph/ah/fahbtdmoejcqs352pnbnedqns63nbnu6hdbrwzvf6chptnsannjh/fhpbiokcxxh7ksbfgiljcvh7erywotuv4ddvlfcb4fk2ef7dd5c b/torchinductor/fxgraph/ah/fahbtdmoejcqs352pnbnedqns63nbnu6hdbrwzvf6chptnsannjh/fhpbiokcxxh7ksbfgiljcvh7erywotuv4ddvlfcb4fk2ef7dd5c new file mode 100644 index 0000000000000000000000000000000000000000..5868ebbdbea1294f85ecab57918568a617c0f63b --- /dev/null +++ b/torchinductor/fxgraph/ah/fahbtdmoejcqs352pnbnedqns63nbnu6hdbrwzvf6chptnsannjh/fhpbiokcxxh7ksbfgiljcvh7erywotuv4ddvlfcb4fk2ef7dd5c @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29de143942bdcff5482532169154ff2471674e95e0c7559441e155a217e31f44 +size 174116 diff --git a/torchinductor/fxgraph/be/fbeg6ilh2lejqppgwjhp3rakthwkxiwywtjmfnc4ndj7bpmq6jb2/ug37hiwkgulfctc6yjfbmae23i6gakdvpvwwtn63w7z73byeze3 b/torchinductor/fxgraph/be/fbeg6ilh2lejqppgwjhp3rakthwkxiwywtjmfnc4ndj7bpmq6jb2/ug37hiwkgulfctc6yjfbmae23i6gakdvpvwwtn63w7z73byeze3 new file mode 100644 index 0000000000000000000000000000000000000000..b1e2a881de2b55d68047104b541a9e74116619e8 Binary files /dev/null and b/torchinductor/fxgraph/be/fbeg6ilh2lejqppgwjhp3rakthwkxiwywtjmfnc4ndj7bpmq6jb2/ug37hiwkgulfctc6yjfbmae23i6gakdvpvwwtn63w7z73byeze3 differ diff --git a/torchinductor/fxgraph/bn/fbnlruhvmagcngqd5is2xjbucjaq7uf3sgsbdahfi6ovtehbhzyo/62gizmmmqz43cclymnr7ftyo5qt7ux4og6bc2xazrwstkjpsy2e b/torchinductor/fxgraph/bn/fbnlruhvmagcngqd5is2xjbucjaq7uf3sgsbdahfi6ovtehbhzyo/62gizmmmqz43cclymnr7ftyo5qt7ux4og6bc2xazrwstkjpsy2e new file mode 100644 index 0000000000000000000000000000000000000000..5e07ad494683e012c6f3b650f3211d839bae6bab --- /dev/null +++ b/torchinductor/fxgraph/bn/fbnlruhvmagcngqd5is2xjbucjaq7uf3sgsbdahfi6ovtehbhzyo/62gizmmmqz43cclymnr7ftyo5qt7ux4og6bc2xazrwstkjpsy2e @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f68c8cb18c8679b109786363f2cf0eec27fa5f8e37822d5c198da53525f2c689 +size 3410265 diff --git a/torchinductor/fxgraph/bp/fbppvpvvibf6dmfslxtlr7rupsubjfcpto27skj4uv6z3rv72yqu/2yq6yplsvvilpuklnah5vfnufjdanewtyezrjycco3vxvnedvyc b/torchinductor/fxgraph/bp/fbppvpvvibf6dmfslxtlr7rupsubjfcpto27skj4uv6z3rv72yqu/2yq6yplsvvilpuklnah5vfnufjdanewtyezrjycco3vxvnedvyc new file mode 100644 index 0000000000000000000000000000000000000000..69838d1570a505e8ce653c7302cfb506f5e49550 Binary files /dev/null and b/torchinductor/fxgraph/bp/fbppvpvvibf6dmfslxtlr7rupsubjfcpto27skj4uv6z3rv72yqu/2yq6yplsvvilpuklnah5vfnufjdanewtyezrjycco3vxvnedvyc differ diff --git a/torchinductor/fxgraph/bt/fbtdosmtnvserozccxcnybd6tgd6ofwg2uxsp2mledlcicmt4qzk/wmthsneohmxumg7xfzujsqbo2fome7du5pzabiw6wmu4uoutbxc b/torchinductor/fxgraph/bt/fbtdosmtnvserozccxcnybd6tgd6ofwg2uxsp2mledlcicmt4qzk/wmthsneohmxumg7xfzujsqbo2fome7du5pzabiw6wmu4uoutbxc new file mode 100644 index 0000000000000000000000000000000000000000..632d6dfeab8f3fea047166214420d1f2cc21b144 Binary files /dev/null and b/torchinductor/fxgraph/bt/fbtdosmtnvserozccxcnybd6tgd6ofwg2uxsp2mledlcicmt4qzk/wmthsneohmxumg7xfzujsqbo2fome7du5pzabiw6wmu4uoutbxc differ diff --git a/torchinductor/fxgraph/cz/fczfijqqjco22ccfvbme7svhnvhj4do2m5f62tmd57h26f3hol7q/ysvactlqfhnxnqe57jpw2dhsxtia64lerdmzkfbsydfsf6ixr3m b/torchinductor/fxgraph/cz/fczfijqqjco22ccfvbme7svhnvhj4do2m5f62tmd57h26f3hol7q/ysvactlqfhnxnqe57jpw2dhsxtia64lerdmzkfbsydfsf6ixr3m new file mode 100644 index 0000000000000000000000000000000000000000..57efb12048afabcef85599b65a2b2c26fde76ada Binary files /dev/null and b/torchinductor/fxgraph/cz/fczfijqqjco22ccfvbme7svhnvhj4do2m5f62tmd57h26f3hol7q/ysvactlqfhnxnqe57jpw2dhsxtia64lerdmzkfbsydfsf6ixr3m differ diff --git a/torchinductor/fxgraph/dx/fdxnindsdqgodotc5a4cqrqpz34ijsi7rtays3feytoj4ko6orse/vantarodghbm737o5fovpz4x6sv5auvy47o366nk7ihggags3we b/torchinductor/fxgraph/dx/fdxnindsdqgodotc5a4cqrqpz34ijsi7rtays3feytoj4ko6orse/vantarodghbm737o5fovpz4x6sv5auvy47o366nk7ihggags3we new file mode 100644 index 0000000000000000000000000000000000000000..1b81067c9ec7e3f0c81cdf82aecd4675c710626a Binary files /dev/null and b/torchinductor/fxgraph/dx/fdxnindsdqgodotc5a4cqrqpz34ijsi7rtays3feytoj4ko6orse/vantarodghbm737o5fovpz4x6sv5auvy47o366nk7ihggags3we differ diff --git a/torchinductor/fxgraph/e2/fe2tjoiexjbavh5sakfaxvga43vsvwn5ev5bzhfjg76jvmtjqtbn/ejg3u4qymaxsvvl2vdequli7pwsrdjf5zdgqjkgbrxdsvgfv3h4 b/torchinductor/fxgraph/e2/fe2tjoiexjbavh5sakfaxvga43vsvwn5ev5bzhfjg76jvmtjqtbn/ejg3u4qymaxsvvl2vdequli7pwsrdjf5zdgqjkgbrxdsvgfv3h4 new file mode 100644 index 0000000000000000000000000000000000000000..5cc300fa0cf99c368a0f9e44626e1b8da195cbcf --- /dev/null +++ b/torchinductor/fxgraph/e2/fe2tjoiexjbavh5sakfaxvga43vsvwn5ev5bzhfjg76jvmtjqtbn/ejg3u4qymaxsvvl2vdequli7pwsrdjf5zdgqjkgbrxdsvgfv3h4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:224dba7218602f2ad57aa8c90a2d1f7da511a4bdc8cd04a8c18dc72a98b5d9f8 +size 563115 diff --git a/torchinductor/fxgraph/fr/ffrx7clryowwzulnhruopihutvaxlycymqopsyoha6yecifyw2m2/g3wh462wylribiwz4th3gnlt5rtnrcb5bkad6w3yucxodv3q5ks b/torchinductor/fxgraph/fr/ffrx7clryowwzulnhruopihutvaxlycymqopsyoha6yecifyw2m2/g3wh462wylribiwz4th3gnlt5rtnrcb5bkad6w3yucxodv3q5ks new file mode 100644 index 0000000000000000000000000000000000000000..b1e00fa1e977528213190d06d2d90a7e77b46d8b --- /dev/null +++ b/torchinductor/fxgraph/fr/ffrx7clryowwzulnhruopihutvaxlycymqopsyoha6yecifyw2m2/g3wh462wylribiwz4th3gnlt5rtnrcb5bkad6w3yucxodv3q5ks @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36ec7e7b56c2e280a2d9e4cfb33573ec66d8883d0a803f5b78a0aee1d770eaa4 +size 2958304 diff --git a/torchinductor/fxgraph/gl/fglggif4cconx3mjhni5xdqsapr365j6lcigwntolt36ggvy6xoa/lldkvrm3bfx4l4wyn6ric67cviyotwhuua4w7xtue3ki6zihdme b/torchinductor/fxgraph/gl/fglggif4cconx3mjhni5xdqsapr365j6lcigwntolt36ggvy6xoa/lldkvrm3bfx4l4wyn6ric67cviyotwhuua4w7xtue3ki6zihdme new file mode 100644 index 0000000000000000000000000000000000000000..ebe07680a5bda8ebe71415af07da7cfe3e9e5cdf Binary files /dev/null and b/torchinductor/fxgraph/gl/fglggif4cconx3mjhni5xdqsapr365j6lcigwntolt36ggvy6xoa/lldkvrm3bfx4l4wyn6ric67cviyotwhuua4w7xtue3ki6zihdme differ diff --git a/torchinductor/fxgraph/gu/fguzn264ahmz2nwbeukdevusbfubyrjhr676ynmcbas4v5s55wdq/tkigttpzup32xmjthoydmqv2wmbdw54qhikri52mbatcy6pdgfn b/torchinductor/fxgraph/gu/fguzn264ahmz2nwbeukdevusbfubyrjhr676ynmcbas4v5s55wdq/tkigttpzup32xmjthoydmqv2wmbdw54qhikri52mbatcy6pdgfn new file mode 100644 index 0000000000000000000000000000000000000000..070460ccc17ab89c37126471d606733b7b721b20 Binary files /dev/null and b/torchinductor/fxgraph/gu/fguzn264ahmz2nwbeukdevusbfubyrjhr676ynmcbas4v5s55wdq/tkigttpzup32xmjthoydmqv2wmbdw54qhikri52mbatcy6pdgfn differ diff --git a/torchinductor/fxgraph/gw/fgwoasterkg46eom5rlkmnxomajflnxlkp6lj2kuc6echyghg5bu/rlivt46ououdjetwpsd2yu3fw57nwakjhkk5y5jpf23qnhfyqhf b/torchinductor/fxgraph/gw/fgwoasterkg46eom5rlkmnxomajflnxlkp6lj2kuc6echyghg5bu/rlivt46ououdjetwpsd2yu3fw57nwakjhkk5y5jpf23qnhfyqhf new file mode 100644 index 0000000000000000000000000000000000000000..5b499e4cb925ea6a0ce22b810c01095d249a3c78 Binary files /dev/null and b/torchinductor/fxgraph/gw/fgwoasterkg46eom5rlkmnxomajflnxlkp6lj2kuc6echyghg5bu/rlivt46ououdjetwpsd2yu3fw57nwakjhkk5y5jpf23qnhfyqhf differ diff --git a/torchinductor/fxgraph/gz/fgzy6konies5s7bd2elbil2jcj7zkgvoixpugnguncnrarcnahdq/kwskkw5g6u7ncnmnxj2mbgxghzm3bgayjlxguhamtpzc6earoeu b/torchinductor/fxgraph/gz/fgzy6konies5s7bd2elbil2jcj7zkgvoixpugnguncnrarcnahdq/kwskkw5g6u7ncnmnxj2mbgxghzm3bgayjlxguhamtpzc6earoeu new file mode 100644 index 0000000000000000000000000000000000000000..c74836742af2cbb7a6352125234617132230ef83 Binary files /dev/null and b/torchinductor/fxgraph/gz/fgzy6konies5s7bd2elbil2jcj7zkgvoixpugnguncnrarcnahdq/kwskkw5g6u7ncnmnxj2mbgxghzm3bgayjlxguhamtpzc6earoeu differ diff --git a/torchinductor/fxgraph/ji/fjiy5thw24pernaeujhzsyccltrmsf63h2zi7ew3s3fjuoxo4q3d/xrye5rdod7ebchmrpuw4t3cg5he7cgyiz3u3pztqzvylr4vlhl3 b/torchinductor/fxgraph/ji/fjiy5thw24pernaeujhzsyccltrmsf63h2zi7ew3s3fjuoxo4q3d/xrye5rdod7ebchmrpuw4t3cg5he7cgyiz3u3pztqzvylr4vlhl3 new file mode 100644 index 0000000000000000000000000000000000000000..5b8c0bced4d61ed7b530ee1d814775bc4b7ffc48 Binary files /dev/null and b/torchinductor/fxgraph/ji/fjiy5thw24pernaeujhzsyccltrmsf63h2zi7ew3s3fjuoxo4q3d/xrye5rdod7ebchmrpuw4t3cg5he7cgyiz3u3pztqzvylr4vlhl3 differ diff --git a/torchinductor/fxgraph/k6/fk6cfyjfeiu7xe6ebkapsnixuplqczgfc5534mitqsfkssbzjyak/4xid4w6sg2yg7xaseouf2vwhp2fyff56a2t6z6ownb3yw3g25rk b/torchinductor/fxgraph/k6/fk6cfyjfeiu7xe6ebkapsnixuplqczgfc5534mitqsfkssbzjyak/4xid4w6sg2yg7xaseouf2vwhp2fyff56a2t6z6ownb3yw3g25rk new file mode 100644 index 0000000000000000000000000000000000000000..466573cabf40c8fd52d28cceeb01ac364bc6d9ab --- /dev/null +++ b/torchinductor/fxgraph/k6/fk6cfyjfeiu7xe6ebkapsnixuplqczgfc5534mitqsfkssbzjyak/4xid4w6sg2yg7xaseouf2vwhp2fyff56a2t6z6ownb3yw3g25rk @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5d03e5bd236b06fdc1223a85d56c77e8b8297be06a7ecf9d668778b6cdaec55 +size 333975 diff --git a/torchinductor/fxgraph/kj/fkjh2kykxecmnv6oe3zzwtjpek77nmrm35vgv2daxfgkim6xfk4u/gigbnwpmixz5epksvvrh4mtg3nxlpy724eojb3ajzvtepgvx7y4 b/torchinductor/fxgraph/kj/fkjh2kykxecmnv6oe3zzwtjpek77nmrm35vgv2daxfgkim6xfk4u/gigbnwpmixz5epksvvrh4mtg3nxlpy724eojb3ajzvtepgvx7y4 new file mode 100644 index 0000000000000000000000000000000000000000..8a4cbdd00bc66ae731c68bfe418b5637f4f60fd1 --- /dev/null +++ b/torchinductor/fxgraph/kj/fkjh2kykxecmnv6oe3zzwtjpek77nmrm35vgv2daxfgkim6xfk4u/gigbnwpmixz5epksvvrh4mtg3nxlpy724eojb3ajzvtepgvx7y4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:320c16d9ec45f3d23d52ad627e3266db6eb7e3fae11c90ec09cd66479ab7fe38 +size 205255 diff --git a/torchinductor/fxgraph/n6/fn6x7m44e35jdmh6iqj3eqiyrz7tbhzd3rqartt67myyrnickjmp/um3sgirsxogup4murdiaoy7dxu4ogolqsa343kh56kq24zd53fb b/torchinductor/fxgraph/n6/fn6x7m44e35jdmh6iqj3eqiyrz7tbhzd3rqartt67myyrnickjmp/um3sgirsxogup4murdiaoy7dxu4ogolqsa343kh56kq24zd53fb new file mode 100644 index 0000000000000000000000000000000000000000..132619ad9b2fd103025e96232885c65b686fa2d5 --- /dev/null +++ b/torchinductor/fxgraph/n6/fn6x7m44e35jdmh6iqj3eqiyrz7tbhzd3rqartt67myyrnickjmp/um3sgirsxogup4murdiaoy7dxu4ogolqsa343kh56kq24zd53fb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a337232232bb8d47f19488d00763e3bd38e339709037cda8fdf2a1ae647dd943 +size 680412 diff --git a/torchinductor/fxgraph/te/fte5y7bccssideiluepvpscj6srf7orxnfgql6to32ni27zf2uv2/vmrpdvw3meiqsf22oras6imorrybogkgp6jjr3ddtreaiuutais b/torchinductor/fxgraph/te/fte5y7bccssideiluepvpscj6srf7orxnfgql6to32ni27zf2uv2/vmrpdvw3meiqsf22oras6imorrybogkgp6jjr3ddtreaiuutais new file mode 100644 index 0000000000000000000000000000000000000000..17dbb2435826af317d46344bc3983dd025e7fbeb --- /dev/null +++ b/torchinductor/fxgraph/te/fte5y7bccssideiluepvpscj6srf7orxnfgql6to32ni27zf2uv2/vmrpdvw3meiqsf22oras6imorrybogkgp6jjr3ddtreaiuutais @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab22f1d6db611109175a74412f218e8c701719467f9298ec639c480452930224 +size 205639 diff --git a/torchinductor/fxgraph/tz/ftzd5ordyehsowurkwjjpkso24gayhyplcd6wz7xdv53fad276l6/36luuy7klcmb7554z63umrezysn6xbat5wxfaadxh4clxhcn2j7 b/torchinductor/fxgraph/tz/ftzd5ordyehsowurkwjjpkso24gayhyplcd6wz7xdv53fad276l6/36luuy7klcmb7554z63umrezysn6xbat5wxfaadxh4clxhcn2j7 new file mode 100644 index 0000000000000000000000000000000000000000..322a4378fd8ab9504083cc49c4ddd315fcdd0523 --- /dev/null +++ b/torchinductor/fxgraph/tz/ftzd5ordyehsowurkwjjpkso24gayhyplcd6wz7xdv53fad276l6/36luuy7klcmb7554z63umrezysn6xbat5wxfaadxh4clxhcn2j7 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df974a63ea58981ff7bccfb7464499c49beb8413edae5000773f04bb9c4dd27f +size 259431 diff --git a/torchinductor/fxgraph/u4/fu47dchf76mmiajgnawm3xgek4ysnnmqaavupgy4cddyxygid6iq/725pjrxppjygb6kbca6zklwmn7iunv65thw23b5s4im6zi27j3i b/torchinductor/fxgraph/u4/fu47dchf76mmiajgnawm3xgek4ysnnmqaavupgy4cddyxygid6iq/725pjrxppjygb6kbca6zklwmn7iunv65thw23b5s4im6zi27j3i new file mode 100644 index 0000000000000000000000000000000000000000..da88a3f349ecadc5f4ff97bd5b99f19f443bd557 --- /dev/null +++ b/torchinductor/fxgraph/u4/fu47dchf76mmiajgnawm3xgek4ysnnmqaavupgy4cddyxygid6iq/725pjrxppjygb6kbca6zklwmn7iunv65thw23b5s4im6zi27j3i @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:febaf4c6ef7a7060f941103d952ecc6fd146d7dd99edad87b2e219eca35f4ed0 +size 498357 diff --git a/torchinductor/fxgraph/uy/fuygegwmldon4qz3wvjs3cld4hnjz6yxh6aa2cmfsal4u3xxws43/l4w2mroymid3qdffvzt4wffavpm5it6rzi6lmbvxzzezfkbavuo b/torchinductor/fxgraph/uy/fuygegwmldon4qz3wvjs3cld4hnjz6yxh6aa2cmfsal4u3xxws43/l4w2mroymid3qdffvzt4wffavpm5it6rzi6lmbvxzzezfkbavuo new file mode 100644 index 0000000000000000000000000000000000000000..372e7c807b708faadc27d59dd4e223b2ba9cf83d --- /dev/null +++ b/torchinductor/fxgraph/uy/fuygegwmldon4qz3wvjs3cld4hnjz6yxh6aa2cmfsal4u3xxws43/l4w2mroymid3qdffvzt4wffavpm5it6rzi6lmbvxzzezfkbavuo @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f2da645d86207b80ca5ae67cb14a0abd9d44fd1ca3cb606b7ce4992a820ad1d +size 174161 diff --git a/torchinductor/fxgraph/w5/fw5vzdkweh3kv3fm3mnal4wu63gxhw2anwx2pzuved4acfz4fdzm/n5dfreyro3slkufuydw2d54nm7bfiskxjjasoyg2z5yept5c3rf b/torchinductor/fxgraph/w5/fw5vzdkweh3kv3fm3mnal4wu63gxhw2anwx2pzuved4acfz4fdzm/n5dfreyro3slkufuydw2d54nm7bfiskxjjasoyg2z5yept5c3rf new file mode 100644 index 0000000000000000000000000000000000000000..f8d3a44d8909452ae95e74de5fba7ffe05822f3b --- /dev/null +++ b/torchinductor/fxgraph/w5/fw5vzdkweh3kv3fm3mnal4wu63gxhw2anwx2pzuved4acfz4fdzm/n5dfreyro3slkufuydw2d54nm7bfiskxjjasoyg2z5yept5c3rf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f4658931176e4b550b4c0eda1f78d67c25449574a412760dacf7047cfa2dc4b +size 128456 diff --git a/torchinductor/g3/cg3nh42xzsnf5ms7nmsutyrbxuujlzqikaduzqov5wb5mzikcdj6.py b/torchinductor/g3/cg3nh42xzsnf5ms7nmsutyrbxuujlzqikaduzqov5wb5mzikcdj6.py new file mode 100644 index 0000000000000000000000000000000000000000..22721c7f7dd98857316528b9462eee997164cd80 --- /dev/null +++ b/torchinductor/g3/cg3nh42xzsnf5ms7nmsutyrbxuujlzqikaduzqov5wb5mzikcdj6.py @@ -0,0 +1,67 @@ +# AOT ID: ['7_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + arg0_1, = args + args.clear() + assert_size_stride(arg0_1, (1, 256, 12288), (3145728, 12288, 1)) + return (reinterpret_tensor(arg0_1, (1, 256, 4096), (3145728, 12288, 1), 0), reinterpret_tensor(arg0_1, (1, 256, 4096), (3145728, 12288, 1), 4096), reinterpret_tensor(arg0_1, (1, 256, 4096), (3145728, 12288, 1), 8192), ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = rand_strided((1, 256, 12288), (3145728, 12288, 1), device='cuda:0', dtype=torch.bfloat16) + fn = lambda: call([arg0_1]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/torchinductor/gq/cgqvtct76tcr46zepwtpzklusornjsauaa2yqljz4nersnfxryln.py b/torchinductor/gq/cgqvtct76tcr46zepwtpzklusornjsauaa2yqljz4nersnfxryln.py new file mode 100644 index 0000000000000000000000000000000000000000..7914008e7b95f2c23fbb8d37c5661b6953b0beb5 --- /dev/null +++ b/torchinductor/gq/cgqvtct76tcr46zepwtpzklusornjsauaa2yqljz4nersnfxryln.py @@ -0,0 +1,129 @@ +# AOT ID: ['29_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels +import triton +import triton.language as tl +from torch._inductor.runtime.triton_heuristics import start_graph, end_graph +from torch._C import _cuda_getCurrentRawStream as get_raw_stream + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py +# Topologically Sorted Source Nodes: [mul, hidden_states], Original ATen: [aten.mul, aten.add] +# Source node to ATen node mapping: +# hidden_states => add +# mul => mul +# Graph fragment: +# %arg2_1 : Tensor "bf16[1, 2304, 4096][9437184, 4096, 1]cuda:0" = PlaceHolder[target=arg2_1] +# %arg0_1 : Tensor "bf16[1, 1, 4096][12288, 12288, 1]cuda:0" = PlaceHolder[target=arg0_1] +# %arg1_1 : Tensor "bf16[1, 2304, 4096][9437184, 4096, 1]cuda:0" = PlaceHolder[target=arg1_1] +# %mul : Tensor "bf16[1, 2304, 4096][9437184, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%arg0_1, %arg1_1), kwargs = {}) +# %add : Tensor "bf16[1, 2304, 4096][9437184, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg2_1, %mul), kwargs = {}) +# return %add +triton_poi_fused_add_mul_0 = async_compile.triton('triton_poi_fused_add_mul_0', ''' +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 16777216}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_mul_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 3, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 75505664}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused_add_mul_0(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr): + xnumel = 9437184 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x2 = xindex + x0 = (xindex % 4096) + tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) + tmp1 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last').to(tl.float32) + tmp2 = tl.load(in_ptr2 + (x2), None).to(tl.float32) + tmp3 = tmp1 * tmp2 + tmp4 = tmp0 + tmp3 + tl.store(out_ptr0 + (x2), tmp4, None) +''', device_str='cuda') + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + arg0_1, arg1_1, arg2_1 = args + args.clear() + assert_size_stride(arg0_1, (1, 1, 4096), (12288, 12288, 1)) + assert_size_stride(arg1_1, (1, 2304, 4096), (9437184, 4096, 1)) + assert_size_stride(arg2_1, (1, 2304, 4096), (9437184, 4096, 1)) + with torch.cuda._DeviceGuard(0): + torch.cuda.set_device(0) + buf0 = empty_strided_cuda((1, 2304, 4096), (9437184, 4096, 1), torch.bfloat16) + # Topologically Sorted Source Nodes: [mul, hidden_states], Original ATen: [aten.mul, aten.add] + stream0 = get_raw_stream(0) + triton_poi_fused_add_mul_0.run(arg2_1, arg0_1, arg1_1, buf0, 9437184, stream=stream0) + del arg0_1 + del arg1_1 + del arg2_1 + return (buf0, ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = rand_strided((1, 1, 4096), (12288, 12288, 1), device='cuda:0', dtype=torch.bfloat16) + arg1_1 = rand_strided((1, 2304, 4096), (9437184, 4096, 1), device='cuda:0', dtype=torch.bfloat16) + arg2_1 = rand_strided((1, 2304, 4096), (9437184, 4096, 1), device='cuda:0', dtype=torch.bfloat16) + fn = lambda: call([arg0_1, arg1_1, arg2_1]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py b/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py new file mode 100644 index 0000000000000000000000000000000000000000..f21f0db40715806b347caf193f7b01984a76025b --- /dev/null +++ b/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py @@ -0,0 +1,28 @@ + +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 16777216}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_clone_permute_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 56623104}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused_clone_permute_2(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): + xnumel = 9437184 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x0 = (xindex % 128) + x1 = ((xindex // 128) % 32) + x2 = xindex // 4096 + x3 = xindex + tmp0 = tl.load(in_ptr0 + (x0 + 128*x2 + 294912*x1), None).to(tl.float32) + tl.store(out_ptr0 + (x3), tmp0, None) diff --git a/torchinductor/j4/f4615e42c3266e11c9ea4b5f133496e24bb48471321801b15c4cd2969aeb5f72.best_config b/torchinductor/j4/f4615e42c3266e11c9ea4b5f133496e24bb48471321801b15c4cd2969aeb5f72.best_config new file mode 100644 index 0000000000000000000000000000000000000000..dba0954936470862c616d97fdae31ec321ffffe2 --- /dev/null +++ b/torchinductor/j4/f4615e42c3266e11c9ea4b5f133496e24bb48471321801b15c4cd2969aeb5f72.best_config @@ -0,0 +1 @@ +{"XBLOCK": 1024, "num_warps": 4, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 43, "triton_cache_hash": "LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ"} \ No newline at end of file diff --git a/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py b/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py new file mode 100644 index 0000000000000000000000000000000000000000..992a95b38fa088fd235ef759bf166e0b1f20cfe2 --- /dev/null +++ b/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py @@ -0,0 +1,64 @@ + +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 16777216}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*bf16', 'out_ptr1': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 8, 'num_store': 2, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 115605504}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr): + xnumel = 9437184 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x3 = xindex + x0 = (xindex % 128) + x2 = xindex // 4096 + x4 = xindex // 128 + tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32) + tmp2 = tl.load(in_ptr1 + (x0 + 128*x2), None, eviction_policy='evict_last') + tmp19 = tl.load(in_ptr2 + (x0 + 128*x2), None, eviction_policy='evict_last') + tmp23 = tl.load(in_ptr3 + (x3), None).to(tl.float32) + tmp1 = tmp0.to(tl.float32) + tmp3 = tmp1 * tmp2 + tmp4 = (x3 % 2) + tmp5 = tl.full([1], 0, tl.int64) + tmp6 = tmp4 >= tmp5 + tmp7 = tl.full([1], 1, tl.int64) + tmp8 = tmp4 < tmp7 + tmp9 = tl.load(in_ptr0 + (1 + 2*(x0 // 2) + 128*x4), tmp8, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp10 = -tmp9 + tmp11 = tl.full(tmp10.shape, 0.0, tmp10.dtype) + tmp12 = tl.where(tmp8, tmp10, tmp11) + tmp13 = tmp4 >= tmp7 + tmp14 = tl.full([1], 2, tl.int64) + tmp15 = tmp4 < tmp14 + tmp16 = tl.load(in_ptr0 + (2*(x0 // 2) + 128*x4), tmp13, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp17 = tl.where(tmp8, tmp12, tmp16) + tmp18 = tmp17.to(tl.float32) + tmp20 = tmp18 * tmp19 + tmp21 = tmp3 + tmp20 + tmp22 = tmp21.to(tl.float32) + tmp24 = tmp23.to(tl.float32) + tmp25 = tmp24 * tmp2 + tmp26 = tl.load(in_ptr3 + (1 + 2*(x0 // 2) + 128*x4), tmp8, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp27 = -tmp26 + tmp28 = tl.full(tmp27.shape, 0.0, tmp27.dtype) + tmp29 = tl.where(tmp8, tmp27, tmp28) + tmp30 = tl.load(in_ptr3 + (2*(x0 // 2) + 128*x4), tmp13, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp31 = tl.where(tmp8, tmp29, tmp30) + tmp32 = tmp31.to(tl.float32) + tmp33 = tmp32 * tmp19 + tmp34 = tmp25 + tmp33 + tmp35 = tmp34.to(tl.float32) + tl.store(out_ptr0 + (x3), tmp22, None) + tl.store(out_ptr1 + (x3), tmp35, None) diff --git a/torchinductor/j6/fc31ff8ea0e88de49e017c5dfd904cd4aaecfaa99b40a6e7893ff786df367c1f.best_config b/torchinductor/j6/fc31ff8ea0e88de49e017c5dfd904cd4aaecfaa99b40a6e7893ff786df367c1f.best_config new file mode 100644 index 0000000000000000000000000000000000000000..cd6cae08e89399ea871f8bc0c275150fb3afa4d6 --- /dev/null +++ b/torchinductor/j6/fc31ff8ea0e88de49e017c5dfd904cd4aaecfaa99b40a6e7893ff786df367c1f.best_config @@ -0,0 +1 @@ +{"XBLOCK": 1024, "num_warps": 4, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 51, "triton_cache_hash": "PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ"} \ No newline at end of file diff --git a/torchinductor/jb/cjbidoaebunoc2ogxudwkej7mv2qtrm646yq6aqkxnupjaa2gjpm.py b/torchinductor/jb/cjbidoaebunoc2ogxudwkej7mv2qtrm646yq6aqkxnupjaa2gjpm.py new file mode 100644 index 0000000000000000000000000000000000000000..cc7ec964475e2e8913a5eba64af989d6dca4fd1b --- /dev/null +++ b/torchinductor/jb/cjbidoaebunoc2ogxudwkej7mv2qtrm646yq6aqkxnupjaa2gjpm.py @@ -0,0 +1,69 @@ +# AOT ID: ['24_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + arg0_1, arg1_1 = args + args.clear() + assert_size_stride(arg0_1, (4096, 36864), (1, 4096)) + assert_size_stride(arg1_1, (1, 1), (1, 1)) + return (aten.view.dtype(reinterpret_tensor(arg0_1, (36864, 4096), (4096, 1), 0), torch.uint8), reinterpret_tensor(arg1_1, (1, ), (1, ), 0), ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = rand_strided((4096, 36864), (1, 4096), device='cuda:0', dtype=torch.float8_e4m3fn) + arg1_1 = rand_strided((1, 1), (1, 1), device='cuda:0', dtype=torch.float32) + fn = lambda: call([arg0_1, arg1_1]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/torchinductor/l6/cl6bgn7zrhznjdcy5hbh2jf4gtl5wugmqgueeot2jxpfdygccy32.py b/torchinductor/l6/cl6bgn7zrhznjdcy5hbh2jf4gtl5wugmqgueeot2jxpfdygccy32.py new file mode 100644 index 0000000000000000000000000000000000000000..0b955216b6ee1a9fe8eef5addaef9450d0f023c4 --- /dev/null +++ b/torchinductor/l6/cl6bgn7zrhznjdcy5hbh2jf4gtl5wugmqgueeot2jxpfdygccy32.py @@ -0,0 +1,69 @@ +# AOT ID: ['18_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + arg0_1, arg1_1 = args + args.clear() + s3 = arg0_1 + s52 = arg1_1 + return (s3*s52, s3, ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = 12288 + arg1_1 = 2048 + fn = lambda: call([arg0_1, arg1_1]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/torchinductor/lm/clmvm4rjx26qkbhmxge35no7qn6s4jcf5wpuzu4xdj2bl5fufzn5.py b/torchinductor/lm/clmvm4rjx26qkbhmxge35no7qn6s4jcf5wpuzu4xdj2bl5fufzn5.py new file mode 100644 index 0000000000000000000000000000000000000000..21c5c36f25ad59c064e9c28d2bbe36bac7032a9b --- /dev/null +++ b/torchinductor/lm/clmvm4rjx26qkbhmxge35no7qn6s4jcf5wpuzu4xdj2bl5fufzn5.py @@ -0,0 +1,66 @@ +# AOT ID: ['3_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + with torch.cuda._DeviceGuard(0): + torch.cuda.set_device(0) + buf0 = empty_strided_cuda((4194304, ), (1, ), torch.uint8) + return (buf0, ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + fn = lambda: call([]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py b/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py new file mode 100644 index 0000000000000000000000000000000000000000..d92d5d0cc033c61120e42219ad20e6f3844e53d1 --- /dev/null +++ b/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py @@ -0,0 +1,37 @@ + +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 16777216}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_cat_view_4', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 2, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 75497472}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused_cat_view_4(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr): + xnumel = 9437184 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x1 = xindex // 4096 + x0 = (xindex % 4096) + x2 = xindex + tmp0 = x1 + tmp1 = tl.full([1], 0, tl.int64) + tmp2 = tmp0 >= tmp1 + tmp3 = tl.full([1], 256, tl.int64) + tmp4 = tmp0 < tmp3 + tmp5 = tl.load(in_ptr0 + (x0 + 12288*(x1)), tmp4, other=0.0).to(tl.float32) + tmp6 = tmp0 >= tmp3 + tmp7 = tl.full([1], 2304, tl.int64) + tmp8 = tmp0 < tmp7 + tmp9 = tl.load(in_ptr1 + (x0 + 12288*((-256) + x1)), tmp6, other=0.0).to(tl.float32) + tmp10 = tl.where(tmp4, tmp5, tmp9) + tl.store(out_ptr0 + (x2), tmp10, None) diff --git a/torchinductor/lp/e492dcb4532db2d7228670fc17c874efe5ebefa061173fe11c857897719b4f8a.best_config b/torchinductor/lp/e492dcb4532db2d7228670fc17c874efe5ebefa061173fe11c857897719b4f8a.best_config new file mode 100644 index 0000000000000000000000000000000000000000..a0851c3c0a82173d9ff3ae08fb6c10a2d0294257 --- /dev/null +++ b/torchinductor/lp/e492dcb4532db2d7228670fc17c874efe5ebefa061173fe11c857897719b4f8a.best_config @@ -0,0 +1 @@ +{"XBLOCK": 512, "num_warps": 8, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 42, "triton_cache_hash": "P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ"} \ No newline at end of file diff --git a/torchinductor/m7/cm7pu7u7om2jn3kzchwei42hx5yogcrhxykjyf2psczxqpq6pqgn.py b/torchinductor/m7/cm7pu7u7om2jn3kzchwei42hx5yogcrhxykjyf2psczxqpq6pqgn.py new file mode 100644 index 0000000000000000000000000000000000000000..81545237becae7bc96a78611831d49204aa9669c --- /dev/null +++ b/torchinductor/m7/cm7pu7u7om2jn3kzchwei42hx5yogcrhxykjyf2psczxqpq6pqgn.py @@ -0,0 +1,114 @@ +# AOT ID: ['13_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels +import triton +import triton.language as tl +from torch._inductor.runtime.triton_heuristics import start_graph, end_graph +from torch._C import _cuda_getCurrentRawStream as get_raw_stream + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py +# Topologically Sorted Source Nodes: [hidden_states], Original ATen: [aten.clone] +# Source node to ATen node mapping: +# hidden_states => clone +# Graph fragment: +# %arg0_1 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0" = PlaceHolder[target=arg0_1] +# %clone : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%arg0_1,), kwargs = {}) +# return %clone +triton_poi_fused_clone_0 = async_compile.triton('triton_poi_fused_clone_0', ''' +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 8388608}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_clone_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 50331648}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused_clone_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): + xnumel = 8388608 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x0 = xindex + tmp0 = tl.load(in_ptr0 + (x0), None).to(tl.float32) + tl.store(out_ptr0 + (x0), tmp0, None) +''', device_str='cuda') + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + arg0_1, = args + args.clear() + assert_size_stride(arg0_1, (1, 2048, 4096), (8388608, 4096, 1)) + with torch.cuda._DeviceGuard(0): + torch.cuda.set_device(0) + buf0 = empty_strided_cuda((1, 2048, 4096), (8388608, 4096, 1), torch.bfloat16) + # Topologically Sorted Source Nodes: [hidden_states], Original ATen: [aten.clone] + stream0 = get_raw_stream(0) + triton_poi_fused_clone_0.run(arg0_1, buf0, 8388608, stream=stream0) + del arg0_1 + return (buf0, ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = rand_strided((1, 2048, 4096), (8388608, 4096, 1), device='cuda:0', dtype=torch.bfloat16) + fn = lambda: call([arg0_1]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/torchinductor/mm/cmm7cgnyfzmeqiuyvf2c3y53lnswvir5imajipe4ngulpefdv4cb.py b/torchinductor/mm/cmm7cgnyfzmeqiuyvf2c3y53lnswvir5imajipe4ngulpefdv4cb.py new file mode 100644 index 0000000000000000000000000000000000000000..6b0c94ad2ab2866a454a91c2a0f800016c4f4ed1 --- /dev/null +++ b/torchinductor/mm/cmm7cgnyfzmeqiuyvf2c3y53lnswvir5imajipe4ngulpefdv4cb.py @@ -0,0 +1,69 @@ +# AOT ID: ['12_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + arg0_1, arg1_1 = args + args.clear() + assert_size_stride(arg0_1, (4096, 4096), (1, 4096)) + assert_size_stride(arg1_1, (1, 1), (1, 1)) + return (aten.view.dtype(reinterpret_tensor(arg0_1, (4096, 4096), (4096, 1), 0), torch.uint8), reinterpret_tensor(arg1_1, (1, ), (1, ), 0), ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = rand_strided((4096, 4096), (1, 4096), device='cuda:0', dtype=torch.float8_e4m3fn) + arg1_1 = rand_strided((1, 1), (1, 1), device='cuda:0', dtype=torch.float32) + fn = lambda: call([arg0_1, arg1_1]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/torchinductor/nm/cnmrfsgnv5zogcysnfemjfhv46bzzfsonihlzpnx6hdtwdd5t3pl.py b/torchinductor/nm/cnmrfsgnv5zogcysnfemjfhv46bzzfsonihlzpnx6hdtwdd5t3pl.py new file mode 100644 index 0000000000000000000000000000000000000000..4beab6ba6cd9510e88c36f2510bdfd5565ea494f --- /dev/null +++ b/torchinductor/nm/cnmrfsgnv5zogcysnfemjfhv46bzzfsonihlzpnx6hdtwdd5t3pl.py @@ -0,0 +1,184 @@ +# AOT ID: ['23_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels +import triton +import triton.language as tl +from torch._inductor.runtime.triton_heuristics import start_graph, end_graph +from torch._C import _cuda_getCurrentRawStream as get_raw_stream + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py +# Topologically Sorted Source Nodes: [norm_hidden_states, add, mul, norm_hidden_states_1], Original ATen: [aten.native_layer_norm, aten.add, aten.mul] +# Source node to ATen node mapping: +# add => add_1 +# mul => mul_1 +# norm_hidden_states => add, convert_element_type, convert_element_type_1, mul, rsqrt, sub, var_mean +# norm_hidden_states_1 => add_2 +# Graph fragment: +# %arg0_1 : Tensor "bf16[1, 2304, 4096][9437184, 4096, 1]cuda:0" = PlaceHolder[target=arg0_1] +# %arg1_1 : Tensor "bf16[1, 1, 4096][12288, 12288, 1]cuda:0" = PlaceHolder[target=arg1_1] +# %getitem_1 : Tensor "f32[1, 2304, 1][2304, 1, 2304]cuda:0" = PlaceHolder[target=getitem_1] +# %buf1 : Tensor "f32[1, 2304, 1][2304, 1, 2304]cuda:0" = PlaceHolder[target=buf1] +# %arg2_1 : Tensor "bf16[1, 1, 4096][12288, 12288, 1]cuda:0" = PlaceHolder[target=arg2_1] +# %convert_element_type : Tensor "f32[1, 2304, 4096][9437184, 4096, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%arg0_1, torch.float32), kwargs = {}) +# %var_mean : [num_users=2] = call_function[target=torch.ops.aten.var_mean.correction](args = (%convert_element_type, [2]), kwargs = {correction: 0, keepdim: True}) +# %add_1 : Tensor "bf16[1, 1, 4096][4096, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg1_1, 1), kwargs = {}) +# %sub : Tensor "f32[1, 2304, 4096][9437184, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%convert_element_type, %getitem_1), kwargs = {}) +# %add : Tensor "f32[1, 2304, 1][2304, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%getitem, 1e-06), kwargs = {}) +# %rsqrt : Tensor "f32[1, 2304, 1][2304, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add,), kwargs = {}) +# %mul : Tensor "f32[1, 2304, 4096][9437184, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub, %rsqrt), kwargs = {}) +# %convert_element_type_1 : Tensor "bf16[1, 2304, 4096][9437184, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul, torch.bfloat16), kwargs = {}) +# %mul_1 : Tensor "bf16[1, 2304, 4096][9437184, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_1, %convert_element_type_1), kwargs = {}) +# %add_2 : Tensor "bf16[1, 2304, 4096][9437184, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_1, %arg2_1), kwargs = {}) +# return %getitem_1,%buf1,%add_2 +triton_red_fused_add_mul_native_layer_norm_0 = async_compile.triton('triton_red_fused_add_mul_native_layer_norm_0', ''' +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.reduction( + size_hints={'x': 4096, 'r0_': 4096}, + reduction_hint=ReductionHint.INNER, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr2': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_add_mul_native_layer_norm_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 4, 'num_store': 1, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'add_persistent_rblock': True, 'tiling_scores': {'x': 0, 'r0_': 56639488}} +) +@triton.jit +def triton_red_fused_add_mul_native_layer_norm_0(in_ptr0, in_ptr1, in_ptr2, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): + xnumel = 2304 + r0_numel = 4096 + rnumel = r0_numel + RBLOCK: tl.constexpr = R0_BLOCK + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None] + xmask = xindex < xnumel + r0_base = tl.arange(0, R0_BLOCK)[None, :] + rbase = r0_base + x0 = xindex + tmp3_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + tmp3_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + tmp3_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_1 = r0_index + tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp1 = tmp0.to(tl.float32) + tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK]) + tmp3_mean_next, tmp3_m2_next, tmp3_weight_next = triton_helpers.welford_reduce( + tmp2, tmp3_mean, tmp3_m2, tmp3_weight, roffset == 0 + ) + tmp3_mean = tl.where(r0_mask & xmask, tmp3_mean_next, tmp3_mean) + tmp3_m2 = tl.where(r0_mask & xmask, tmp3_m2_next, tmp3_m2) + tmp3_weight = tl.where(r0_mask & xmask, tmp3_weight_next, tmp3_weight) + tmp4, tmp5, tmp6 = triton_helpers.welford(tmp3_mean, tmp3_m2, tmp3_weight, 1) + tmp3 = tmp4[:, None] + tmp7 = tmp5[:, None] + tmp8 = tmp6[:, None] + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_1 = r0_index + tmp9 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp12 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32) + tmp23 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp10 = 1.0 + tmp11 = tmp9 + tmp10 + tmp13 = tmp12.to(tl.float32) + tmp14 = tmp13 - tmp3 + tmp15 = 4096.0 + tmp16 = (tmp7 / tmp15) + tmp17 = 1e-06 + tmp18 = tmp16 + tmp17 + tmp19 = libdevice.rsqrt(tmp18) + tmp20 = tmp14 * tmp19 + tmp21 = tmp20.to(tl.float32) + tmp22 = tmp11 * tmp21 + tmp24 = tmp22 + tmp23 + tl.store(out_ptr2 + (r0_1 + 4096*x0), tmp24, r0_mask & xmask) +''', device_str='cuda') + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + arg0_1, arg1_1, arg2_1 = args + args.clear() + assert_size_stride(arg0_1, (1, 2304, 4096), (9437184, 4096, 1)) + assert_size_stride(arg1_1, (1, 1, 4096), (12288, 12288, 1)) + assert_size_stride(arg2_1, (1, 1, 4096), (12288, 12288, 1)) + with torch.cuda._DeviceGuard(0): + torch.cuda.set_device(0) + buf3 = empty_strided_cuda((1, 2304, 4096), (9437184, 4096, 1), torch.bfloat16) + # Topologically Sorted Source Nodes: [norm_hidden_states, add, mul, norm_hidden_states_1], Original ATen: [aten.native_layer_norm, aten.add, aten.mul] + stream0 = get_raw_stream(0) + triton_red_fused_add_mul_native_layer_norm_0.run(arg0_1, arg1_1, arg2_1, buf3, 2304, 4096, stream=stream0) + del arg0_1 + del arg1_1 + del arg2_1 + return (buf3, ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = rand_strided((1, 2304, 4096), (9437184, 4096, 1), device='cuda:0', dtype=torch.bfloat16) + arg1_1 = rand_strided((1, 1, 4096), (12288, 12288, 1), device='cuda:0', dtype=torch.bfloat16) + arg2_1 = rand_strided((1, 1, 4096), (12288, 12288, 1), device='cuda:0', dtype=torch.bfloat16) + fn = lambda: call([arg0_1, arg1_1, arg2_1]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/torchinductor/od/cod27kpzzzmh5ox7u6e6gvblzejrgxtrpbq2umbsyzgpvwzamxjx.py b/torchinductor/od/cod27kpzzzmh5ox7u6e6gvblzejrgxtrpbq2umbsyzgpvwzamxjx.py new file mode 100644 index 0000000000000000000000000000000000000000..99ab7d031726aa9c6576aabb9f9e0fbfc69fd3f3 --- /dev/null +++ b/torchinductor/od/cod27kpzzzmh5ox7u6e6gvblzejrgxtrpbq2umbsyzgpvwzamxjx.py @@ -0,0 +1,250 @@ +# AOT ID: ['26_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels +import triton +import triton.language as tl +from torch._inductor.runtime.triton_heuristics import start_graph, end_graph +from torch._C import _cuda_getCurrentRawStream as get_raw_stream + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py +# Topologically Sorted Source Nodes: [permute, q, permute_1, k, permute_2, v, output], Original ATen: [aten.permute, aten.clone, aten._scaled_dot_product_cudnn_attention] +# Source node to ATen node mapping: +# k => clone_1 +# output => _scaled_dot_product_cudnn_attention +# permute => permute +# permute_1 => permute_1 +# permute_2 => permute_2 +# q => clone +# v => clone_2 +# Graph fragment: +# %arg0_1 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0" = PlaceHolder[target=arg0_1] +# %permute : Tensor "bf16[1, 32, 2304, 128][9437184, 128, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%arg0_1, [0, 2, 1, 3]), kwargs = {}) +# %clone : Tensor "bf16[1, 32, 2304, 128][9437184, 294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%permute,), kwargs = {memory_format: torch.contiguous_format}) +# %permute_1 : Tensor "bf16[1, 32, 2304, 128][9437184, 128, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%arg1_1, [0, 2, 1, 3]), kwargs = {}) +# %clone_1 : Tensor "bf16[1, 32, 2304, 128][9437184, 294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%permute_1,), kwargs = {memory_format: torch.contiguous_format}) +# %permute_2 : Tensor "bf16[1, 32, 2304, 128][2304*s73, 128, s73, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%arg3_1, [0, 2, 1, 3]), kwargs = {}) +# %clone_2 : Tensor "bf16[1, 32, 2304, 128][9437184, 294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%permute_2,), kwargs = {memory_format: torch.contiguous_format}) +# %_scaled_dot_product_cudnn_attention : [num_users=1] = call_function[target=torch.ops.aten._scaled_dot_product_cudnn_attention.default](args = (%clone, %clone_1, %clone_2, None, False), kwargs = {}) +# return %buf0 +triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0 = async_compile.triton('triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0', ''' +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 16777216}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 56623104}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): + xnumel = 9437184 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x0 = (xindex % 128) + x1 = ((xindex // 128) % 2304) + x2 = xindex // 294912 + x3 = xindex + tmp0 = tl.load(in_ptr0 + (x0 + 128*x2 + 4096*x1), None).to(tl.float32) + tl.store(out_ptr0 + (x3), tmp0, None) +''', device_str='cuda') + + +# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py +# Topologically Sorted Source Nodes: [permute, q, permute_1, k, permute_2, v, output], Original ATen: [aten.permute, aten.clone, aten._scaled_dot_product_cudnn_attention] +# Source node to ATen node mapping: +# k => clone_1 +# output => _scaled_dot_product_cudnn_attention +# permute => permute +# permute_1 => permute_1 +# permute_2 => permute_2 +# q => clone +# v => clone_2 +# Graph fragment: +# %arg3_1 : Tensor "bf16[1, 2304, 32, 128][2304*s73, s73, 128, 1]cuda:0" = PlaceHolder[target=arg3_1] +# %permute : Tensor "bf16[1, 32, 2304, 128][9437184, 128, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%arg0_1, [0, 2, 1, 3]), kwargs = {}) +# %clone : Tensor "bf16[1, 32, 2304, 128][9437184, 294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%permute,), kwargs = {memory_format: torch.contiguous_format}) +# %permute_1 : Tensor "bf16[1, 32, 2304, 128][9437184, 128, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%arg1_1, [0, 2, 1, 3]), kwargs = {}) +# %clone_1 : Tensor "bf16[1, 32, 2304, 128][9437184, 294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%permute_1,), kwargs = {memory_format: torch.contiguous_format}) +# %permute_2 : Tensor "bf16[1, 32, 2304, 128][2304*s73, 128, s73, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%arg3_1, [0, 2, 1, 3]), kwargs = {}) +# %clone_2 : Tensor "bf16[1, 32, 2304, 128][9437184, 294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%permute_2,), kwargs = {memory_format: torch.contiguous_format}) +# %_scaled_dot_product_cudnn_attention : [num_users=1] = call_function[target=torch.ops.aten._scaled_dot_product_cudnn_attention.default](args = (%clone, %clone_1, %clone_2, None, False), kwargs = {}) +# return %buf2 +triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1 = async_compile.triton('triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1', ''' +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 16777216}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'ks0': 'i64', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 37748736}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1(in_ptr0, out_ptr0, ks0, xnumel, XBLOCK : tl.constexpr): + xnumel = 9437184 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x0 = (xindex % 128) + x1 = ((xindex // 128) % 2304) + x2 = xindex // 294912 + x3 = xindex + tmp0 = tl.load(in_ptr0 + (x0 + 128*x2 + ks0*x1), None).to(tl.float32) + tl.store(out_ptr0 + (x3), tmp0, None) +''', device_str='cuda') + + +# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py +# Topologically Sorted Source Nodes: [permute_3, out], Original ATen: [aten.permute, aten.clone] +# Source node to ATen node mapping: +# out => clone_3 +# permute_3 => permute_3 +# Graph fragment: +# %getitem : Tensor "bf16[1, 32, 2304, 128][9437184, 294912, 128, 1]cuda:0" = PlaceHolder[target=getitem] +# %permute_3 : Tensor "bf16[1, 2304, 32, 128][9437184, 128, 294912, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%getitem, [0, 2, 1, 3]), kwargs = {}) +# %clone_3 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%permute_3,), kwargs = {memory_format: torch.contiguous_format}) +# return %clone_3 +triton_poi_fused_clone_permute_2 = async_compile.triton('triton_poi_fused_clone_permute_2', ''' +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 16777216}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_clone_permute_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 56623104}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused_clone_permute_2(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): + xnumel = 9437184 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x0 = (xindex % 128) + x1 = ((xindex // 128) % 32) + x2 = xindex // 4096 + x3 = xindex + tmp0 = tl.load(in_ptr0 + (x0 + 128*x2 + 294912*x1), None).to(tl.float32) + tl.store(out_ptr0 + (x3), tmp0, None) +''', device_str='cuda') + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + arg0_1, arg1_1, arg2_1, arg3_1 = args + args.clear() + s73 = arg2_1 + assert_size_stride(arg0_1, (1, 2304, 32, 128), (9437184, 4096, 128, 1)) + assert_size_stride(arg1_1, (1, 2304, 32, 128), (9437184, 4096, 128, 1)) + assert_size_stride(arg3_1, (1, 2304, 32, 128), (2304*s73, s73, 128, 1)) + with torch.cuda._DeviceGuard(0): + torch.cuda.set_device(0) + buf0 = empty_strided_cuda((1, 32, 2304, 128), (9437184, 294912, 128, 1), torch.bfloat16) + # Topologically Sorted Source Nodes: [permute, q, permute_1, k, permute_2, v, output], Original ATen: [aten.permute, aten.clone, aten._scaled_dot_product_cudnn_attention] + stream0 = get_raw_stream(0) + triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.run(arg0_1, buf0, 9437184, stream=stream0) + del arg0_1 + buf1 = empty_strided_cuda((1, 32, 2304, 128), (9437184, 294912, 128, 1), torch.bfloat16) + # Topologically Sorted Source Nodes: [permute, q, permute_1, k, permute_2, v, output], Original ATen: [aten.permute, aten.clone, aten._scaled_dot_product_cudnn_attention] + stream0 = get_raw_stream(0) + triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.run(arg1_1, buf1, 9437184, stream=stream0) + del arg1_1 + buf2 = empty_strided_cuda((1, 32, 2304, 128), (9437184, 294912, 128, 1), torch.bfloat16) + # Topologically Sorted Source Nodes: [permute, q, permute_1, k, permute_2, v, output], Original ATen: [aten.permute, aten.clone, aten._scaled_dot_product_cudnn_attention] + stream0 = get_raw_stream(0) + triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.run(arg3_1, buf2, s73, 9437184, stream=stream0) + del arg3_1 + # Topologically Sorted Source Nodes: [permute, q, permute_1, k, permute_2, v, output], Original ATen: [aten.permute, aten.clone, aten._scaled_dot_product_cudnn_attention] + buf3 = torch.ops.aten._scaled_dot_product_cudnn_attention.default(buf0, buf1, buf2, None, False) + del buf0 + del buf1 + buf4 = buf3[0] + assert_size_stride(buf4, (1, 32, 2304, 128), (9437184, 294912, 128, 1), 'torch.ops.aten._scaled_dot_product_cudnn_attention.default') + assert_alignment(buf4, 16, 'torch.ops.aten._scaled_dot_product_cudnn_attention.default') + del buf3 + buf8 = reinterpret_tensor(buf2, (1, 2304, 32, 128), (9437184, 4096, 128, 1), 0); del buf2 # reuse + # Topologically Sorted Source Nodes: [permute_3, out], Original ATen: [aten.permute, aten.clone] + stream0 = get_raw_stream(0) + triton_poi_fused_clone_permute_2.run(buf4, buf8, 9437184, stream=stream0) + del buf4 + return (buf8, ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = rand_strided((1, 2304, 32, 128), (9437184, 4096, 128, 1), device='cuda:0', dtype=torch.bfloat16) + arg1_1 = rand_strided((1, 2304, 32, 128), (9437184, 4096, 128, 1), device='cuda:0', dtype=torch.bfloat16) + arg2_1 = 36864 + arg3_1 = rand_strided((1, 2304, 32, 128), (84934656, 36864, 128, 1), device='cuda:0', dtype=torch.bfloat16) + fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/torchinductor/pg/8fb4988f5104615f769d2dd3f2407e6d419f6d4936f5f3f5ac38851533607a15.best_config b/torchinductor/pg/8fb4988f5104615f769d2dd3f2407e6d419f6d4936f5f3f5ac38851533607a15.best_config new file mode 100644 index 0000000000000000000000000000000000000000..76c99916a5947bdea0009b5df465a376f7040f79 --- /dev/null +++ b/torchinductor/pg/8fb4988f5104615f769d2dd3f2407e6d419f6d4936f5f3f5ac38851533607a15.best_config @@ -0,0 +1 @@ +{"XBLOCK": 1, "R0_BLOCK": 4096, "num_warps": 16, "num_stages": 1, "configs_hash": "ba27f374f6982634f1ab959ad1e63f726920cfc2c7c821f8e68ec55c3d4d94fc", "found_by_coordesc": false, "time_taken_ms": 41, "triton_cache_hash": "U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A"} \ No newline at end of file diff --git a/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py b/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py new file mode 100644 index 0000000000000000000000000000000000000000..73c4d36f9ce26693ed156cf6ffe8986dad375879 --- /dev/null +++ b/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py @@ -0,0 +1,73 @@ + +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.reduction( + size_hints={'x': 4096, 'r0_': 4096}, + reduction_hint=ReductionHint.INNER, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr2': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_add_mul_native_layer_norm_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 4, 'num_store': 1, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'add_persistent_rblock': True, 'tiling_scores': {'x': 0, 'r0_': 56639488}} +) +@triton.jit +def triton_red_fused_add_mul_native_layer_norm_0(in_ptr0, in_ptr1, in_ptr2, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): + xnumel = 2304 + r0_numel = 4096 + rnumel = r0_numel + RBLOCK: tl.constexpr = R0_BLOCK + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None] + xmask = xindex < xnumel + r0_base = tl.arange(0, R0_BLOCK)[None, :] + rbase = r0_base + x0 = xindex + tmp3_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + tmp3_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + tmp3_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_1 = r0_index + tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp1 = tmp0.to(tl.float32) + tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK]) + tmp3_mean_next, tmp3_m2_next, tmp3_weight_next = triton_helpers.welford_reduce( + tmp2, tmp3_mean, tmp3_m2, tmp3_weight, roffset == 0 + ) + tmp3_mean = tl.where(r0_mask & xmask, tmp3_mean_next, tmp3_mean) + tmp3_m2 = tl.where(r0_mask & xmask, tmp3_m2_next, tmp3_m2) + tmp3_weight = tl.where(r0_mask & xmask, tmp3_weight_next, tmp3_weight) + tmp4, tmp5, tmp6 = triton_helpers.welford(tmp3_mean, tmp3_m2, tmp3_weight, 1) + tmp3 = tmp4[:, None] + tmp7 = tmp5[:, None] + tmp8 = tmp6[:, None] + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_1 = r0_index + tmp9 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp12 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32) + tmp23 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp10 = 1.0 + tmp11 = tmp9 + tmp10 + tmp13 = tmp12.to(tl.float32) + tmp14 = tmp13 - tmp3 + tmp15 = 4096.0 + tmp16 = (tmp7 / tmp15) + tmp17 = 1e-06 + tmp18 = tmp16 + tmp17 + tmp19 = libdevice.rsqrt(tmp18) + tmp20 = tmp14 * tmp19 + tmp21 = tmp20.to(tl.float32) + tmp22 = tmp11 * tmp21 + tmp24 = tmp22 + tmp23 + tl.store(out_ptr2 + (r0_1 + 4096*x0), tmp24, r0_mask & xmask) diff --git a/torchinductor/pr/cprazirlfgitxz6pco2dilxlbw7te34agf7yp3dftuyge55edpsr.py b/torchinductor/pr/cprazirlfgitxz6pco2dilxlbw7te34agf7yp3dftuyge55edpsr.py new file mode 100644 index 0000000000000000000000000000000000000000..572568ff7523f5d2306706e8104a0276a0a90263 --- /dev/null +++ b/torchinductor/pr/cprazirlfgitxz6pco2dilxlbw7te34agf7yp3dftuyge55edpsr.py @@ -0,0 +1,203 @@ +# AOT ID: ['14_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels +import triton +import triton.language as tl +from torch._inductor.runtime.triton_heuristics import start_graph, end_graph +from torch._C import _cuda_getCurrentRawStream as get_raw_stream + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py +# Topologically Sorted Source Nodes: [attn_output, hidden_states, norm_hidden_states, add_1, mul_1, norm_hidden_states_1], Original ATen: [aten.mul, aten.add, aten.native_layer_norm] +# Source node to ATen node mapping: +# add_1 => add_2 +# attn_output => mul +# hidden_states => add +# mul_1 => mul_2 +# norm_hidden_states => add_1, convert_element_type, convert_element_type_1, mul_1, rsqrt, sub, var_mean +# norm_hidden_states_1 => add_3 +# Graph fragment: +# %arg2_1 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0" = PlaceHolder[target=arg2_1] +# %arg0_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg0_1] +# %arg1_1 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0" = PlaceHolder[target=arg1_1] +# %add : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0" = PlaceHolder[target=add] +# %getitem_1 : Tensor "f32[1, 2048, 1][2048, 1, 2048]cuda:0" = PlaceHolder[target=getitem_1] +# %buf2 : Tensor "f32[1, 2048, 1][2048, 1, 2048]cuda:0" = PlaceHolder[target=buf2] +# %arg3_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg3_1] +# %arg4_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg4_1] +# %mul : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%arg0_1, %arg1_1), kwargs = {}) +# %add : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg2_1, %mul), kwargs = {}) +# %convert_element_type : Tensor "f32[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%add, torch.float32), kwargs = {}) +# %var_mean : [num_users=2] = call_function[target=torch.ops.aten.var_mean.correction](args = (%convert_element_type, [2]), kwargs = {correction: 0, keepdim: True}) +# %sub : Tensor "f32[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%convert_element_type, %getitem_1), kwargs = {}) +# %add_1 : Tensor "f32[1, 2048, 1][2048, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%getitem, 1e-06), kwargs = {}) +# %rsqrt : Tensor "f32[1, 2048, 1][2048, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add_1,), kwargs = {}) +# %mul_1 : Tensor "f32[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub, %rsqrt), kwargs = {}) +# %convert_element_type_1 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_1, torch.bfloat16), kwargs = {}) +# %add_2 : Tensor "bf16[1, 1, 4096][4096, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg3_1, 1), kwargs = {}) +# %mul_2 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_1, %add_2), kwargs = {}) +# %add_3 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_2, %arg4_1), kwargs = {}) +# return %add,%getitem_1,%buf2,%add_3 +triton_red_fused_add_mul_native_layer_norm_0 = async_compile.triton('triton_red_fused_add_mul_native_layer_norm_0', ''' +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.reduction( + size_hints={'x': 2048, 'r0_': 4096}, + reduction_hint=ReductionHint.INNER, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*bf16', 'out_ptr0': '*bf16', 'out_ptr3': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_add_mul_native_layer_norm_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 6, 'num_store': 2, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'add_persistent_rblock': True, 'tiling_scores': {'x': 0, 'r0_': 100687872}} +) +@triton.jit +def triton_red_fused_add_mul_native_layer_norm_0(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr3, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): + xnumel = 2048 + r0_numel = 4096 + rnumel = r0_numel + RBLOCK: tl.constexpr = R0_BLOCK + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None] + xmask = xindex < xnumel + r0_base = tl.arange(0, R0_BLOCK)[None, :] + rbase = r0_base + x0 = xindex + tmp7_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + tmp7_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + tmp7_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_1 = r0_index + tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32) + tmp1 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp2 = tl.load(in_ptr2 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32) + tmp3 = tmp1 * tmp2 + tmp4 = tmp0 + tmp3 + tmp5 = tmp4.to(tl.float32) + tmp6 = tl.broadcast_to(tmp5, [XBLOCK, R0_BLOCK]) + tmp7_mean_next, tmp7_m2_next, tmp7_weight_next = triton_helpers.welford_reduce( + tmp6, tmp7_mean, tmp7_m2, tmp7_weight, roffset == 0 + ) + tmp7_mean = tl.where(r0_mask & xmask, tmp7_mean_next, tmp7_mean) + tmp7_m2 = tl.where(r0_mask & xmask, tmp7_m2_next, tmp7_m2) + tmp7_weight = tl.where(r0_mask & xmask, tmp7_weight_next, tmp7_weight) + tl.store(out_ptr0 + (r0_1 + 4096*x0), tmp4, r0_mask & xmask) + tmp8, tmp9, tmp10 = triton_helpers.welford(tmp7_mean, tmp7_m2, tmp7_weight, 1) + tmp7 = tmp8[:, None] + tmp11 = tmp9[:, None] + tmp12 = tmp10[:, None] + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_1 = r0_index + tmp13 = tl.load(out_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32) + tmp23 = tl.load(in_ptr3 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp27 = tl.load(in_ptr4 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp14 = tmp13.to(tl.float32) + tmp15 = tmp14 - tmp7 + tmp16 = 4096.0 + tmp17 = (tmp11 / tmp16) + tmp18 = 1e-06 + tmp19 = tmp17 + tmp18 + tmp20 = libdevice.rsqrt(tmp19) + tmp21 = tmp15 * tmp20 + tmp22 = tmp21.to(tl.float32) + tmp24 = 1.0 + tmp25 = tmp23 + tmp24 + tmp26 = tmp22 * tmp25 + tmp28 = tmp26 + tmp27 + tl.store(out_ptr3 + (r0_1 + 4096*x0), tmp28, r0_mask & xmask) +''', device_str='cuda') + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + arg0_1, arg1_1, arg2_1, arg3_1, arg4_1 = args + args.clear() + assert_size_stride(arg0_1, (1, 1, 4096), (24576, 24576, 1)) + assert_size_stride(arg1_1, (1, 2048, 4096), (8388608, 4096, 1)) + assert_size_stride(arg2_1, (1, 2048, 4096), (8388608, 4096, 1)) + assert_size_stride(arg3_1, (1, 1, 4096), (24576, 24576, 1)) + assert_size_stride(arg4_1, (1, 1, 4096), (24576, 24576, 1)) + with torch.cuda._DeviceGuard(0): + torch.cuda.set_device(0) + buf0 = empty_strided_cuda((1, 2048, 4096), (8388608, 4096, 1), torch.bfloat16) + buf4 = empty_strided_cuda((1, 2048, 4096), (8388608, 4096, 1), torch.bfloat16) + # Topologically Sorted Source Nodes: [attn_output, hidden_states, norm_hidden_states, add_1, mul_1, norm_hidden_states_1], Original ATen: [aten.mul, aten.add, aten.native_layer_norm] + stream0 = get_raw_stream(0) + triton_red_fused_add_mul_native_layer_norm_0.run(arg2_1, arg0_1, arg1_1, arg3_1, arg4_1, buf0, buf4, 2048, 4096, stream=stream0) + del arg0_1 + del arg1_1 + del arg2_1 + del arg3_1 + del arg4_1 + return (buf4, buf0, ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16) + arg1_1 = rand_strided((1, 2048, 4096), (8388608, 4096, 1), device='cuda:0', dtype=torch.bfloat16) + arg2_1 = rand_strided((1, 2048, 4096), (8388608, 4096, 1), device='cuda:0', dtype=torch.bfloat16) + arg3_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16) + arg4_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16) + fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/torchinductor/py/cpymuollhpudssjtecoz4ymxklqd6cy3cy7ayt7z4vptnib66447.py b/torchinductor/py/cpymuollhpudssjtecoz4ymxklqd6cy3cy7ayt7z4vptnib66447.py new file mode 100644 index 0000000000000000000000000000000000000000..1341cf6f6209c5e0341d4ff07f5041d338661d0d --- /dev/null +++ b/torchinductor/py/cpymuollhpudssjtecoz4ymxklqd6cy3cy7ayt7z4vptnib66447.py @@ -0,0 +1,560 @@ +# AOT ID: ['8_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels +import triton +import triton.language as tl +from torch._inductor.runtime.triton_heuristics import start_graph, end_graph +from torch._C import _cuda_getCurrentRawStream as get_raw_stream + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py +# Topologically Sorted Source Nodes: [encoder_query, encoder_query_1], Original ATen: [aten.view, aten._fused_rms_norm] +# Source node to ATen node mapping: +# encoder_query => view_3 +# encoder_query_1 => convert_element_type_4, mean_2, pow_3 +# Graph fragment: +# %arg5_1 : Tensor "bf16[1, 256, 4096][3145728, 12288, 1]cuda:0" = PlaceHolder[target=arg5_1] +# %view_3 : Tensor "bf16[1, 256, 32, 128][3145728, 12288, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%arg5_1, [1, 256, 32, 128]), kwargs = {}) +# %convert_element_type_4 : Tensor "f32[1, 256, 32, 128][1048576, 4096, 128, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_3, torch.float32), kwargs = {}) +# %pow_3 : Tensor "f32[1, 256, 32, 128][1048576, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.pow.Tensor_Scalar](args = (%convert_element_type_4, 2), kwargs = {}) +# %mean_2 : Tensor "f32[1, 256, 32, 1][8192, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mean.dim](args = (%pow_3, [3], True), kwargs = {}) +# return %buf0 +triton_red_fused__fused_rms_norm_view_0 = async_compile.triton('triton_red_fused__fused_rms_norm_view_0', ''' +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.reduction( + size_hints={'x': 8192, 'r0_': 128}, + reduction_hint=ReductionHint.DEFAULT, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused__fused_rms_norm_view_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 1, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 65536, 'r0_': 2097152}} +) +@triton.jit +def triton_red_fused__fused_rms_norm_view_0(in_ptr0, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): + xnumel = 8192 + r0_numel = 128 + rnumel = r0_numel + RBLOCK: tl.constexpr = R0_BLOCK + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None] + xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) + r0_base = tl.arange(0, R0_BLOCK)[None, :] + rbase = r0_base + x0 = (xindex % 32) + x1 = xindex // 32 + _tmp4 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) + x3 = xindex + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_2 = r0_index + tmp0 = tl.load(in_ptr0 + (r0_2 + 128*x0 + 12288*x1), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) + tmp1 = tmp0.to(tl.float32) + tmp2 = tmp1 * tmp1 + tmp3 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK]) + tmp5 = _tmp4 + tmp3 + _tmp4 = tl.where(r0_mask, tmp5, _tmp4) + tmp4 = tl.sum(_tmp4, 1)[:, None] + tl.store(out_ptr0 + (x3), tmp4, None) +''', device_str='cuda') + + +# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py +# Topologically Sorted Source Nodes: [query, query_1], Original ATen: [aten.view, aten._fused_rms_norm] +# Source node to ATen node mapping: +# query => view +# query_1 => convert_element_type, mean, pow_1 +# Graph fragment: +# %arg0_1 : Tensor "bf16[1, 2048, 4096][25165824, 12288, 1]cuda:0" = PlaceHolder[target=arg0_1] +# %view : Tensor "bf16[1, 2048, 32, 128][25165824, 12288, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%arg0_1, [1, 2048, 32, 128]), kwargs = {}) +# %convert_element_type : Tensor "f32[1, 2048, 32, 128][8388608, 4096, 128, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view, torch.float32), kwargs = {}) +# %pow_1 : Tensor "f32[1, 2048, 32, 128][8388608, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.pow.Tensor_Scalar](args = (%convert_element_type, 2), kwargs = {}) +# %mean : Tensor "f32[1, 2048, 32, 1][65536, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mean.dim](args = (%pow_1, [3], True), kwargs = {}) +# return %buf1 +triton_red_fused__fused_rms_norm_view_1 = async_compile.triton('triton_red_fused__fused_rms_norm_view_1', ''' +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.reduction( + size_hints={'x': 65536, 'r0_': 128}, + reduction_hint=ReductionHint.DEFAULT, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused__fused_rms_norm_view_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 1, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 524288, 'r0_': 16777216}} +) +@triton.jit +def triton_red_fused__fused_rms_norm_view_1(in_ptr0, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): + xnumel = 65536 + r0_numel = 128 + rnumel = r0_numel + RBLOCK: tl.constexpr = R0_BLOCK + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None] + xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) + r0_base = tl.arange(0, R0_BLOCK)[None, :] + rbase = r0_base + x0 = (xindex % 32) + x1 = xindex // 32 + _tmp4 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) + x3 = xindex + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_2 = r0_index + tmp0 = tl.load(in_ptr0 + (r0_2 + 128*x0 + 12288*x1), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) + tmp1 = tmp0.to(tl.float32) + tmp2 = tmp1 * tmp1 + tmp3 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK]) + tmp5 = _tmp4 + tmp3 + _tmp4 = tl.where(r0_mask, tmp5, _tmp4) + tmp4 = tl.sum(_tmp4, 1)[:, None] + tl.store(out_ptr0 + (x3), tmp4, None) +''', device_str='cuda') + + +# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py +# Topologically Sorted Source Nodes: [encoder_query, encoder_query_1, query, query_1, query_2], Original ATen: [aten.view, aten._fused_rms_norm, aten.cat] +# Source node to ATen node mapping: +# encoder_query => view_3 +# encoder_query_1 => add_2, convert_element_type_4, convert_element_type_5, mean_2, mul_4, mul_5, pow_3, rsqrt_2 +# query => view +# query_1 => add, convert_element_type, convert_element_type_1, mean, mul, mul_1, pow_1, rsqrt +# query_2 => cat +# Graph fragment: +# %arg5_1 : Tensor "bf16[1, 256, 4096][3145728, 12288, 1]cuda:0" = PlaceHolder[target=arg5_1] +# %buf0 : Tensor "f32[1, 256, 32, 1][8192, 32, 1, 8192]cuda:0" = PlaceHolder[target=buf0] +# %arg8_1 : Tensor "bf16[128][1]cuda:0" = PlaceHolder[target=arg8_1] +# %arg0_1 : Tensor "bf16[1, 2048, 4096][25165824, 12288, 1]cuda:0" = PlaceHolder[target=arg0_1] +# %buf1 : Tensor "f32[1, 2048, 32, 1][65536, 32, 1, 65536]cuda:0" = PlaceHolder[target=buf1] +# %arg3_1 : Tensor "bf16[128][1]cuda:0" = PlaceHolder[target=arg3_1] +# %view_3 : Tensor "bf16[1, 256, 32, 128][3145728, 12288, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%arg5_1, [1, 256, 32, 128]), kwargs = {}) +# %convert_element_type_4 : Tensor "f32[1, 256, 32, 128][1048576, 4096, 128, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_3, torch.float32), kwargs = {}) +# %pow_3 : Tensor "f32[1, 256, 32, 128][1048576, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.pow.Tensor_Scalar](args = (%convert_element_type_4, 2), kwargs = {}) +# %mean_2 : Tensor "f32[1, 256, 32, 1][8192, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mean.dim](args = (%pow_3, [3], True), kwargs = {}) +# %add_2 : Tensor "f32[1, 256, 32, 1][8192, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Scalar](args = (%mean_2, 1e-06), kwargs = {}) +# %rsqrt_2 : Tensor "f32[1, 256, 32, 1][8192, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add_2,), kwargs = {}) +# %mul_4 : Tensor "f32[1, 256, 32, 128][1048576, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_4, %rsqrt_2), kwargs = {}) +# %mul_5 : Tensor "f32[1, 256, 32, 128][1048576, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_4, %arg8_1), kwargs = {}) +# %convert_element_type_5 : Tensor "bf16[1, 256, 32, 128][1048576, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_5, torch.bfloat16), kwargs = {}) +# %view : Tensor "bf16[1, 2048, 32, 128][25165824, 12288, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%arg0_1, [1, 2048, 32, 128]), kwargs = {}) +# %convert_element_type : Tensor "f32[1, 2048, 32, 128][8388608, 4096, 128, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view, torch.float32), kwargs = {}) +# %pow_1 : Tensor "f32[1, 2048, 32, 128][8388608, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.pow.Tensor_Scalar](args = (%convert_element_type, 2), kwargs = {}) +# %mean : Tensor "f32[1, 2048, 32, 1][65536, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mean.dim](args = (%pow_1, [3], True), kwargs = {}) +# %add : Tensor "f32[1, 2048, 32, 1][65536, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Scalar](args = (%mean, 1e-06), kwargs = {}) +# %rsqrt : Tensor "f32[1, 2048, 32, 1][65536, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add,), kwargs = {}) +# %mul : Tensor "f32[1, 2048, 32, 128][8388608, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type, %rsqrt), kwargs = {}) +# %mul_1 : Tensor "f32[1, 2048, 32, 128][8388608, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul, %arg3_1), kwargs = {}) +# %convert_element_type_1 : Tensor "bf16[1, 2048, 32, 128][8388608, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_1, torch.bfloat16), kwargs = {}) +# %cat : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.aten.cat.default](args = ([%convert_element_type_5, %convert_element_type_1], 1), kwargs = {}) +# return %cat +triton_poi_fused__fused_rms_norm_cat_view_2 = async_compile.triton('triton_poi_fused__fused_rms_norm_cat_view_2', ''' +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'y': 131072, 'x': 128}, tile_hint=TileHint.DEFAULT, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*fp32', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*fp32', 'in_ptr5': '*bf16', 'out_ptr0': '*bf16', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid2DWithYZOverflow', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__fused_rms_norm_cat_view_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 6, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'y': 589824, 'x': 75497984}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused__fused_rms_norm_cat_view_2(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): + ynumel = 73728 + xnumel = 128 + yoffset = (tl.program_id(1) + tl.program_id(2) * tl.num_programs(1)) * YBLOCK + yindex = yoffset + tl.arange(0, YBLOCK)[:, None] + ymask = yindex < ynumel + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[None, :] + xmask = xindex < xnumel + y1 = yindex // 32 + x2 = xindex + y0 = (yindex % 32) + y3 = yindex + tmp0 = y1 + tmp1 = tl.full([1, 1], 0, tl.int64) + tmp2 = tmp0 >= tmp1 + tmp3 = tl.full([1, 1], 256, tl.int64) + tmp4 = tmp0 < tmp3 + tmp5 = tl.load(in_ptr0 + (x2 + 128*y0 + 12288*(y1)), tmp4 & xmask & ymask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp6 = tmp5.to(tl.float32) + tmp7 = tl.load(in_ptr1 + (tl.broadcast_to(y0 + 32*(y1), [YBLOCK, XBLOCK])), tmp4 & xmask & ymask, eviction_policy='evict_last', other=0.0) + tmp8 = 128.0 + tmp9 = (tmp7 / tmp8) + tmp10 = 1e-06 + tmp11 = tmp9 + tmp10 + tmp12 = libdevice.rsqrt(tmp11) + tmp13 = tmp6 * tmp12 + tmp14 = tl.load(in_ptr2 + (tl.broadcast_to(x2, [YBLOCK, XBLOCK])), tmp4 & xmask & ymask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp15 = tmp14.to(tl.float32) + tmp16 = tmp13 * tmp15 + tmp17 = tmp16.to(tl.float32) + tmp18 = tl.full(tmp17.shape, 0.0, tmp17.dtype) + tmp19 = tl.where(tmp4, tmp17, tmp18) + tmp20 = tmp0 >= tmp3 + tmp21 = tl.full([1, 1], 2304, tl.int64) + tmp22 = tmp0 < tmp21 + tmp23 = tl.load(in_ptr3 + (x2 + 128*y0 + 12288*((-256) + y1)), tmp20 & xmask & ymask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp24 = tmp23.to(tl.float32) + tmp25 = tl.load(in_ptr4 + (tl.broadcast_to(y0 + 32*((-256) + y1), [YBLOCK, XBLOCK])), tmp20 & xmask & ymask, eviction_policy='evict_last', other=0.0) + tmp26 = 128.0 + tmp27 = (tmp25 / tmp26) + tmp28 = 1e-06 + tmp29 = tmp27 + tmp28 + tmp30 = libdevice.rsqrt(tmp29) + tmp31 = tmp24 * tmp30 + tmp32 = tl.load(in_ptr5 + (tl.broadcast_to(x2, [YBLOCK, XBLOCK])), tmp20 & xmask & ymask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp33 = tmp32.to(tl.float32) + tmp34 = tmp31 * tmp33 + tmp35 = tmp34.to(tl.float32) + tmp36 = tl.full(tmp35.shape, 0.0, tmp35.dtype) + tmp37 = tl.where(tmp20, tmp35, tmp36) + tmp38 = tl.where(tmp4, tmp19, tmp37) + tl.store(out_ptr0 + (x2 + 128*y3), tmp38, xmask & ymask) +''', device_str='cuda') + + +# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py +# Topologically Sorted Source Nodes: [reshape, unbind, reshape_1, unbind_1, float_1, cos, mul, neg, stack, x_rotated, float_2, sin, mul_1, add, out, float_3, cos_2, mul_2, neg_1, stack_1, x_rotated_1, float_4, sin_2, mul_3, add_1, out_1], Original ATen: [aten.view, aten.unbind, aten._to_copy, aten.unsqueeze, aten.mul, aten.neg, aten.stack, aten.add] +# Source node to ATen node mapping: +# add => add_4 +# add_1 => add_5 +# cos => unsqueeze, unsqueeze_1 +# cos_2 => unsqueeze_6, unsqueeze_7 +# float_1 => convert_element_type_8 +# float_2 => convert_element_type_9 +# float_3 => convert_element_type_11 +# float_4 => convert_element_type_12 +# mul => mul_8 +# mul_1 => mul_9 +# mul_2 => mul_10 +# mul_3 => mul_11 +# neg => neg +# neg_1 => neg_1 +# out => convert_element_type_10 +# out_1 => convert_element_type_13 +# reshape => view_6 +# reshape_1 => view_8 +# sin => unsqueeze_2, unsqueeze_3 +# sin_2 => unsqueeze_8, unsqueeze_9 +# stack => cat_3, unsqueeze_4, unsqueeze_5 +# stack_1 => cat_4, unsqueeze_10, unsqueeze_11 +# unbind => unbind +# unbind_1 => unbind_1 +# x_rotated => view_7 +# x_rotated_1 => view_9 +# Graph fragment: +# %cat : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0" = PlaceHolder[target=cat] +# %arg10_1 : Tensor "f32[2304, 128][128, 1]cuda:0" = PlaceHolder[target=arg10_1] +# %arg11_1 : Tensor "f32[2304, 128][128, 1]cuda:0" = PlaceHolder[target=arg11_1] +# %cat_1 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0" = PlaceHolder[target=cat_1] +# %view_6 : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat, [1, 2304, 32, -1, 2]), kwargs = {}) +# %unbind : [num_users=2] = call_function[target=torch.ops.aten.unbind.int](args = (%view_6, -1), kwargs = {}) +# %view_8 : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat_1, [1, 2304, 32, -1, 2]), kwargs = {}) +# %unbind_1 : [num_users=2] = call_function[target=torch.ops.aten.unbind.int](args = (%view_8, -1), kwargs = {}) +# %convert_element_type_8 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%cat, torch.float32), kwargs = {}) +# %unsqueeze : Tensor "f32[1, 2304, 128][294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%arg10_1, 0), kwargs = {}) +# %unsqueeze_1 : Tensor "f32[1, 2304, 1, 128][294912, 128, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze, 2), kwargs = {}) +# %mul_8 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_8, %unsqueeze_1), kwargs = {}) +# %neg : Tensor "bf16[1, 2304, 32, 64][4718592, 2048, 64, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.neg.default](args = (%getitem_1,), kwargs = {}) +# %unsqueeze_4 : Tensor "bf16[1, 2304, 32, 64, 1][4718592, 2048, 64, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%neg, 4), kwargs = {}) +# %unsqueeze_5 : Tensor "bf16[1, 2304, 32, 64, 1][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%getitem, 4), kwargs = {}) +# %cat_3 : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%unsqueeze_4, %unsqueeze_5], -1), kwargs = {}) +# %view_7 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat_3, [1, 2304, 32, 128]), kwargs = {}) +# %convert_element_type_9 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_7, torch.float32), kwargs = {}) +# %unsqueeze_2 : Tensor "f32[1, 2304, 128][294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%arg11_1, 0), kwargs = {}) +# %unsqueeze_3 : Tensor "f32[1, 2304, 1, 128][294912, 128, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze_2, 2), kwargs = {}) +# %mul_9 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_9, %unsqueeze_3), kwargs = {}) +# %add_4 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_8, %mul_9), kwargs = {}) +# %convert_element_type_10 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%add_4, torch.bfloat16), kwargs = {}) +# %convert_element_type_11 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%cat_1, torch.float32), kwargs = {}) +# %unsqueeze_6 : Tensor "f32[1, 2304, 128][294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%arg10_1, 0), kwargs = {}) +# %unsqueeze_7 : Tensor "f32[1, 2304, 1, 128][294912, 128, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze_6, 2), kwargs = {}) +# %mul_10 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_11, %unsqueeze_7), kwargs = {}) +# %neg_1 : Tensor "bf16[1, 2304, 32, 64][4718592, 2048, 64, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.neg.default](args = (%getitem_3,), kwargs = {}) +# %unsqueeze_10 : Tensor "bf16[1, 2304, 32, 64, 1][4718592, 2048, 64, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%neg_1, 4), kwargs = {}) +# %unsqueeze_11 : Tensor "bf16[1, 2304, 32, 64, 1][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%getitem_2, 4), kwargs = {}) +# %cat_4 : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%unsqueeze_10, %unsqueeze_11], -1), kwargs = {}) +# %view_9 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat_4, [1, 2304, 32, 128]), kwargs = {}) +# %convert_element_type_12 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_9, torch.float32), kwargs = {}) +# %unsqueeze_8 : Tensor "f32[1, 2304, 128][294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%arg11_1, 0), kwargs = {}) +# %unsqueeze_9 : Tensor "f32[1, 2304, 1, 128][294912, 128, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze_8, 2), kwargs = {}) +# %mul_11 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_12, %unsqueeze_9), kwargs = {}) +# %add_5 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_10, %mul_11), kwargs = {}) +# %convert_element_type_13 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%add_5, torch.bfloat16), kwargs = {}) +# return %convert_element_type_10,%convert_element_type_13 +triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3 = async_compile.triton('triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3', ''' +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 16777216}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*bf16', 'out_ptr1': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 8, 'num_store': 2, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 115605504}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr): + xnumel = 9437184 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x3 = xindex + x0 = (xindex % 128) + x2 = xindex // 4096 + x4 = xindex // 128 + tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32) + tmp2 = tl.load(in_ptr1 + (x0 + 128*x2), None, eviction_policy='evict_last') + tmp19 = tl.load(in_ptr2 + (x0 + 128*x2), None, eviction_policy='evict_last') + tmp23 = tl.load(in_ptr3 + (x3), None).to(tl.float32) + tmp1 = tmp0.to(tl.float32) + tmp3 = tmp1 * tmp2 + tmp4 = (x3 % 2) + tmp5 = tl.full([1], 0, tl.int64) + tmp6 = tmp4 >= tmp5 + tmp7 = tl.full([1], 1, tl.int64) + tmp8 = tmp4 < tmp7 + tmp9 = tl.load(in_ptr0 + (1 + 2*(x0 // 2) + 128*x4), tmp8, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp10 = -tmp9 + tmp11 = tl.full(tmp10.shape, 0.0, tmp10.dtype) + tmp12 = tl.where(tmp8, tmp10, tmp11) + tmp13 = tmp4 >= tmp7 + tmp14 = tl.full([1], 2, tl.int64) + tmp15 = tmp4 < tmp14 + tmp16 = tl.load(in_ptr0 + (2*(x0 // 2) + 128*x4), tmp13, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp17 = tl.where(tmp8, tmp12, tmp16) + tmp18 = tmp17.to(tl.float32) + tmp20 = tmp18 * tmp19 + tmp21 = tmp3 + tmp20 + tmp22 = tmp21.to(tl.float32) + tmp24 = tmp23.to(tl.float32) + tmp25 = tmp24 * tmp2 + tmp26 = tl.load(in_ptr3 + (1 + 2*(x0 // 2) + 128*x4), tmp8, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp27 = -tmp26 + tmp28 = tl.full(tmp27.shape, 0.0, tmp27.dtype) + tmp29 = tl.where(tmp8, tmp27, tmp28) + tmp30 = tl.load(in_ptr3 + (2*(x0 // 2) + 128*x4), tmp13, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp31 = tl.where(tmp8, tmp29, tmp30) + tmp32 = tmp31.to(tl.float32) + tmp33 = tmp32 * tmp19 + tmp34 = tmp25 + tmp33 + tmp35 = tmp34.to(tl.float32) + tl.store(out_ptr0 + (x3), tmp22, None) + tl.store(out_ptr1 + (x3), tmp35, None) +''', device_str='cuda') + + +# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py +# Topologically Sorted Source Nodes: [encoder_value, value, value_1], Original ATen: [aten.view, aten.cat] +# Source node to ATen node mapping: +# encoder_value => view_5 +# value => view_2 +# value_1 => cat_2 +# Graph fragment: +# %arg7_1 : Tensor "bf16[1, 256, 4096][3145728, 12288, 1]cuda:0" = PlaceHolder[target=arg7_1] +# %arg2_1 : Tensor "bf16[1, 2048, 4096][25165824, 12288, 1]cuda:0" = PlaceHolder[target=arg2_1] +# %view_5 : Tensor "bf16[1, 256, 32, 128][3145728, 12288, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%arg7_1, [1, 256, 32, 128]), kwargs = {}) +# %view_2 : Tensor "bf16[1, 2048, 32, 128][25165824, 12288, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%arg2_1, [1, 2048, 32, 128]), kwargs = {}) +# %cat_2 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%view_5, %view_2], 1), kwargs = {}) +# return %cat_2 +triton_poi_fused_cat_view_4 = async_compile.triton('triton_poi_fused_cat_view_4', ''' +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 16777216}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_cat_view_4', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 2, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 75497472}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused_cat_view_4(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr): + xnumel = 9437184 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x1 = xindex // 4096 + x0 = (xindex % 4096) + x2 = xindex + tmp0 = x1 + tmp1 = tl.full([1], 0, tl.int64) + tmp2 = tmp0 >= tmp1 + tmp3 = tl.full([1], 256, tl.int64) + tmp4 = tmp0 < tmp3 + tmp5 = tl.load(in_ptr0 + (x0 + 12288*(x1)), tmp4, other=0.0).to(tl.float32) + tmp6 = tmp0 >= tmp3 + tmp7 = tl.full([1], 2304, tl.int64) + tmp8 = tmp0 < tmp7 + tmp9 = tl.load(in_ptr1 + (x0 + 12288*((-256) + x1)), tmp6, other=0.0).to(tl.float32) + tmp10 = tl.where(tmp4, tmp5, tmp9) + tl.store(out_ptr0 + (x2), tmp10, None) +''', device_str='cuda') + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1 = args + args.clear() + assert_size_stride(arg0_1, (1, 2048, 4096), (25165824, 12288, 1)) + assert_size_stride(arg1_1, (1, 2048, 4096), (25165824, 12288, 1)) + assert_size_stride(arg2_1, (1, 2048, 4096), (25165824, 12288, 1)) + assert_size_stride(arg3_1, (128, ), (1, )) + assert_size_stride(arg4_1, (128, ), (1, )) + assert_size_stride(arg5_1, (1, 256, 4096), (3145728, 12288, 1)) + assert_size_stride(arg6_1, (1, 256, 4096), (3145728, 12288, 1)) + assert_size_stride(arg7_1, (1, 256, 4096), (3145728, 12288, 1)) + assert_size_stride(arg8_1, (128, ), (1, )) + assert_size_stride(arg9_1, (128, ), (1, )) + assert_size_stride(arg10_1, (2304, 128), (128, 1)) + assert_size_stride(arg11_1, (2304, 128), (128, 1)) + with torch.cuda._DeviceGuard(0): + torch.cuda.set_device(0) + buf0 = empty_strided_cuda((1, 256, 32, 1), (8192, 32, 1, 8192), torch.float32) + # Topologically Sorted Source Nodes: [encoder_query, encoder_query_1], Original ATen: [aten.view, aten._fused_rms_norm] + stream0 = get_raw_stream(0) + triton_red_fused__fused_rms_norm_view_0.run(arg5_1, buf0, 8192, 128, stream=stream0) + buf1 = empty_strided_cuda((1, 2048, 32, 1), (65536, 32, 1, 65536), torch.float32) + # Topologically Sorted Source Nodes: [query, query_1], Original ATen: [aten.view, aten._fused_rms_norm] + stream0 = get_raw_stream(0) + triton_red_fused__fused_rms_norm_view_1.run(arg0_1, buf1, 65536, 128, stream=stream0) + buf2 = empty_strided_cuda((1, 2304, 32, 128), (9437184, 4096, 128, 1), torch.bfloat16) + # Topologically Sorted Source Nodes: [encoder_query, encoder_query_1, query, query_1, query_2], Original ATen: [aten.view, aten._fused_rms_norm, aten.cat] + stream0 = get_raw_stream(0) + triton_poi_fused__fused_rms_norm_cat_view_2.run(arg5_1, buf0, arg8_1, arg0_1, buf1, arg3_1, buf2, 73728, 128, stream=stream0) + del arg0_1 + del arg3_1 + del arg5_1 + del arg8_1 + buf3 = buf0; del buf0 # reuse + # Topologically Sorted Source Nodes: [encoder_key, encoder_key_1], Original ATen: [aten.view, aten._fused_rms_norm] + stream0 = get_raw_stream(0) + triton_red_fused__fused_rms_norm_view_0.run(arg6_1, buf3, 8192, 128, stream=stream0) + buf4 = buf1; del buf1 # reuse + # Topologically Sorted Source Nodes: [key, key_1], Original ATen: [aten.view, aten._fused_rms_norm] + stream0 = get_raw_stream(0) + triton_red_fused__fused_rms_norm_view_1.run(arg1_1, buf4, 65536, 128, stream=stream0) + buf5 = empty_strided_cuda((1, 2304, 32, 128), (9437184, 4096, 128, 1), torch.bfloat16) + # Topologically Sorted Source Nodes: [encoder_key, encoder_key_1, key, key_1, key_2], Original ATen: [aten.view, aten._fused_rms_norm, aten.cat] + stream0 = get_raw_stream(0) + triton_poi_fused__fused_rms_norm_cat_view_2.run(arg6_1, buf3, arg9_1, arg1_1, buf4, arg4_1, buf5, 73728, 128, stream=stream0) + del arg1_1 + del arg4_1 + del arg6_1 + del arg9_1 + del buf3 + del buf4 + buf6 = empty_strided_cuda((1, 2304, 32, 128), (9437184, 4096, 128, 1), torch.bfloat16) + buf7 = empty_strided_cuda((1, 2304, 32, 128), (9437184, 4096, 128, 1), torch.bfloat16) + # Topologically Sorted Source Nodes: [reshape, unbind, reshape_1, unbind_1, float_1, cos, mul, neg, stack, x_rotated, float_2, sin, mul_1, add, out, float_3, cos_2, mul_2, neg_1, stack_1, x_rotated_1, float_4, sin_2, mul_3, add_1, out_1], Original ATen: [aten.view, aten.unbind, aten._to_copy, aten.unsqueeze, aten.mul, aten.neg, aten.stack, aten.add] + stream0 = get_raw_stream(0) + triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.run(buf2, arg10_1, arg11_1, buf5, buf6, buf7, 9437184, stream=stream0) + del arg10_1 + del arg11_1 + del buf2 + buf8 = buf5; del buf5 # reuse + # Topologically Sorted Source Nodes: [encoder_value, value, value_1], Original ATen: [aten.view, aten.cat] + stream0 = get_raw_stream(0) + triton_poi_fused_cat_view_4.run(arg7_1, arg2_1, buf8, 9437184, stream=stream0) + del arg2_1 + del arg7_1 + return (buf6, buf7, buf8, ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = rand_strided((1, 2048, 4096), (25165824, 12288, 1), device='cuda:0', dtype=torch.bfloat16) + arg1_1 = rand_strided((1, 2048, 4096), (25165824, 12288, 1), device='cuda:0', dtype=torch.bfloat16) + arg2_1 = rand_strided((1, 2048, 4096), (25165824, 12288, 1), device='cuda:0', dtype=torch.bfloat16) + arg3_1 = rand_strided((128, ), (1, ), device='cuda:0', dtype=torch.bfloat16) + arg4_1 = rand_strided((128, ), (1, ), device='cuda:0', dtype=torch.bfloat16) + arg5_1 = rand_strided((1, 256, 4096), (3145728, 12288, 1), device='cuda:0', dtype=torch.bfloat16) + arg6_1 = rand_strided((1, 256, 4096), (3145728, 12288, 1), device='cuda:0', dtype=torch.bfloat16) + arg7_1 = rand_strided((1, 256, 4096), (3145728, 12288, 1), device='cuda:0', dtype=torch.bfloat16) + arg8_1 = rand_strided((128, ), (1, ), device='cuda:0', dtype=torch.bfloat16) + arg9_1 = rand_strided((128, ), (1, ), device='cuda:0', dtype=torch.bfloat16) + arg10_1 = rand_strided((2304, 128), (128, 1), device='cuda:0', dtype=torch.float32) + arg11_1 = rand_strided((2304, 128), (128, 1), device='cuda:0', dtype=torch.float32) + fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/torchinductor/qi/43ec9cea2ff86c63c13b7d3cc01865c162876223bb0311f7a189de47919626e8.best_config b/torchinductor/qi/43ec9cea2ff86c63c13b7d3cc01865c162876223bb0311f7a189de47919626e8.best_config new file mode 100644 index 0000000000000000000000000000000000000000..8ea64e593bba4bd1abf52810c7324083f8b86ade --- /dev/null +++ b/torchinductor/qi/43ec9cea2ff86c63c13b7d3cc01865c162876223bb0311f7a189de47919626e8.best_config @@ -0,0 +1 @@ +{"XBLOCK": 64, "R0_BLOCK": 64, "num_warps": 16, "num_stages": 1, "configs_hash": "6ffa43f2ca8cb1499f3ff3fbf8c975f2c07eef9b57fcecda113029ab12cbef66", "found_by_coordesc": false, "time_taken_ms": 116, "triton_cache_hash": "7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA"} \ No newline at end of file diff --git a/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py b/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py new file mode 100644 index 0000000000000000000000000000000000000000..fabc882f8d8b27e7850e810b8645af4c260c6187 --- /dev/null +++ b/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py @@ -0,0 +1,45 @@ + +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.reduction( + size_hints={'x': 65536, 'r0_': 128}, + reduction_hint=ReductionHint.DEFAULT, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused__fused_rms_norm_view_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 1, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 524288, 'r0_': 16777216}} +) +@triton.jit +def triton_red_fused__fused_rms_norm_view_1(in_ptr0, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): + xnumel = 65536 + r0_numel = 128 + rnumel = r0_numel + RBLOCK: tl.constexpr = R0_BLOCK + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None] + xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) + r0_base = tl.arange(0, R0_BLOCK)[None, :] + rbase = r0_base + x0 = (xindex % 32) + x1 = xindex // 32 + _tmp4 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) + x3 = xindex + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_2 = r0_index + tmp0 = tl.load(in_ptr0 + (r0_2 + 128*x0 + 12288*x1), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) + tmp1 = tmp0.to(tl.float32) + tmp2 = tmp1 * tmp1 + tmp3 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK]) + tmp5 = _tmp4 + tmp3 + _tmp4 = tl.where(r0_mask, tmp5, _tmp4) + tmp4 = tl.sum(_tmp4, 1)[:, None] + tl.store(out_ptr0 + (x3), tmp4, None) diff --git a/torchinductor/qo/cqo2iwjnshnag7tcsvqf4zqyfi65lsg3hg6ckh7lrd3p4rff3m4j.py b/torchinductor/qo/cqo2iwjnshnag7tcsvqf4zqyfi65lsg3hg6ckh7lrd3p4rff3m4j.py new file mode 100644 index 0000000000000000000000000000000000000000..adf40a9f9b663a683b7992b0140c2d72b433da97 --- /dev/null +++ b/torchinductor/qo/cqo2iwjnshnag7tcsvqf4zqyfi65lsg3hg6ckh7lrd3p4rff3m4j.py @@ -0,0 +1,129 @@ +# AOT ID: ['16_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels +import triton +import triton.language as tl +from torch._inductor.runtime.triton_heuristics import start_graph, end_graph +from torch._C import _cuda_getCurrentRawStream as get_raw_stream + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py +# Topologically Sorted Source Nodes: [chunk, silu, x], Original ATen: [aten.split, aten.silu, aten.mul] +# Source node to ATen node mapping: +# chunk => split +# silu => convert_element_type, convert_element_type_1, mul, sigmoid +# x => mul_1 +# Graph fragment: +# %arg0_1 : Tensor "bf16[1, 2048, 24576][50331648, 24576, 1]cuda:0" = PlaceHolder[target=arg0_1] +# %split : [num_users=2] = call_function[target=torch.ops.aten.split.Tensor](args = (%arg0_1, 12288, -1), kwargs = {}) +# %convert_element_type : Tensor "f32[1, 2048, 12288][25165824, 12288, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%getitem, torch.float32), kwargs = {}) +# %sigmoid : Tensor "f32[1, 2048, 12288][25165824, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type,), kwargs = {}) +# %mul : Tensor "f32[1, 2048, 12288][25165824, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type, %sigmoid), kwargs = {}) +# %convert_element_type_1 : Tensor "bf16[1, 2048, 12288][25165824, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul, torch.bfloat16), kwargs = {}) +# %mul_1 : Tensor "bf16[1, 2048, 12288][25165824, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_1, %getitem_1), kwargs = {}) +# return %mul_1 +triton_poi_fused_mul_silu_split_0 = async_compile.triton('triton_poi_fused_mul_silu_split_0', ''' +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 33554432}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_split_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 2, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 201326592}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused_mul_silu_split_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): + xnumel = 25165824 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x0 = (xindex % 12288) + x1 = xindex // 12288 + x2 = xindex + tmp0 = tl.load(in_ptr0 + (x0 + 24576*x1), None).to(tl.float32) + tmp5 = tl.load(in_ptr0 + (12288 + x0 + 24576*x1), None).to(tl.float32) + tmp1 = tmp0.to(tl.float32) + tmp2 = tl.sigmoid(tmp1) + tmp3 = tmp1 * tmp2 + tmp4 = tmp3.to(tl.float32) + tmp6 = tmp4 * tmp5 + tl.store(out_ptr0 + (x2), tmp6, None) +''', device_str='cuda') + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + arg0_1, = args + args.clear() + assert_size_stride(arg0_1, (1, 2048, 24576), (50331648, 24576, 1)) + with torch.cuda._DeviceGuard(0): + torch.cuda.set_device(0) + buf0 = empty_strided_cuda((1, 2048, 12288), (25165824, 12288, 1), torch.bfloat16) + # Topologically Sorted Source Nodes: [chunk, silu, x], Original ATen: [aten.split, aten.silu, aten.mul] + stream0 = get_raw_stream(0) + triton_poi_fused_mul_silu_split_0.run(arg0_1, buf0, 25165824, stream=stream0) + del arg0_1 + return (buf0, ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = rand_strided((1, 2048, 24576), (50331648, 24576, 1), device='cuda:0', dtype=torch.bfloat16) + fn = lambda: call([arg0_1]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/torchinductor/r4/6028a425a75f0721e0fdad1d9db266ea37b002463e12788ce8a68ad2891bcf49.best_config b/torchinductor/r4/6028a425a75f0721e0fdad1d9db266ea37b002463e12788ce8a68ad2891bcf49.best_config new file mode 100644 index 0000000000000000000000000000000000000000..34767cd30c8472ccb4470e1cbfe06f14f75fa580 --- /dev/null +++ b/torchinductor/r4/6028a425a75f0721e0fdad1d9db266ea37b002463e12788ce8a68ad2891bcf49.best_config @@ -0,0 +1 @@ +{"XBLOCK": 512, "num_warps": 8, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 43, "triton_cache_hash": "X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ"} \ No newline at end of file diff --git a/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py b/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py new file mode 100644 index 0000000000000000000000000000000000000000..4ea2f5a5befa39b1ed1cc85d2a4d3ff6a5ef5dfc --- /dev/null +++ b/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py @@ -0,0 +1,28 @@ + +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 16777216}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_clone_permute_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 56623104}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused_clone_permute_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): + xnumel = 9437184 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x0 = (xindex % 128) + x1 = ((xindex // 128) % 32) + x2 = xindex // 4096 + x3 = xindex + tmp0 = tl.load(in_ptr0 + (x0 + 128*x2 + 294912*x1), None).to(tl.float32) + tl.store(out_ptr0 + (x3), tmp0, None) diff --git a/torchinductor/s7/cs7ssipzow4rpjc35xc3dleet56e5bkdlxrbyh5qbhp6vzfhfqpg.py b/torchinductor/s7/cs7ssipzow4rpjc35xc3dleet56e5bkdlxrbyh5qbhp6vzfhfqpg.py new file mode 100644 index 0000000000000000000000000000000000000000..9b290100d32b9fe6dda9b7350dbaefcf4b163001 --- /dev/null +++ b/torchinductor/s7/cs7ssipzow4rpjc35xc3dleet56e5bkdlxrbyh5qbhp6vzfhfqpg.py @@ -0,0 +1,69 @@ +# AOT ID: ['19_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + arg0_1, arg1_1 = args + args.clear() + assert_size_stride(arg0_1, (12288, 4096), (1, 12288)) + assert_size_stride(arg1_1, (1, 1), (1, 1)) + return (aten.view.dtype(reinterpret_tensor(arg0_1, (4096, 12288), (12288, 1), 0), torch.uint8), reinterpret_tensor(arg1_1, (1, ), (1, ), 0), ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = rand_strided((12288, 4096), (1, 12288), device='cuda:0', dtype=torch.float8_e4m3fn) + arg1_1 = rand_strided((1, 1), (1, 1), device='cuda:0', dtype=torch.float32) + fn = lambda: call([arg0_1, arg1_1]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py b/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py new file mode 100644 index 0000000000000000000000000000000000000000..701e65bf90b13cc0feabb549526bd21e1c720221 --- /dev/null +++ b/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py @@ -0,0 +1,32 @@ + +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 4194304}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_split_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 2, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused_mul_silu_split_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x0 = (xindex % 12288) + x1 = xindex // 12288 + x2 = xindex + tmp0 = tl.load(in_ptr0 + (x0 + 24576*x1), None).to(tl.float32) + tmp5 = tl.load(in_ptr0 + (12288 + x0 + 24576*x1), None).to(tl.float32) + tmp1 = tmp0.to(tl.float32) + tmp2 = tl.sigmoid(tmp1) + tmp3 = tmp1 * tmp2 + tmp4 = tmp3.to(tl.float32) + tmp6 = tmp4 * tmp5 + tl.store(out_ptr0 + (x2), tmp6, None) diff --git a/torchinductor/sy/fab59b692a7cc2d74a6b7044a7f84ec2ff8e1f7b3f0b76089865ebc72c0cdb5e.best_config b/torchinductor/sy/fab59b692a7cc2d74a6b7044a7f84ec2ff8e1f7b3f0b76089865ebc72c0cdb5e.best_config new file mode 100644 index 0000000000000000000000000000000000000000..95a3dc12e5ccc24885a84327e17db66fc3f7e791 --- /dev/null +++ b/torchinductor/sy/fab59b692a7cc2d74a6b7044a7f84ec2ff8e1f7b3f0b76089865ebc72c0cdb5e.best_config @@ -0,0 +1 @@ +{"XBLOCK": 512, "num_warps": 8, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 38, "triton_cache_hash": "THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA"} \ No newline at end of file diff --git a/torchinductor/tu/ctulszk4mmiqip66rpuyd3jhbqxphwudytcgr5dwumqowwaq7mtb.py b/torchinductor/tu/ctulszk4mmiqip66rpuyd3jhbqxphwudytcgr5dwumqowwaq7mtb.py new file mode 100644 index 0000000000000000000000000000000000000000..626bb27ff8f752f0961d5ea2053d46128fff9c31 --- /dev/null +++ b/torchinductor/tu/ctulszk4mmiqip66rpuyd3jhbqxphwudytcgr5dwumqowwaq7mtb.py @@ -0,0 +1,69 @@ +# AOT ID: ['28_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + arg0_1, arg1_1 = args + args.clear() + assert_size_stride(arg0_1, (16384, 4096), (1, 16384)) + assert_size_stride(arg1_1, (1, 1), (1, 1)) + return (aten.view.dtype(reinterpret_tensor(arg0_1, (4096, 16384), (16384, 1), 0), torch.uint8), reinterpret_tensor(arg1_1, (1, ), (1, ), 0), ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = rand_strided((16384, 4096), (1, 16384), device='cuda:0', dtype=torch.float8_e4m3fn) + arg1_1 = rand_strided((1, 1), (1, 1), device='cuda:0', dtype=torch.float32) + fn = lambda: call([arg0_1, arg1_1]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/torchinductor/uv/cuviukt3lq3axnbe5hpjzwmyxahv35wielkqnmdck4fg7m2bd6kz.py b/torchinductor/uv/cuviukt3lq3axnbe5hpjzwmyxahv35wielkqnmdck4fg7m2bd6kz.py new file mode 100644 index 0000000000000000000000000000000000000000..77b9afb61558a36ce68d395d1c84d0f67789e1a6 --- /dev/null +++ b/torchinductor/uv/cuviukt3lq3axnbe5hpjzwmyxahv35wielkqnmdck4fg7m2bd6kz.py @@ -0,0 +1,69 @@ +# AOT ID: ['15_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + arg0_1, arg1_1 = args + args.clear() + assert_size_stride(arg0_1, (4096, 24576), (1, 4096)) + assert_size_stride(arg1_1, (1, 1), (1, 1)) + return (aten.view.dtype(reinterpret_tensor(arg0_1, (24576, 4096), (4096, 1), 0), torch.uint8), reinterpret_tensor(arg1_1, (1, ), (1, ), 0), ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = rand_strided((4096, 24576), (1, 4096), device='cuda:0', dtype=torch.float8_e4m3fn) + arg1_1 = rand_strided((1, 1), (1, 1), device='cuda:0', dtype=torch.float32) + fn = lambda: call([arg0_1, arg1_1]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/torchinductor/vv/cvvhb6mmoaw3lk4z2krosoew2gyomjg7q2lg2omxzpz5kkx6rf2i.py b/torchinductor/vv/cvvhb6mmoaw3lk4z2krosoew2gyomjg7q2lg2omxzpz5kkx6rf2i.py new file mode 100644 index 0000000000000000000000000000000000000000..be5ddf0fc35056e2564d7fc26a9388cf18f2873f --- /dev/null +++ b/torchinductor/vv/cvvhb6mmoaw3lk4z2krosoew2gyomjg7q2lg2omxzpz5kkx6rf2i.py @@ -0,0 +1,129 @@ +# AOT ID: ['22_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels +import triton +import triton.language as tl +from torch._inductor.runtime.triton_heuristics import start_graph, end_graph +from torch._C import _cuda_getCurrentRawStream as get_raw_stream + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py +# Topologically Sorted Source Nodes: [mul, encoder_hidden_states], Original ATen: [aten.mul, aten.add] +# Source node to ATen node mapping: +# encoder_hidden_states => add +# mul => mul +# Graph fragment: +# %arg2_1 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0" = PlaceHolder[target=arg2_1] +# %arg0_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg0_1] +# %arg1_1 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0" = PlaceHolder[target=arg1_1] +# %mul : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%arg0_1, %arg1_1), kwargs = {}) +# %add : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg2_1, %mul), kwargs = {}) +# return %add +triton_poi_fused_add_mul_0 = async_compile.triton('triton_poi_fused_add_mul_0', ''' +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 1048576}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_mul_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 3, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 8396800}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused_add_mul_0(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr): + xnumel = 1048576 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x2 = xindex + x0 = (xindex % 4096) + tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) + tmp1 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last').to(tl.float32) + tmp2 = tl.load(in_ptr2 + (x2), None).to(tl.float32) + tmp3 = tmp1 * tmp2 + tmp4 = tmp0 + tmp3 + tl.store(out_ptr0 + (x2), tmp4, None) +''', device_str='cuda') + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + arg0_1, arg1_1, arg2_1 = args + args.clear() + assert_size_stride(arg0_1, (1, 1, 4096), (24576, 24576, 1)) + assert_size_stride(arg1_1, (1, 256, 4096), (1048576, 4096, 1)) + assert_size_stride(arg2_1, (1, 256, 4096), (1048576, 4096, 1)) + with torch.cuda._DeviceGuard(0): + torch.cuda.set_device(0) + buf0 = empty_strided_cuda((1, 256, 4096), (1048576, 4096, 1), torch.bfloat16) + # Topologically Sorted Source Nodes: [mul, encoder_hidden_states], Original ATen: [aten.mul, aten.add] + stream0 = get_raw_stream(0) + triton_poi_fused_add_mul_0.run(arg2_1, arg0_1, arg1_1, buf0, 1048576, stream=stream0) + del arg0_1 + del arg1_1 + del arg2_1 + return (buf0, ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16) + arg1_1 = rand_strided((1, 256, 4096), (1048576, 4096, 1), device='cuda:0', dtype=torch.bfloat16) + arg2_1 = rand_strided((1, 256, 4096), (1048576, 4096, 1), device='cuda:0', dtype=torch.bfloat16) + fn = lambda: call([arg0_1, arg1_1, arg2_1]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/torchinductor/vw/cvw7uqdyk4vwvsddaezbct3gzsp7n42ycsripavamngotuxavr4d.py b/torchinductor/vw/cvw7uqdyk4vwvsddaezbct3gzsp7n42ycsripavamngotuxavr4d.py new file mode 100644 index 0000000000000000000000000000000000000000..1619b1c5ffa127cf3d0a084667d70cf0782f5e64 --- /dev/null +++ b/torchinductor/vw/cvw7uqdyk4vwvsddaezbct3gzsp7n42ycsripavamngotuxavr4d.py @@ -0,0 +1,197 @@ +# AOT ID: ['9_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels +import triton +import triton.language as tl +from torch._inductor.runtime.triton_heuristics import start_graph, end_graph +from torch._C import _cuda_getCurrentRawStream as get_raw_stream + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py +# Topologically Sorted Source Nodes: [permute, q, permute_1, k, permute_2, v, output], Original ATen: [aten.permute, aten.clone, aten._scaled_dot_product_cudnn_attention] +# Source node to ATen node mapping: +# k => clone_1 +# output => _scaled_dot_product_cudnn_attention +# permute => permute +# permute_1 => permute_1 +# permute_2 => permute_2 +# q => clone +# v => clone_2 +# Graph fragment: +# %arg0_1 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0" = PlaceHolder[target=arg0_1] +# %permute : Tensor "bf16[1, 32, 2304, 128][9437184, 128, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%arg0_1, [0, 2, 1, 3]), kwargs = {}) +# %clone : Tensor "bf16[1, 32, 2304, 128][9437184, 294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%permute,), kwargs = {memory_format: torch.contiguous_format}) +# %permute_1 : Tensor "bf16[1, 32, 2304, 128][9437184, 128, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%arg1_1, [0, 2, 1, 3]), kwargs = {}) +# %clone_1 : Tensor "bf16[1, 32, 2304, 128][9437184, 294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%permute_1,), kwargs = {memory_format: torch.contiguous_format}) +# %permute_2 : Tensor "bf16[1, 32, 2304, 128][9437184, 128, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%arg2_1, [0, 2, 1, 3]), kwargs = {}) +# %clone_2 : Tensor "bf16[1, 32, 2304, 128][9437184, 294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%permute_2,), kwargs = {memory_format: torch.contiguous_format}) +# %_scaled_dot_product_cudnn_attention : [num_users=1] = call_function[target=torch.ops.aten._scaled_dot_product_cudnn_attention.default](args = (%clone, %clone_1, %clone_2, None, False), kwargs = {}) +# return %buf0 +triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0 = async_compile.triton('triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0', ''' +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 16777216}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 56623104}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): + xnumel = 9437184 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x0 = (xindex % 128) + x1 = ((xindex // 128) % 2304) + x2 = xindex // 294912 + x3 = xindex + tmp0 = tl.load(in_ptr0 + (x0 + 128*x2 + 4096*x1), None).to(tl.float32) + tl.store(out_ptr0 + (x3), tmp0, None) +''', device_str='cuda') + + +# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py +# Topologically Sorted Source Nodes: [permute_3, out], Original ATen: [aten.permute, aten.clone] +# Source node to ATen node mapping: +# out => clone_3 +# permute_3 => permute_3 +# Graph fragment: +# %getitem : Tensor "bf16[1, 32, 2304, 128][9437184, 294912, 128, 1]cuda:0" = PlaceHolder[target=getitem] +# %permute_3 : Tensor "bf16[1, 2304, 32, 128][9437184, 128, 294912, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%getitem, [0, 2, 1, 3]), kwargs = {}) +# %clone_3 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%permute_3,), kwargs = {memory_format: torch.contiguous_format}) +# return %clone_3 +triton_poi_fused_clone_permute_1 = async_compile.triton('triton_poi_fused_clone_permute_1', ''' +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 16777216}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_clone_permute_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 56623104}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused_clone_permute_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): + xnumel = 9437184 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x0 = (xindex % 128) + x1 = ((xindex // 128) % 32) + x2 = xindex // 4096 + x3 = xindex + tmp0 = tl.load(in_ptr0 + (x0 + 128*x2 + 294912*x1), None).to(tl.float32) + tl.store(out_ptr0 + (x3), tmp0, None) +''', device_str='cuda') + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + arg0_1, arg1_1, arg2_1 = args + args.clear() + assert_size_stride(arg0_1, (1, 2304, 32, 128), (9437184, 4096, 128, 1)) + assert_size_stride(arg1_1, (1, 2304, 32, 128), (9437184, 4096, 128, 1)) + assert_size_stride(arg2_1, (1, 2304, 32, 128), (9437184, 4096, 128, 1)) + with torch.cuda._DeviceGuard(0): + torch.cuda.set_device(0) + buf0 = empty_strided_cuda((1, 32, 2304, 128), (9437184, 294912, 128, 1), torch.bfloat16) + # Topologically Sorted Source Nodes: [permute, q, permute_1, k, permute_2, v, output], Original ATen: [aten.permute, aten.clone, aten._scaled_dot_product_cudnn_attention] + stream0 = get_raw_stream(0) + triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.run(arg0_1, buf0, 9437184, stream=stream0) + del arg0_1 + buf1 = empty_strided_cuda((1, 32, 2304, 128), (9437184, 294912, 128, 1), torch.bfloat16) + # Topologically Sorted Source Nodes: [permute, q, permute_1, k, permute_2, v, output], Original ATen: [aten.permute, aten.clone, aten._scaled_dot_product_cudnn_attention] + stream0 = get_raw_stream(0) + triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.run(arg1_1, buf1, 9437184, stream=stream0) + del arg1_1 + buf2 = empty_strided_cuda((1, 32, 2304, 128), (9437184, 294912, 128, 1), torch.bfloat16) + # Topologically Sorted Source Nodes: [permute, q, permute_1, k, permute_2, v, output], Original ATen: [aten.permute, aten.clone, aten._scaled_dot_product_cudnn_attention] + stream0 = get_raw_stream(0) + triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.run(arg2_1, buf2, 9437184, stream=stream0) + del arg2_1 + # Topologically Sorted Source Nodes: [permute, q, permute_1, k, permute_2, v, output], Original ATen: [aten.permute, aten.clone, aten._scaled_dot_product_cudnn_attention] + buf3 = torch.ops.aten._scaled_dot_product_cudnn_attention.default(buf0, buf1, buf2, None, False) + del buf0 + del buf1 + buf4 = buf3[0] + assert_size_stride(buf4, (1, 32, 2304, 128), (9437184, 294912, 128, 1), 'torch.ops.aten._scaled_dot_product_cudnn_attention.default') + assert_alignment(buf4, 16, 'torch.ops.aten._scaled_dot_product_cudnn_attention.default') + del buf3 + buf8 = reinterpret_tensor(buf2, (1, 2304, 32, 128), (9437184, 4096, 128, 1), 0); del buf2 # reuse + # Topologically Sorted Source Nodes: [permute_3, out], Original ATen: [aten.permute, aten.clone] + stream0 = get_raw_stream(0) + triton_poi_fused_clone_permute_1.run(buf4, buf8, 9437184, stream=stream0) + del buf4 + return (buf8, ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = rand_strided((1, 2304, 32, 128), (9437184, 4096, 128, 1), device='cuda:0', dtype=torch.bfloat16) + arg1_1 = rand_strided((1, 2304, 32, 128), (9437184, 4096, 128, 1), device='cuda:0', dtype=torch.bfloat16) + arg2_1 = rand_strided((1, 2304, 32, 128), (9437184, 4096, 128, 1), device='cuda:0', dtype=torch.bfloat16) + fn = lambda: call([arg0_1, arg1_1, arg2_1]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py b/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py new file mode 100644 index 0000000000000000000000000000000000000000..d31889c745af06fcb68dc4eb9b5dfbef10d3c301 --- /dev/null +++ b/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py @@ -0,0 +1,78 @@ + +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.reduction( + size_hints={'x': 2048, 'r0_': 4096}, + reduction_hint=ReductionHint.INNER, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*bf16', 'out_ptr0': '*bf16', 'out_ptr3': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_add_mul_native_layer_norm_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 6, 'num_store': 2, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'add_persistent_rblock': True, 'tiling_scores': {'x': 0, 'r0_': 100687872}} +) +@triton.jit +def triton_red_fused_add_mul_native_layer_norm_0(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr3, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): + xnumel = 2048 + r0_numel = 4096 + rnumel = r0_numel + RBLOCK: tl.constexpr = R0_BLOCK + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None] + xmask = xindex < xnumel + r0_base = tl.arange(0, R0_BLOCK)[None, :] + rbase = r0_base + x0 = xindex + tmp7_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + tmp7_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + tmp7_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_1 = r0_index + tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32) + tmp1 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp2 = tl.load(in_ptr2 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32) + tmp3 = tmp1 * tmp2 + tmp4 = tmp0 + tmp3 + tmp5 = tmp4.to(tl.float32) + tmp6 = tl.broadcast_to(tmp5, [XBLOCK, R0_BLOCK]) + tmp7_mean_next, tmp7_m2_next, tmp7_weight_next = triton_helpers.welford_reduce( + tmp6, tmp7_mean, tmp7_m2, tmp7_weight, roffset == 0 + ) + tmp7_mean = tl.where(r0_mask & xmask, tmp7_mean_next, tmp7_mean) + tmp7_m2 = tl.where(r0_mask & xmask, tmp7_m2_next, tmp7_m2) + tmp7_weight = tl.where(r0_mask & xmask, tmp7_weight_next, tmp7_weight) + tl.store(out_ptr0 + (r0_1 + 4096*x0), tmp4, r0_mask & xmask) + tmp8, tmp9, tmp10 = triton_helpers.welford(tmp7_mean, tmp7_m2, tmp7_weight, 1) + tmp7 = tmp8[:, None] + tmp11 = tmp9[:, None] + tmp12 = tmp10[:, None] + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_1 = r0_index + tmp13 = tl.load(out_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32) + tmp23 = tl.load(in_ptr3 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp27 = tl.load(in_ptr4 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp14 = tmp13.to(tl.float32) + tmp15 = tmp14 - tmp7 + tmp16 = 4096.0 + tmp17 = (tmp11 / tmp16) + tmp18 = 1e-06 + tmp19 = tmp17 + tmp18 + tmp20 = libdevice.rsqrt(tmp19) + tmp21 = tmp15 * tmp20 + tmp22 = tmp21.to(tl.float32) + tmp24 = 1.0 + tmp25 = tmp23 + tmp24 + tmp26 = tmp22 * tmp25 + tmp28 = tmp26 + tmp27 + tl.store(out_ptr3 + (r0_1 + 4096*x0), tmp28, r0_mask & xmask) diff --git a/torchinductor/w3/f47f62f438f942996b5cf11eb19c6e256d1ed91f8e9d6804e5f718dcdfe8080f.best_config b/torchinductor/w3/f47f62f438f942996b5cf11eb19c6e256d1ed91f8e9d6804e5f718dcdfe8080f.best_config new file mode 100644 index 0000000000000000000000000000000000000000..d23168327bf310e3204332f9c95ed66190063ddc --- /dev/null +++ b/torchinductor/w3/f47f62f438f942996b5cf11eb19c6e256d1ed91f8e9d6804e5f718dcdfe8080f.best_config @@ -0,0 +1 @@ +{"XBLOCK": 1, "R0_BLOCK": 2048, "num_warps": 16, "num_stages": 1, "configs_hash": "ba27f374f6982634f1ab959ad1e63f726920cfc2c7c821f8e68ec55c3d4d94fc", "found_by_coordesc": false, "time_taken_ms": 49, "triton_cache_hash": "DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA"} \ No newline at end of file diff --git a/torchinductor/wc/cwcymlhbzenwwbbudgealzjyh42evsv2zpmylwljovxkosebwbpk.py b/torchinductor/wc/cwcymlhbzenwwbbudgealzjyh42evsv2zpmylwljovxkosebwbpk.py new file mode 100644 index 0000000000000000000000000000000000000000..073acdaa9018085bb28c673c0e387da3a5442912 --- /dev/null +++ b/torchinductor/wc/cwcymlhbzenwwbbudgealzjyh42evsv2zpmylwljovxkosebwbpk.py @@ -0,0 +1,67 @@ +# AOT ID: ['6_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + arg0_1, = args + args.clear() + s52 = arg0_1 + return (4096*s52, ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = 256 + fn = lambda: call([arg0_1]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/torchinductor/wv/46eff4b043eb1561ef5f4e5311f476ba8ecf12b7c030a45f10b615fff48ee754.best_config b/torchinductor/wv/46eff4b043eb1561ef5f4e5311f476ba8ecf12b7c030a45f10b615fff48ee754.best_config new file mode 100644 index 0000000000000000000000000000000000000000..0b77f7cd0905e2efaa3d56bcc591ab9eb952ae8c --- /dev/null +++ b/torchinductor/wv/46eff4b043eb1561ef5f4e5311f476ba8ecf12b7c030a45f10b615fff48ee754.best_config @@ -0,0 +1 @@ +{"XBLOCK": 4, "R0_BLOCK": 128, "num_warps": 4, "num_stages": 1, "configs_hash": "f874ed6abc48e5e95ac45a4a098cc27fc009d3b1219d27438179d79ebfae2c22", "found_by_coordesc": false, "time_taken_ms": 103, "triton_cache_hash": "3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ"} \ No newline at end of file diff --git a/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py b/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py new file mode 100644 index 0000000000000000000000000000000000000000..6b4c48a33613d7561a4418911bd0b9db96d61f81 --- /dev/null +++ b/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py @@ -0,0 +1,45 @@ + +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.reduction( + size_hints={'x': 8192, 'r0_': 128}, + reduction_hint=ReductionHint.DEFAULT, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused__fused_rms_norm_view_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 1, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 65536, 'r0_': 2097152}} +) +@triton.jit +def triton_red_fused__fused_rms_norm_view_0(in_ptr0, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): + xnumel = 8192 + r0_numel = 128 + rnumel = r0_numel + RBLOCK: tl.constexpr = R0_BLOCK + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None] + xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) + r0_base = tl.arange(0, R0_BLOCK)[None, :] + rbase = r0_base + x0 = (xindex % 32) + x1 = xindex // 32 + _tmp4 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) + x3 = xindex + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_2 = r0_index + tmp0 = tl.load(in_ptr0 + (r0_2 + 128*x0 + 12288*x1), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) + tmp1 = tmp0.to(tl.float32) + tmp2 = tmp1 * tmp1 + tmp3 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK]) + tmp5 = _tmp4 + tmp3 + _tmp4 = tl.where(r0_mask, tmp5, _tmp4) + tmp4 = tl.sum(_tmp4, 1)[:, None] + tl.store(out_ptr0 + (x3), tmp4, None) diff --git a/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py b/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py new file mode 100644 index 0000000000000000000000000000000000000000..dc19a9bfc2303aabb090f42a09638fd8bb0cd17b --- /dev/null +++ b/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py @@ -0,0 +1,73 @@ + +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.reduction( + size_hints={'x': 2048, 'r0_': 4096}, + reduction_hint=ReductionHint.INNER, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr2': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_add_mul_native_layer_norm_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 4, 'num_store': 1, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'add_persistent_rblock': True, 'tiling_scores': {'x': 0, 'r0_': 50348032}} +) +@triton.jit +def triton_red_fused_add_mul_native_layer_norm_0(in_ptr0, in_ptr1, in_ptr2, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): + xnumel = 2048 + r0_numel = 4096 + rnumel = r0_numel + RBLOCK: tl.constexpr = R0_BLOCK + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None] + xmask = xindex < xnumel + r0_base = tl.arange(0, R0_BLOCK)[None, :] + rbase = r0_base + x0 = xindex + tmp3_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + tmp3_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + tmp3_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_1 = r0_index + tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp1 = tmp0.to(tl.float32) + tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK]) + tmp3_mean_next, tmp3_m2_next, tmp3_weight_next = triton_helpers.welford_reduce( + tmp2, tmp3_mean, tmp3_m2, tmp3_weight, roffset == 0 + ) + tmp3_mean = tl.where(r0_mask & xmask, tmp3_mean_next, tmp3_mean) + tmp3_m2 = tl.where(r0_mask & xmask, tmp3_m2_next, tmp3_m2) + tmp3_weight = tl.where(r0_mask & xmask, tmp3_weight_next, tmp3_weight) + tmp4, tmp5, tmp6 = triton_helpers.welford(tmp3_mean, tmp3_m2, tmp3_weight, 1) + tmp3 = tmp4[:, None] + tmp7 = tmp5[:, None] + tmp8 = tmp6[:, None] + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_1 = r0_index + tmp9 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp12 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32) + tmp23 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp10 = 1.0 + tmp11 = tmp9 + tmp10 + tmp13 = tmp12.to(tl.float32) + tmp14 = tmp13 - tmp3 + tmp15 = 4096.0 + tmp16 = (tmp7 / tmp15) + tmp17 = 1e-06 + tmp18 = tmp16 + tmp17 + tmp19 = libdevice.rsqrt(tmp18) + tmp20 = tmp14 * tmp19 + tmp21 = tmp20.to(tl.float32) + tmp22 = tmp11 * tmp21 + tmp24 = tmp22 + tmp23 + tl.store(out_ptr2 + (r0_1 + 4096*x0), tmp24, r0_mask & xmask) diff --git a/torchinductor/ww/dd2f7cfc81aa7ae5bbbff692e51eaea946345cf52243d58440af807766535cf3.best_config b/torchinductor/ww/dd2f7cfc81aa7ae5bbbff692e51eaea946345cf52243d58440af807766535cf3.best_config new file mode 100644 index 0000000000000000000000000000000000000000..dc8de79fb45152deb37fa0f01377313bf011c741 --- /dev/null +++ b/torchinductor/ww/dd2f7cfc81aa7ae5bbbff692e51eaea946345cf52243d58440af807766535cf3.best_config @@ -0,0 +1 @@ +{"XBLOCK": 1, "R0_BLOCK": 4096, "num_warps": 16, "num_stages": 1, "configs_hash": "ba27f374f6982634f1ab959ad1e63f726920cfc2c7c821f8e68ec55c3d4d94fc", "found_by_coordesc": false, "time_taken_ms": 39, "triton_cache_hash": "MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ"} \ No newline at end of file diff --git a/torchinductor/xj/7c8e81e42663cba3c73a14ca8935673dbe9d521cd0a91444125a99d262815f3f.best_config b/torchinductor/xj/7c8e81e42663cba3c73a14ca8935673dbe9d521cd0a91444125a99d262815f3f.best_config new file mode 100644 index 0000000000000000000000000000000000000000..3da299395a805c9e6c60b214074c2f77852ff8ab --- /dev/null +++ b/torchinductor/xj/7c8e81e42663cba3c73a14ca8935673dbe9d521cd0a91444125a99d262815f3f.best_config @@ -0,0 +1 @@ +{"XBLOCK": 1024, "num_warps": 4, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 46, "triton_cache_hash": "WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA"} \ No newline at end of file diff --git a/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py b/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py new file mode 100644 index 0000000000000000000000000000000000000000..7db8216b2e1229c46cc2844a9596acaa4bcd5f93 --- /dev/null +++ b/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py @@ -0,0 +1,30 @@ + +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 16777216}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_mul_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 3, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 75505664}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused_add_mul_0(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr): + xnumel = 9437184 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x2 = xindex + x0 = (xindex % 4096) + tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) + tmp1 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last').to(tl.float32) + tmp2 = tl.load(in_ptr2 + (x2), None).to(tl.float32) + tmp3 = tmp1 * tmp2 + tmp4 = tmp0 + tmp3 + tl.store(out_ptr0 + (x2), tmp4, None) diff --git a/torchinductor/xo/cxo4pcd7p734icbgoa3zx6bvctofr7g2634t2fqhg22bhtg6eddc.py b/torchinductor/xo/cxo4pcd7p734icbgoa3zx6bvctofr7g2634t2fqhg22bhtg6eddc.py new file mode 100644 index 0000000000000000000000000000000000000000..be536e88a02ee28a06ab69ddbe1e458336720730 --- /dev/null +++ b/torchinductor/xo/cxo4pcd7p734icbgoa3zx6bvctofr7g2634t2fqhg22bhtg6eddc.py @@ -0,0 +1,261 @@ +# AOT ID: ['20_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels +import triton +import triton.language as tl +from torch._inductor.runtime.triton_heuristics import start_graph, end_graph +from torch._C import _cuda_getCurrentRawStream as get_raw_stream + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py +# Topologically Sorted Source Nodes: [context_attn_output, encoder_hidden_states, norm_encoder_hidden_states, add_2, mul_2, norm_encoder_hidden_states_1], Original ATen: [aten.mul, aten.add, aten.native_layer_norm] +# Source node to ATen node mapping: +# add_2 => add_3 +# context_attn_output => mul_1 +# encoder_hidden_states => add_1 +# mul_2 => mul_3 +# norm_encoder_hidden_states => add_2, convert_element_type, convert_element_type_1, mul_2, rsqrt, sub, var_mean +# norm_encoder_hidden_states_1 => add_4 +# Graph fragment: +# %arg5_1 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0" = PlaceHolder[target=arg5_1] +# %arg3_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg3_1] +# %arg4_1 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0" = PlaceHolder[target=arg4_1] +# %add_1 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0" = PlaceHolder[target=add_1] +# %getitem_1 : Tensor "f32[1, 256, 1][256, 1, 256]cuda:0" = PlaceHolder[target=getitem_1] +# %buf2 : Tensor "f32[1, 256, 1][256, 1, 256]cuda:0" = PlaceHolder[target=buf2] +# %arg6_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg6_1] +# %arg7_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg7_1] +# %mul_1 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%arg3_1, %arg4_1), kwargs = {}) +# %add_1 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg5_1, %mul_1), kwargs = {}) +# %convert_element_type : Tensor "f32[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%add_1, torch.float32), kwargs = {}) +# %var_mean : [num_users=2] = call_function[target=torch.ops.aten.var_mean.correction](args = (%convert_element_type, [2]), kwargs = {correction: 0, keepdim: True}) +# %sub : Tensor "f32[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%convert_element_type, %getitem_1), kwargs = {}) +# %add_2 : Tensor "f32[1, 256, 1][256, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%getitem, 1e-06), kwargs = {}) +# %rsqrt : Tensor "f32[1, 256, 1][256, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add_2,), kwargs = {}) +# %mul_2 : Tensor "f32[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub, %rsqrt), kwargs = {}) +# %convert_element_type_1 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_2, torch.bfloat16), kwargs = {}) +# %add_3 : Tensor "bf16[1, 1, 4096][4096, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg6_1, 1), kwargs = {}) +# %mul_3 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_1, %add_3), kwargs = {}) +# %add_4 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_3, %arg7_1), kwargs = {}) +# return %add_1,%getitem_1,%buf2,%add_4 +triton_red_fused_add_mul_native_layer_norm_0 = async_compile.triton('triton_red_fused_add_mul_native_layer_norm_0', ''' +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.reduction( + size_hints={'x': 256, 'r0_': 4096}, + reduction_hint=ReductionHint.INNER, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*bf16', 'out_ptr0': '*bf16', 'out_ptr3': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_add_mul_native_layer_norm_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 6, 'num_store': 2, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'add_persistent_rblock': True, 'tiling_scores': {'x': 0, 'r0_': 12607488}} +) +@triton.jit +def triton_red_fused_add_mul_native_layer_norm_0(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr3, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): + xnumel = 256 + r0_numel = 4096 + rnumel = r0_numel + RBLOCK: tl.constexpr = R0_BLOCK + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None] + xmask = xindex < xnumel + r0_base = tl.arange(0, R0_BLOCK)[None, :] + rbase = r0_base + x0 = xindex + tmp7_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + tmp7_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + tmp7_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32) + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_1 = r0_index + tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32) + tmp1 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp2 = tl.load(in_ptr2 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32) + tmp3 = tmp1 * tmp2 + tmp4 = tmp0 + tmp3 + tmp5 = tmp4.to(tl.float32) + tmp6 = tl.broadcast_to(tmp5, [XBLOCK, R0_BLOCK]) + tmp7_mean_next, tmp7_m2_next, tmp7_weight_next = triton_helpers.welford_reduce( + tmp6, tmp7_mean, tmp7_m2, tmp7_weight, roffset == 0 + ) + tmp7_mean = tl.where(r0_mask & xmask, tmp7_mean_next, tmp7_mean) + tmp7_m2 = tl.where(r0_mask & xmask, tmp7_m2_next, tmp7_m2) + tmp7_weight = tl.where(r0_mask & xmask, tmp7_weight_next, tmp7_weight) + tl.store(out_ptr0 + (r0_1 + 4096*x0), tmp4, r0_mask & xmask) + tmp8, tmp9, tmp10 = triton_helpers.welford(tmp7_mean, tmp7_m2, tmp7_weight, 1) + tmp7 = tmp8[:, None] + tmp11 = tmp9[:, None] + tmp12 = tmp10[:, None] + for r0_offset in tl.range(0, r0_numel, R0_BLOCK): + r0_index = r0_offset + r0_base + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_1 = r0_index + tmp13 = tl.load(out_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32) + tmp23 = tl.load(in_ptr3 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp27 = tl.load(in_ptr4 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp14 = tmp13.to(tl.float32) + tmp15 = tmp14 - tmp7 + tmp16 = 4096.0 + tmp17 = (tmp11 / tmp16) + tmp18 = 1e-06 + tmp19 = tmp17 + tmp18 + tmp20 = libdevice.rsqrt(tmp19) + tmp21 = tmp15 * tmp20 + tmp22 = tmp21.to(tl.float32) + tmp24 = 1.0 + tmp25 = tmp23 + tmp24 + tmp26 = tmp22 * tmp25 + tmp28 = tmp26 + tmp27 + tl.store(out_ptr3 + (r0_1 + 4096*x0), tmp28, r0_mask & xmask) +''', device_str='cuda') + + +# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py +# Topologically Sorted Source Nodes: [mul, hidden_states], Original ATen: [aten.mul, aten.add] +# Source node to ATen node mapping: +# hidden_states => add +# mul => mul +# Graph fragment: +# %arg2_1 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0" = PlaceHolder[target=arg2_1] +# %arg0_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg0_1] +# %arg1_1 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0" = PlaceHolder[target=arg1_1] +# %mul : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%arg0_1, %arg1_1), kwargs = {}) +# %add : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg2_1, %mul), kwargs = {}) +# return %add +triton_poi_fused_add_mul_1 = async_compile.triton('triton_poi_fused_add_mul_1', ''' +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 8388608}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_mul_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 3, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 67117056}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused_add_mul_1(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr): + xnumel = 8388608 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x2 = xindex + x0 = (xindex % 4096) + tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32) + tmp1 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last').to(tl.float32) + tmp2 = tl.load(in_ptr2 + (x2), None).to(tl.float32) + tmp3 = tmp1 * tmp2 + tmp4 = tmp0 + tmp3 + tl.store(out_ptr0 + (x2), tmp4, None) +''', device_str='cuda') + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1 = args + args.clear() + assert_size_stride(arg0_1, (1, 1, 4096), (24576, 24576, 1)) + assert_size_stride(arg1_1, (1, 2048, 4096), (8388608, 4096, 1)) + assert_size_stride(arg2_1, (1, 2048, 4096), (8388608, 4096, 1)) + assert_size_stride(arg3_1, (1, 1, 4096), (24576, 24576, 1)) + assert_size_stride(arg4_1, (1, 256, 4096), (1048576, 4096, 1)) + assert_size_stride(arg5_1, (1, 256, 4096), (1048576, 4096, 1)) + assert_size_stride(arg6_1, (1, 1, 4096), (24576, 24576, 1)) + assert_size_stride(arg7_1, (1, 1, 4096), (24576, 24576, 1)) + with torch.cuda._DeviceGuard(0): + torch.cuda.set_device(0) + buf0 = empty_strided_cuda((1, 256, 4096), (1048576, 4096, 1), torch.bfloat16) + buf4 = empty_strided_cuda((1, 256, 4096), (1048576, 4096, 1), torch.bfloat16) + # Topologically Sorted Source Nodes: [context_attn_output, encoder_hidden_states, norm_encoder_hidden_states, add_2, mul_2, norm_encoder_hidden_states_1], Original ATen: [aten.mul, aten.add, aten.native_layer_norm] + stream0 = get_raw_stream(0) + triton_red_fused_add_mul_native_layer_norm_0.run(arg5_1, arg3_1, arg4_1, arg6_1, arg7_1, buf0, buf4, 256, 4096, stream=stream0) + del arg3_1 + del arg4_1 + del arg5_1 + del arg6_1 + del arg7_1 + buf5 = empty_strided_cuda((1, 2048, 4096), (8388608, 4096, 1), torch.bfloat16) + # Topologically Sorted Source Nodes: [mul, hidden_states], Original ATen: [aten.mul, aten.add] + stream0 = get_raw_stream(0) + triton_poi_fused_add_mul_1.run(arg2_1, arg0_1, arg1_1, buf5, 8388608, stream=stream0) + del arg0_1 + del arg1_1 + del arg2_1 + return (buf4, buf5, buf0, ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16) + arg1_1 = rand_strided((1, 2048, 4096), (8388608, 4096, 1), device='cuda:0', dtype=torch.bfloat16) + arg2_1 = rand_strided((1, 2048, 4096), (8388608, 4096, 1), device='cuda:0', dtype=torch.bfloat16) + arg3_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16) + arg4_1 = rand_strided((1, 256, 4096), (1048576, 4096, 1), device='cuda:0', dtype=torch.bfloat16) + arg5_1 = rand_strided((1, 256, 4096), (1048576, 4096, 1), device='cuda:0', dtype=torch.bfloat16) + arg6_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16) + arg7_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16) + fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/torchinductor/yl/cylzp3e6x3k4g5ipjr32zqvdf6mwtqwl2kcqrlshji7fgzldiits.py b/torchinductor/yl/cylzp3e6x3k4g5ipjr32zqvdf6mwtqwl2kcqrlshji7fgzldiits.py new file mode 100644 index 0000000000000000000000000000000000000000..4a39a3a7b4c5fbbf12f9f7105a218296fb4ebff4 --- /dev/null +++ b/torchinductor/yl/cylzp3e6x3k4g5ipjr32zqvdf6mwtqwl2kcqrlshji7fgzldiits.py @@ -0,0 +1,67 @@ +# AOT ID: ['4_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + arg0_1, = args + args.clear() + assert_size_stride(arg0_1, (1, 2048, 12288), (25165824, 12288, 1)) + return (reinterpret_tensor(arg0_1, (1, 2048, 4096), (25165824, 12288, 1), 0), reinterpret_tensor(arg0_1, (1, 2048, 4096), (25165824, 12288, 1), 4096), reinterpret_tensor(arg0_1, (1, 2048, 4096), (25165824, 12288, 1), 8192), ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = rand_strided((1, 2048, 12288), (25165824, 12288, 1), device='cuda:0', dtype=torch.bfloat16) + fn = lambda: call([arg0_1]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/torchinductor/zk/czkiayrorhnzt4u2yqgtjl6bgtgtbc3y2p2rzrorom7xd23lxjcj.py b/torchinductor/zk/czkiayrorhnzt4u2yqgtjl6bgtgtbc3y2p2rzrorom7xd23lxjcj.py new file mode 100644 index 0000000000000000000000000000000000000000..1581f87dd8d0e90738aee97b717fd6737170ab77 --- /dev/null +++ b/torchinductor/zk/czkiayrorhnzt4u2yqgtjl6bgtgtbc3y2p2rzrorom7xd23lxjcj.py @@ -0,0 +1,75 @@ +# AOT ID: ['10_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + arg0_1, arg1_1 = args + args.clear() + assert_size_stride(arg0_1, (1, 2304, 32, 128), (9437184, 4096, 128, 1)) + assert_size_stride(arg1_1, (4096, 4096), (4096, 1)) + with torch.cuda._DeviceGuard(0): + torch.cuda.set_device(0) + buf0 = empty_strided_cuda((256, 4096), (4096, 1), torch.bfloat16) + # Topologically Sorted Source Nodes: [hidden_states, split_with_sizes, encoder_hidden_states_1], Original ATen: [aten.view, aten.split_with_sizes, aten.t, aten.mm] + extern_kernels.mm(reinterpret_tensor(arg0_1, (256, 4096), (4096, 1), 0), reinterpret_tensor(arg1_1, (4096, 4096), (1, 4096), 0), out=buf0) + del arg1_1 + return (reinterpret_tensor(arg0_1, (1, 2048, 4096), (9437184, 4096, 1), 1048576), reinterpret_tensor(buf0, (1, 256, 4096), (1048576, 4096, 1), 0), ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = rand_strided((1, 2304, 32, 128), (9437184, 4096, 128, 1), device='cuda:0', dtype=torch.bfloat16) + arg1_1 = rand_strided((4096, 4096), (4096, 1), device='cuda:0', dtype=torch.bfloat16) + fn = lambda: call([arg0_1, arg1_1]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/torchinductor/zu/czuvgwm66nfvn4y5p22chgolcyxckgcjsedtcb5l3poxbxvq37ok.py b/torchinductor/zu/czuvgwm66nfvn4y5p22chgolcyxckgcjsedtcb5l3poxbxvq37ok.py new file mode 100644 index 0000000000000000000000000000000000000000..5581a26ccd6a1dd3329cde85d19c0b96207834ef --- /dev/null +++ b/torchinductor/zu/czuvgwm66nfvn4y5p22chgolcyxckgcjsedtcb5l3poxbxvq37ok.py @@ -0,0 +1,149 @@ +# AOT ID: ['27_inference'] +from ctypes import c_void_p, c_long, c_int +import torch +import math +import random +import os +import tempfile +from math import inf, nan +from cmath import nanj +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels +import triton +import triton.language as tl +from torch._inductor.runtime.triton_heuristics import start_graph, end_graph +from torch._C import _cuda_getCurrentRawStream as get_raw_stream + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +assert_alignment = torch._C._dynamo.guards.assert_alignment +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool +async_compile = AsyncCompile() +empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p + + +# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py +# Topologically Sorted Source Nodes: [chunk, hidden_states, silu, x, hidden_states_2], Original ATen: [aten.split, aten.view, aten.silu, aten.mul, aten.cat] +# Source node to ATen node mapping: +# chunk => split +# hidden_states => view +# hidden_states_2 => cat +# silu => convert_element_type, convert_element_type_1, mul, sigmoid +# x => mul_1 +# Graph fragment: +# %arg0_1 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0" = PlaceHolder[target=arg0_1] +# %arg1_1 : Tensor "bf16[1, 2304, 24576][84934656, 36864, 1]cuda:0" = PlaceHolder[target=arg1_1] +# %split : [num_users=2] = call_function[target=torch.ops.aten.split.Tensor](args = (%arg1_1, 12288, -1), kwargs = {}) +# %view : Tensor "bf16[1, 2304, 4096][9437184, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%arg0_1, [1, 2304, 4096]), kwargs = {}) +# %convert_element_type : Tensor "f32[1, 2304, 12288][28311552, 12288, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%getitem, torch.float32), kwargs = {}) +# %sigmoid : Tensor "f32[1, 2304, 12288][28311552, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type,), kwargs = {}) +# %mul : Tensor "f32[1, 2304, 12288][28311552, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type, %sigmoid), kwargs = {}) +# %convert_element_type_1 : Tensor "bf16[1, 2304, 12288][28311552, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul, torch.bfloat16), kwargs = {}) +# %mul_1 : Tensor "bf16[1, 2304, 12288][28311552, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_1, %getitem_1), kwargs = {}) +# %cat : Tensor "bf16[1, 2304, 16384][37748736, 16384, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%view, %mul_1], -1), kwargs = {}) +# return %cat +triton_poi_fused_cat_mul_silu_split_view_0 = async_compile.triton('triton_poi_fused_cat_mul_silu_split_view_0', ''' +import triton +import triton.language as tl + +from torch._inductor.runtime import triton_helpers, triton_heuristics +from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math +from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties +triton_helpers.set_driver_to_gpu() + +@triton_heuristics.pointwise( + size_hints={'x': 67108864}, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_cat_mul_silu_split_view_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 3, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 377487360}}, + min_elem_per_thread=0 +) +@triton.jit +def triton_poi_fused_cat_mul_silu_split_view_0(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr): + xnumel = 37748736 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x0 = (xindex % 16384) + x1 = xindex // 16384 + x2 = xindex + tmp0 = x0 + tmp1 = tl.full([1], 0, tl.int64) + tmp2 = tmp0 >= tmp1 + tmp3 = tl.full([1], 4096, tl.int64) + tmp4 = tmp0 < tmp3 + tmp5 = tl.load(in_ptr0 + (4096*x1 + (x0)), tmp4, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp6 = tmp0 >= tmp3 + tmp7 = tl.full([1], 16384, tl.int64) + tmp8 = tmp0 < tmp7 + tmp9 = tl.load(in_ptr1 + (36864*x1 + ((-4096) + x0)), tmp6, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp10 = tmp9.to(tl.float32) + tmp11 = tl.sigmoid(tmp10) + tmp12 = tmp10 * tmp11 + tmp13 = tmp12.to(tl.float32) + tmp14 = tl.load(in_ptr1 + (12288 + 36864*x1 + ((-4096) + x0)), tmp6, eviction_policy='evict_last', other=0.0).to(tl.float32) + tmp15 = tmp13 * tmp14 + tmp16 = tl.full(tmp15.shape, 0.0, tmp15.dtype) + tmp17 = tl.where(tmp6, tmp15, tmp16) + tmp18 = tl.where(tmp4, tmp5, tmp17) + tl.store(out_ptr0 + (x2), tmp18, None) +''', device_str='cuda') + + +async_compile.wait(globals()) +del async_compile + +class Runner: + def __init__(self, partitions): + self.partitions = partitions + + def recursively_apply_fns(self, fns): + new_callables = [] + for fn, c in zip(fns, self.partitions): + new_callables.append(fn(c)) + self.partitions = new_callables + + def call(self, args): + arg0_1, arg1_1 = args + args.clear() + assert_size_stride(arg0_1, (1, 2304, 32, 128), (9437184, 4096, 128, 1)) + assert_size_stride(arg1_1, (1, 2304, 24576), (84934656, 36864, 1)) + with torch.cuda._DeviceGuard(0): + torch.cuda.set_device(0) + buf0 = empty_strided_cuda((1, 2304, 16384), (37748736, 16384, 1), torch.bfloat16) + # Topologically Sorted Source Nodes: [chunk, hidden_states, silu, x, hidden_states_2], Original ATen: [aten.split, aten.view, aten.silu, aten.mul, aten.cat] + stream0 = get_raw_stream(0) + triton_poi_fused_cat_mul_silu_split_view_0.run(arg0_1, arg1_1, buf0, 37748736, stream=stream0) + del arg0_1 + del arg1_1 + return (buf0, ) + +runner = Runner(partitions=[]) +call = runner.call +recursively_apply_fns = runner.recursively_apply_fns + + +def benchmark_compiled_module(times=10, repeat=10): + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + arg0_1 = rand_strided((1, 2304, 32, 128), (9437184, 4096, 128, 1), device='cuda:0', dtype=torch.bfloat16) + arg1_1 = rand_strided((1, 2304, 24576), (84934656, 36864, 1), device='cuda:0', dtype=torch.bfloat16) + fn = lambda: call([arg0_1, arg1_1]) + return print_performance(fn, times=times, repeat=repeat) + + +if __name__ == "__main__": + from torch._inductor.wrapper_benchmark import compiled_module_main + compiled_module_main('None', benchmark_compiled_module) diff --git a/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/__grp__triton_red_fused__fused_rms_norm_view_1.json b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/__grp__triton_red_fused__fused_rms_norm_view_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a5edc15f871f8dbba4d3ec899e0f0834afcbd7ee --- /dev/null +++ b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/__grp__triton_red_fused__fused_rms_norm_view_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__fused_rms_norm_view_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.source", "triton_red_fused__fused_rms_norm_view_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.ttir", "triton_red_fused__fused_rms_norm_view_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.ttgir", "triton_red_fused__fused_rms_norm_view_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.llir", "triton_red_fused__fused_rms_norm_view_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.ptx", "triton_red_fused__fused_rms_norm_view_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.cubin", "triton_red_fused__fused_rms_norm_view_1.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.json"}} \ No newline at end of file diff --git a/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.cubin b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..79c64d9c3967bcb0ac58897b9f0c2e757a8d2a4b Binary files /dev/null and b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.cubin differ diff --git a/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.json b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ff3a7c01c258341e08008e5565140185ef0802f1 --- /dev/null +++ b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.json @@ -0,0 +1 @@ +{"hash": "d74f134c95c0d6dfc5a74151e2823c30e5664d0592fbb6b3bd01bbcf98b13272", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 8, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm_view_1"} \ No newline at end of file diff --git a/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.llir b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..d5c9c0620201d932a29266ca8a946ba9e4fd23fa --- /dev/null +++ b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.llir @@ -0,0 +1,136 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__fused_rms_norm_view_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = shl nuw i32 %7, 1, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = and i32 %9, 32, !dbg !9 + %.lobit = lshr exact i32 %10, 5, !dbg !9 + %11 = and i32 %9, 1, !dbg !9 + %12 = or disjoint i32 %.lobit, %8, !dbg !10 + %13 = or disjoint i32 %8, %11, !dbg !10 + %14 = shl nuw nsw i32 %9, 2, !dbg !11 + %15 = and i32 %14, 124, !dbg !11 + %16 = sdiv i32 %12, 32, !dbg !12 + %17 = mul i32 %16, 32, !dbg !13 + %.decomposed = sub i32 %12, %17, !dbg !13 + %18 = shl nsw i32 %.decomposed, 7, !dbg !14 + %19 = or disjoint i32 %18, %15, !dbg !15 + %20 = mul i32 %16, 12288, !dbg !16 + %21 = add i32 %19, %20, !dbg !17 + %22 = sext i32 %21 to i64, !dbg !18 + %23 = getelementptr bfloat, ptr addrspace(1) %0, i64 %22, !dbg !18 + %24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %25 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %23, i64 %24, i1 true) #4, !dbg !19 + %26 = extractvalue { i32, i32 } %25, 0, !dbg !19 + %27 = bitcast i32 %26 to <2 x bfloat>, !dbg !19 + %28 = extractvalue { i32, i32 } %25, 1, !dbg !19 + %29 = bitcast i32 %28 to <2 x bfloat>, !dbg !19 + %30 = extractelement <2 x bfloat> %27, i64 0, !dbg !19 + %31 = extractelement <2 x bfloat> %27, i64 1, !dbg !19 + %32 = extractelement <2 x bfloat> %29, i64 0, !dbg !19 + %33 = extractelement <2 x bfloat> %29, i64 1, !dbg !19 + %34 = fpext bfloat %30 to float, !dbg !20 + %35 = fpext bfloat %31 to float, !dbg !20 + %36 = fpext bfloat %32 to float, !dbg !20 + %37 = fpext bfloat %33 to float, !dbg !20 + %38 = fmul float %34, %34, !dbg !21 + %39 = fmul float %35, %35, !dbg !21 + %40 = fmul float %36, %36, !dbg !21 + %41 = fmul float %37, %37, !dbg !21 + %42 = fadd float %38, %39, !dbg !22 + %43 = fadd float %40, %42, !dbg !22 + %44 = fadd float %41, %43, !dbg !22 + %45 = bitcast float %44 to i32, !dbg !25 + %46 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %45, i32 16, i32 31), !dbg !25 + %47 = bitcast i32 %46 to float, !dbg !25 + %48 = fadd float %44, %47, !dbg !22 + %49 = bitcast float %48 to i32, !dbg !25 + %50 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %49, i32 8, i32 31), !dbg !25 + %51 = bitcast i32 %50 to float, !dbg !25 + %52 = fadd float %48, %51, !dbg !22 + %53 = bitcast float %52 to i32, !dbg !25 + %54 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %53, i32 4, i32 31), !dbg !25 + %55 = bitcast i32 %54 to float, !dbg !25 + %56 = fadd float %52, %55, !dbg !22 + %57 = bitcast float %56 to i32, !dbg !25 + %58 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %57, i32 2, i32 31), !dbg !25 + %59 = bitcast i32 %58 to float, !dbg !25 + %60 = fadd float %56, %59, !dbg !22 + %61 = bitcast float %60 to i32, !dbg !25 + %62 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %61, i32 1, i32 31), !dbg !25 + %63 = bitcast i32 %62 to float, !dbg !25 + %64 = fadd float %60, %63, !dbg !22 + %65 = lshr exact i32 %10, 3, !dbg !28 + %66 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %65, !dbg !28 + store float %64, ptr addrspace(3) %66, align 4, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %67 = shl nuw nsw i32 %11, 2, !dbg !28 + %68 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %67, !dbg !28 + %69 = load i32, ptr addrspace(3) %68, align 4, !dbg !28 + %70 = sext i32 %13 to i64, !dbg !29 + %71 = getelementptr float, ptr addrspace(1) %1, i64 %70, !dbg !29 + %72 = and i32 %9, 62, !dbg !30 + %73 = icmp eq i32 %72, 0, !dbg !30 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %69, ptr addrspace(1) %71, i1 %73) #4, !dbg !30 + ret void, !dbg !31 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="64" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm_view_1", linkageName: "triton_red_fused__fused_rms_norm_view_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 19, scope: !4) +!13 = !DILocation(line: 28, column: 19, scope: !4) +!14 = !DILocation(line: 38, column: 45, scope: !4) +!15 = !DILocation(line: 38, column: 41, scope: !4) +!16 = !DILocation(line: 38, column: 56, scope: !4) +!17 = !DILocation(line: 38, column: 50, scope: !4) +!18 = !DILocation(line: 38, column: 34, scope: !4) +!19 = !DILocation(line: 38, column: 61, scope: !4) +!20 = !DILocation(line: 38, column: 115, scope: !4) +!21 = !DILocation(line: 40, column: 22, scope: !4) +!22 = !DILocation(line: 263, column: 15, scope: !23, inlinedAt: !25) +!23 = distinct !DILexicalBlockFile(scope: !4, file: !24, discriminator: 0) +!24 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!25 = !DILocation(line: 293, column: 36, scope: !23, inlinedAt: !26) +!26 = !DILocation(line: 44, column: 25, scope: !27) +!27 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0) +!28 = !DILocation(line: 44, column: 28, scope: !4) +!29 = !DILocation(line: 45, column: 25, scope: !4) +!30 = !DILocation(line: 45, column: 36, scope: !4) +!31 = !DILocation(line: 45, column: 4, scope: !4) diff --git a/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.ptx b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..87e167b9babc8e2c5bfd09dae9831acb29db0fff --- /dev/null +++ b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.ptx @@ -0,0 +1,506 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused__fused_rms_norm_view_1 // -- Begin function triton_red_fused__fused_rms_norm_view_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__fused_rms_norm_view_1 +.visible .entry triton_red_fused__fused_rms_norm_view_1( + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_1, + .param .u32 triton_red_fused__fused_rms_norm_view_1_param_2, + .param .u32 triton_red_fused__fused_rms_norm_view_1_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_5 +) +.reqntid 64 +{ + .reg .pred %p<3>; + .reg .b16 %rs<5>; + .reg .b32 %r<48>; + .reg .b64 %rd<6>; + .loc 1 18 0 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:18:0 + +// %bb.0: + ld.param.b64 %rd4, [triton_red_fused__fused_rms_norm_view_1_param_0]; + ld.param.b64 %rd5, [triton_red_fused__fused_rms_norm_view_1_param_1]; +$L__tmp0: + .loc 1 23 28 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:23:28 + mov.u32 %r5, %ctaid.x; + .loc 1 23 33 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:23:33 + shl.b32 %r6, %r5, 1; + .loc 1 24 44 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:44 + mov.u32 %r7, %tid.x; + and.b32 %r8, %r7, 32; + bfe.u32 %r9, %r7, 5, 1; + and.b32 %r10, %r7, 1; + .loc 1 24 23 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:23 + or.b32 %r11, %r9, %r6; + or.b32 %r12, %r6, %r10; + .loc 1 26 37 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:26:37 + shl.b32 %r13, %r7, 2; + and.b32 %r14, %r13, 124; + .loc 1 29 19 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:29:19 + bfe.s32 %r15, %r5, 30, 1; + shr.u32 %r16, %r15, 27; + add.s32 %r17, %r11, %r16; + shr.u32 %r18, %r17, 5; + .loc 1 28 19 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:28:19 + and.b32 %r19, %r17, 33554400; + sub.s32 %r20, %r11, %r19; + .loc 1 38 45 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:45 + shl.b32 %r21, %r20, 7; + .loc 1 38 41 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:41 + or.b32 %r22, %r21, %r14; + .loc 1 38 50 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:50 + mad.lo.s32 %r23, %r18, 12288, %r22; + .loc 1 38 34 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:34 + mad.wide.s32 %rd1, %r23, 2, %rd4; + .loc 1 38 61 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:61 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r3, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r1, %r3; + mov.u32 %r2, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2; + // end inline asm + mov.b32 {%rs1, %rs2}, %r1; + mov.b32 {%rs3, %rs4}, %r2; + .loc 1 38 115 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:115 + cvt.f32.bf16 %r24, %rs1; + cvt.f32.bf16 %r25, %rs2; + cvt.f32.bf16 %r26, %rs3; + cvt.f32.bf16 %r27, %rs4; + .loc 1 40 22 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:40:22 + mul.f32 %r28, %r25, %r25; +$L__tmp1: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + fma.rn.f32 %r29, %r24, %r24, %r28; + fma.rn.f32 %r30, %r26, %r26, %r29; + fma.rn.f32 %r31, %r27, %r27, %r30; +$L__tmp2: + .loc 2 293 36 // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] + shfl.sync.bfly.b32 %r32, %r31, 16, 31, -1; +$L__tmp3: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + add.f32 %r33, %r31, %r32; +$L__tmp4: + .loc 2 293 36 // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] + shfl.sync.bfly.b32 %r34, %r33, 8, 31, -1; +$L__tmp5: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + add.f32 %r35, %r33, %r34; +$L__tmp6: + .loc 2 293 36 // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] + shfl.sync.bfly.b32 %r36, %r35, 4, 31, -1; +$L__tmp7: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + add.f32 %r37, %r35, %r36; +$L__tmp8: + .loc 2 293 36 // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] + shfl.sync.bfly.b32 %r38, %r37, 2, 31, -1; +$L__tmp9: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + add.f32 %r39, %r37, %r38; +$L__tmp10: + .loc 2 293 36 // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] + shfl.sync.bfly.b32 %r40, %r39, 1, 31, -1; +$L__tmp11: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + add.f32 %r41, %r39, %r40; +$L__tmp12: + .loc 1 44 28 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:28 + shr.u32 %r42, %r8, 3; + mov.b32 %r43, global_smem; + add.s32 %r44, %r43, %r42; + st.shared.b32 [%r44], %r41; + bar.sync 0; + shl.b32 %r45, %r10, 2; + add.s32 %r46, %r43, %r45; + ld.shared.b32 %r4, [%r46]; + .loc 1 45 25 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:25 + mad.wide.s32 %rd3, %r12, 4, %rd5; + .loc 1 45 36 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:36 + and.b32 %r47, %r7, 62; + setp.eq.b32 %p2, %r47, 0; + // begin inline asm + @%p2 st.global.b32 [ %rd3 + 0 ], { %r4 }; + // end inline asm + .loc 1 45 4 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:4 + ret; +$L__tmp13: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 339 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x14c DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 113 +.b8 105 +.b8 116 +.b8 120 +.b8 53 +.b8 104 +.b8 119 +.b8 117 +.b8 112 +.b8 107 +.b8 98 +.b8 106 +.b8 109 +.b8 99 +.b8 115 +.b8 111 +.b8 121 +.b8 107 +.b8 113 +.b8 101 +.b8 112 +.b8 122 +.b8 113 +.b8 99 +.b8 55 +.b8 122 +.b8 99 +.b8 120 +.b8 106 +.b8 99 +.b8 98 +.b8 53 +.b8 97 +.b8 99 +.b8 113 +.b8 107 +.b8 105 +.b8 55 +.b8 122 +.b8 99 +.b8 115 +.b8 106 +.b8 105 +.b8 102 +.b8 114 +.b8 110 +.b8 114 +.b8 122 +.b8 99 +.b8 114 +.b8 114 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 113 +.b8 105 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x2a DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 114 +.b8 109 +.b8 115 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x10e:0x48 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x123:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 44 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x13b:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.source b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.source new file mode 100644 index 0000000000000000000000000000000000000000..21b1b4c3a3a4ad24327b73f7cacdb6bdcf03a3c1 --- /dev/null +++ b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.source @@ -0,0 +1,167 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0) +#loc35 = loc(unknown) +#loc38 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0) +#loc42 = loc("in_ptr0"(#loc)) +#loc43 = loc("out_ptr0"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc74 = loc("input"(#loc33)) +#loc75 = loc("a"(#loc38)) +#loc76 = loc("b"(#loc38)) +module { + tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 65536 : i32 loc(#loc46) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc47) + %xoffset = tt.get_program_id x : i32 loc(#loc48) + %xoffset_2 = arith.constant 2 : i32 loc(#loc49) + %xoffset_3 = arith.constant 2 : i32 loc(#loc49) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc49) + %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc50) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32> -> tensor<2x1xi32> loc(#loc51) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<2x1xi32> loc(#loc52) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<2x1xi32> loc(#loc52) + %xmask = arith.constant true loc(#loc53) + %xmask_8 = arith.constant dense : tensor<2x128xi1> loc(#loc53) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc54) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc55) + %x0 = arith.constant 32 : i32 loc(#loc56) + %x0_10 = arith.constant 32 : i32 loc(#loc56) + %x0_11 = arith.constant dense<32> : tensor<2x1xi32> loc(#loc56) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<2x1xi32> loc(#loc56) + %x1 = arith.constant 32 : i32 loc(#loc57) + %x1_13 = arith.constant 32 : i32 loc(#loc57) + %x1_14 = arith.constant dense<32> : tensor<2x1xi32> loc(#loc57) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<2x1xi32> loc(#loc57) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc58) + %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc58) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c128_i32 = arith.constant 128 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_19 = %_tmp4_16) -> (tensor<2x128xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc60) + %r0_index_20 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc60) + %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc61) + %r0_mask_21 = arith.cmpi slt, %r0_index_20, %r0_mask : tensor<1x128xi32> loc(#loc61) + %tmp0 = arith.constant 128 : i32 loc(#loc62) + %tmp0_22 = arith.constant 128 : i32 loc(#loc62) + %tmp0_23 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc62) + %tmp0_24 = arith.muli %tmp0_23, %x0_12 : tensor<2x1xi32> loc(#loc62) + %tmp0_25 = tt.broadcast %r0_index_20 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc63) + %tmp0_26 = tt.broadcast %tmp0_24 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc63) + %tmp0_27 = arith.addi %tmp0_25, %tmp0_26 : tensor<2x128xi32> loc(#loc63) + %tmp0_28 = arith.constant 12288 : i32 loc(#loc64) + %tmp0_29 = arith.constant 12288 : i32 loc(#loc64) + %tmp0_30 = arith.constant dense<12288> : tensor<2x1xi32> loc(#loc64) + %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<2x1xi32> loc(#loc64) + %tmp0_32 = tt.broadcast %tmp0_31 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc65) + %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<2x128xi32> loc(#loc65) + %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x128x!tt.ptr> loc(#loc66) + %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc66) + %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc67) + %tmp0_37 = tt.broadcast %r0_mask_21 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc67) + %tmp0_38 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc67) + %tmp0_39 = arith.truncf %tmp0_38 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc67) + %tmp0_40 = tt.load %tmp0_35, %tmp0_37, %tmp0_39 evictionPolicy = evict_first : tensor<2x128x!tt.ptr> loc(#loc67) + %tmp0_41 = arith.extf %tmp0_40 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc68) + %tmp2 = arith.mulf %tmp0_41, %tmp0_41 : tensor<2x128xf32> loc(#loc69) + %tmp5 = arith.addf %_tmp4_19, %tmp2 : tensor<2x128xf32> loc(#loc70) + %_tmp4_42 = tt.broadcast %r0_mask_21 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc71) + %_tmp4_43 = arith.select %_tmp4_42, %tmp5, %_tmp4_19 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc71) + scf.yield %_tmp4_43 : tensor<2x128xf32> loc(#loc27) + } loc(#loc59) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S2_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<2x128xf32>) -> tensor<2xf32> loc(#loc72) + %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<2xf32> -> tensor<2x1xf32> loc(#loc73) + %4 = tt.splat %out_ptr0 : !tt.ptr -> tensor<2x1x!tt.ptr> loc(#loc30) + %5 = tt.addptr %4, %xindex_7 : tensor<2x1x!tt.ptr>, tensor<2x1xi32> loc(#loc30) + tt.store %5, %tmp4_18 : tensor<2x1x!tt.ptr> loc(#loc31) + tt.return loc(#loc32) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S2_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<2x128xf32> loc("input"(#loc33))) -> tensor<2xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc34) + tt.reduce.return %2 : f32 loc(#loc34) + }) : (tensor<2x128xf32>) -> tensor<2xf32> loc(#loc34) + tt.return %0 : tensor<2xf32> loc(#loc36) + ^bb1: // no predecessors + %1 = ub.poison : tensor<2xf32> loc(#loc37) + tt.return %1 : tensor<2xf32> loc(#loc37) + } loc(#loc33) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc38)), %b: f32 loc("b"(#loc38))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc39) + tt.return %0 : f32 loc(#loc40) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc41) + tt.return %1 : f32 loc(#loc41) + } loc(#loc38) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":25:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":30:43) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:8) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11) +#loc37 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4) +#loc39 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc40 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11) +#loc41 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4) +#loc46 = loc("xnumel"(#loc1)) +#loc47 = loc("r0_numel"(#loc2)) +#loc48 = loc("xoffset"(#loc3)) +#loc49 = loc("xoffset"(#loc4)) +#loc50 = loc("xindex"(#loc5)) +#loc51 = loc("xindex"(#loc6)) +#loc52 = loc("xindex"(#loc7)) +#loc53 = loc("xmask"(#loc8)) +#loc54 = loc("r0_base"(#loc9)) +#loc55 = loc("r0_base"(#loc10)) +#loc56 = loc("x0"(#loc11)) +#loc57 = loc("x1"(#loc12)) +#loc58 = loc("_tmp4"(#loc13)) +#loc59 = loc("_tmp4"(#loc14)) +#loc60 = loc("r0_index"(#loc15)) +#loc61 = loc("r0_mask"(#loc16)) +#loc62 = loc("tmp0"(#loc17)) +#loc63 = loc("tmp0"(#loc18)) +#loc64 = loc("tmp0"(#loc19)) +#loc65 = loc("tmp0"(#loc20)) +#loc66 = loc("tmp0"(#loc21)) +#loc67 = loc("tmp0"(#loc22)) +#loc68 = loc("tmp0"(#loc23)) +#loc69 = loc("tmp2"(#loc24)) +#loc70 = loc("tmp5"(#loc25)) +#loc71 = loc("_tmp4"(#loc26)) +#loc72 = loc("tmp4"(#loc28)) +#loc73 = loc("tmp4"(#loc29)) diff --git a/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.ttgir b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..66c29e1103cd2585e55b8e2980219703a9faa89b --- /dev/null +++ b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.ttgir @@ -0,0 +1,108 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [1, 2], order = [0, 1]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0) +#loc1 = loc(unknown) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25) +#loc27 = loc("in_ptr0"(#loc)) +#loc28 = loc("out_ptr0"(#loc)) +#loc29 = loc("xnumel"(#loc)) +#loc30 = loc("r0_numel"(#loc)) +#loc49 = loc("tmp4"(#loc21)) +#loc52 = loc(callsite(#loc1 at #loc49)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<2x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<12288> : tensor<2x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<32> : tensor<2x1xi32, #blocked> loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<2x128xbf16, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<2x128xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc31) + %xoffset_5 = arith.muli %xoffset, %c2_i32 : i32 loc(#loc32) + %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc33) + %xindex_6 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc33) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<2x1xi32, #blocked> loc(#loc33) + %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<2x1xi32, #blocked1> loc(#loc33) + %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<2x1xi32, #blocked> loc(#loc34) + %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<2x1xi32, #blocked1> loc(#loc34) + %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<2x1xi32, #blocked> loc(#loc34) + %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<2x1xi32, #blocked1> loc(#loc34) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc35) + %r0_base_13 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc35) + %x0 = arith.remsi %xindex_11, %cst_2 : tensor<2x1xi32, #blocked> loc(#loc36) + %x1 = arith.divsi %xindex_11, %cst_2 : tensor<2x1xi32, #blocked> loc(#loc37) + %r0_mask = arith.cmpi slt, %r0_base_13, %cst : tensor<1x128xi32, #blocked> loc(#loc38) + %tmp0 = arith.muli %x0, %cst_0 : tensor<2x1xi32, #blocked> loc(#loc39) + %tmp0_14 = tt.broadcast %r0_base_13 : tensor<1x128xi32, #blocked> -> tensor<2x128xi32, #blocked> loc(#loc40) + %tmp0_15 = tt.broadcast %tmp0 : tensor<2x1xi32, #blocked> -> tensor<2x128xi32, #blocked> loc(#loc40) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<2x128xi32, #blocked> loc(#loc40) + %tmp0_17 = arith.muli %x1, %cst_1 : tensor<2x1xi32, #blocked> loc(#loc41) + %tmp0_18 = tt.broadcast %tmp0_17 : tensor<2x1xi32, #blocked> -> tensor<2x128xi32, #blocked> loc(#loc42) + %tmp0_19 = arith.addi %tmp0_16, %tmp0_18 : tensor<2x128xi32, #blocked> loc(#loc42) + %tmp0_20 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x128x!tt.ptr, #blocked> loc(#loc43) + %tmp0_21 = tt.addptr %tmp0_20, %tmp0_19 : tensor<2x128x!tt.ptr, #blocked>, tensor<2x128xi32, #blocked> loc(#loc43) + %tmp0_22 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked> -> tensor<2x128xi1, #blocked> loc(#loc44) + %tmp0_23 = tt.load %tmp0_21, %tmp0_22, %cst_3 evictionPolicy = evict_first : tensor<2x128x!tt.ptr, #blocked> loc(#loc44) + %tmp0_24 = arith.extf %tmp0_23 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked> loc(#loc45) + %tmp2 = arith.mulf %tmp0_24, %tmp0_24 : tensor<2x128xf32, #blocked> loc(#loc46) + %tmp5 = arith.addf %tmp2, %cst_4 : tensor<2x128xf32, #blocked> loc(#loc47) + %_tmp4 = arith.select %tmp0_22, %tmp5, %cst_4 : tensor<2x128xi1, #blocked>, tensor<2x128xf32, #blocked> loc(#loc48) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_27: f32 loc(callsite(#loc1 at #loc49)), %tmp4_28: f32 loc(callsite(#loc1 at #loc49))): + %tmp4_29 = arith.addf %tmp4_27, %tmp4_28 : f32 loc(#loc53) + tt.reduce.return %tmp4_29 : f32 loc(#loc51) + }) : (tensor<2x128xf32, #blocked>) -> tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc51) + %tmp4_25 = ttg.convert_layout %tmp4 : tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc50) + %tmp4_26 = tt.expand_dims %tmp4_25 {axis = 1 : i32} : tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<2x1xf32, #blocked1> loc(#loc50) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<2x1x!tt.ptr, #blocked1> loc(#loc24) + %1 = tt.addptr %0, %xindex_12 : tensor<2x1x!tt.ptr, #blocked1>, tensor<2x1xi32, #blocked1> loc(#loc24) + tt.store %1, %tmp4_26 : tensor<2x1x!tt.ptr, #blocked1> loc(#loc25) + tt.return loc(#loc26) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4) +#loc31 = loc("xoffset"(#loc2)) +#loc32 = loc("xoffset"(#loc3)) +#loc33 = loc("xindex"(#loc4)) +#loc34 = loc("xindex"(#loc5)) +#loc35 = loc("r0_base"(#loc6)) +#loc36 = loc("x0"(#loc7)) +#loc37 = loc("x1"(#loc8)) +#loc38 = loc("r0_mask"(#loc9)) +#loc39 = loc("tmp0"(#loc10)) +#loc40 = loc("tmp0"(#loc11)) +#loc41 = loc("tmp0"(#loc12)) +#loc42 = loc("tmp0"(#loc13)) +#loc43 = loc("tmp0"(#loc14)) +#loc44 = loc("tmp0"(#loc15)) +#loc45 = loc("tmp0"(#loc16)) +#loc46 = loc("tmp2"(#loc17)) +#loc47 = loc("tmp5"(#loc18)) +#loc48 = loc("_tmp4"(#loc19)) +#loc50 = loc("tmp4"(#loc23)) +#loc51 = loc(callsite(#loc20 at #loc49)) +#loc53 = loc(callsite(#loc22 at #loc51)) diff --git a/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.ttir b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..44e28b7eb9c38afd708cdb430c86636f80325231 --- /dev/null +++ b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.ttir @@ -0,0 +1,105 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0) +#loc2 = loc(unknown) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25) +#loc29 = loc("in_ptr0"(#loc)) +#loc30 = loc("out_ptr0"(#loc)) +#loc31 = loc("xnumel"(#loc)) +#loc32 = loc("r0_numel"(#loc)) +#loc53 = loc("tmp4"(#loc23)) +#loc56 = loc(callsite(#loc2 at #loc53)) +module { + tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %tmp0 = arith.constant dense<0.000000e+00> : tensor<2x128xbf16> loc(#loc33) + %cst = arith.constant dense<12288> : tensor<2x1xi32> loc(#loc2) + %cst_0 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc2) + %cst_1 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc2) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc2) + %cst_3 = arith.constant dense<32> : tensor<2x1xi32> loc(#loc2) + %c2_i32 = arith.constant 2 : i32 loc(#loc2) + %xoffset = tt.get_program_id x : i32 loc(#loc34) + %xoffset_4 = arith.muli %xoffset, %c2_i32 : i32 loc(#loc35) + %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc36) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32> -> tensor<2x1xi32> loc(#loc37) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<2x1xi32> loc(#loc38) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<2x1xi32> loc(#loc38) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc39) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc40) + %x0 = arith.remsi %xindex_7, %cst_3 : tensor<2x1xi32> loc(#loc41) + %x1 = arith.divsi %xindex_7, %cst_3 : tensor<2x1xi32> loc(#loc42) + %r0_mask = arith.cmpi slt, %r0_base_8, %cst_1 : tensor<1x128xi32> loc(#loc43) + %tmp0_9 = arith.muli %x0, %cst_0 : tensor<2x1xi32> loc(#loc44) + %tmp0_10 = tt.broadcast %r0_base_8 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc45) + %tmp0_11 = tt.broadcast %tmp0_9 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc45) + %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<2x128xi32> loc(#loc45) + %tmp0_13 = arith.muli %x1, %cst : tensor<2x1xi32> loc(#loc46) + %tmp0_14 = tt.broadcast %tmp0_13 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc47) + %tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<2x128xi32> loc(#loc47) + %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x128x!tt.ptr> loc(#loc48) + %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc48) + %tmp0_18 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc33) + %tmp0_19 = tt.load %tmp0_17, %tmp0_18, %tmp0 evictionPolicy = evict_first : tensor<2x128x!tt.ptr> loc(#loc33) + %tmp0_20 = arith.extf %tmp0_19 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc49) + %tmp2 = arith.mulf %tmp0_20, %tmp0_20 : tensor<2x128xf32> loc(#loc50) + %tmp5 = arith.addf %tmp2, %cst_2 : tensor<2x128xf32> loc(#loc51) + %_tmp4 = arith.select %tmp0_18, %tmp5, %cst_2 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc52) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_22: f32 loc(callsite(#loc2 at #loc53)), %tmp4_23: f32 loc(callsite(#loc2 at #loc53))): + %tmp4_24 = arith.addf %tmp4_22, %tmp4_23 : f32 loc(#loc57) + tt.reduce.return %tmp4_24 : f32 loc(#loc55) + }) : (tensor<2x128xf32>) -> tensor<2xf32> loc(#loc55) + %tmp4_21 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<2xf32> -> tensor<2x1xf32> loc(#loc54) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<2x1x!tt.ptr> loc(#loc26) + %1 = tt.addptr %0, %xindex_7 : tensor<2x1x!tt.ptr>, tensor<2x1xi32> loc(#loc26) + tt.store %1, %tmp4_21 : tensor<2x1x!tt.ptr> loc(#loc27) + tt.return loc(#loc28) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:27) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4) +#loc33 = loc("tmp0"(#loc1)) +#loc34 = loc("xoffset"(#loc3)) +#loc35 = loc("xoffset"(#loc4)) +#loc36 = loc("xindex"(#loc5)) +#loc37 = loc("xindex"(#loc6)) +#loc38 = loc("xindex"(#loc7)) +#loc39 = loc("r0_base"(#loc8)) +#loc40 = loc("r0_base"(#loc9)) +#loc41 = loc("x0"(#loc10)) +#loc42 = loc("x1"(#loc11)) +#loc43 = loc("r0_mask"(#loc12)) +#loc44 = loc("tmp0"(#loc13)) +#loc45 = loc("tmp0"(#loc14)) +#loc46 = loc("tmp0"(#loc15)) +#loc47 = loc("tmp0"(#loc16)) +#loc48 = loc("tmp0"(#loc17)) +#loc49 = loc("tmp0"(#loc18)) +#loc50 = loc("tmp2"(#loc19)) +#loc51 = loc("tmp5"(#loc20)) +#loc52 = loc("_tmp4"(#loc21)) +#loc54 = loc("tmp4"(#loc25)) +#loc55 = loc(callsite(#loc22 at #loc53)) +#loc57 = loc(callsite(#loc24 at #loc55)) diff --git a/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/__grp__triton_red_fused__fused_rms_norm_view_1.json b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/__grp__triton_red_fused__fused_rms_norm_view_1.json new file mode 100644 index 0000000000000000000000000000000000000000..dd1f0f6ba8c6fbbe3583f5146e7473ad6cdf12c1 --- /dev/null +++ b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/__grp__triton_red_fused__fused_rms_norm_view_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__fused_rms_norm_view_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.source", "triton_red_fused__fused_rms_norm_view_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.ttir", "triton_red_fused__fused_rms_norm_view_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.ttgir", "triton_red_fused__fused_rms_norm_view_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.llir", "triton_red_fused__fused_rms_norm_view_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.ptx", "triton_red_fused__fused_rms_norm_view_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.cubin", "triton_red_fused__fused_rms_norm_view_1.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.json"}} \ No newline at end of file diff --git a/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.cubin b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..e87070faca30a45589ed7d5d5cd5916011544f24 Binary files /dev/null and b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.cubin differ diff --git a/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.json b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8fdca38b3acaf627358eb68862191313b7f624e8 --- /dev/null +++ b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.json @@ -0,0 +1 @@ +{"hash": "d8e0c84cc73610ffde4fe60a570ac315dcce568ed160cf675e002966ca2b3448", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm_view_1"} \ No newline at end of file diff --git a/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.llir b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..fc51d50082194daceaae04fbf4382e738745fc48 --- /dev/null +++ b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.llir @@ -0,0 +1,140 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__fused_rms_norm_view_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = shl i32 %7, 6, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = and i32 %9, 126, !dbg !9 + %11 = lshr exact i32 %10, 1, !dbg !9 + %12 = or disjoint i32 %11, %8, !dbg !10 + %13 = shl nuw nsw i32 %9, 2, !dbg !11 + %14 = and i32 %13, 4, !dbg !11 + %15 = sdiv i32 %12, 32, !dbg !12 + %16 = mul i32 %15, 32, !dbg !13 + %.decomposed = sub i32 %12, %16, !dbg !13 + %17 = shl nsw i32 %.decomposed, 7, !dbg !14 + %18 = mul i32 %15, 12288, !dbg !15 + %19 = or disjoint i32 %17, %14 + %20 = add i32 %19, %18 + br label %21, !dbg !16 + +21: ; preds = %6, %21 + %indvars.iv = phi i64 [ 0, %6 ], [ %indvars.iv.next, %21 ] + %22 = phi float [ 0.000000e+00, %6 ], [ %48, %21 ] + %23 = phi float [ 0.000000e+00, %6 ], [ %49, %21 ] + %24 = phi float [ 0.000000e+00, %6 ], [ %50, %21 ] + %25 = phi float [ 0.000000e+00, %6 ], [ %51, %21 ] + %26 = trunc nuw nsw i64 %indvars.iv to i32, !dbg !17 + %27 = add i32 %20, %26, !dbg !17 + %28 = sext i32 %27 to i64, !dbg !18 + %29 = getelementptr bfloat, ptr addrspace(1) %0, i64 %28, !dbg !18 + %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %31 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %29, i64 %30, i1 true) #4, !dbg !19 + %32 = extractvalue { i32, i32 } %31, 0, !dbg !19 + %33 = bitcast i32 %32 to <2 x bfloat>, !dbg !19 + %34 = extractvalue { i32, i32 } %31, 1, !dbg !19 + %35 = bitcast i32 %34 to <2 x bfloat>, !dbg !19 + %36 = extractelement <2 x bfloat> %33, i64 0, !dbg !19 + %37 = extractelement <2 x bfloat> %33, i64 1, !dbg !19 + %38 = extractelement <2 x bfloat> %35, i64 0, !dbg !19 + %39 = extractelement <2 x bfloat> %35, i64 1, !dbg !19 + %40 = fpext bfloat %36 to float, !dbg !20 + %41 = fpext bfloat %37 to float, !dbg !20 + %42 = fpext bfloat %38 to float, !dbg !20 + %43 = fpext bfloat %39 to float, !dbg !20 + %44 = fmul float %40, %40, !dbg !21 + %45 = fmul float %41, %41, !dbg !21 + %46 = fmul float %42, %42, !dbg !21 + %47 = fmul float %43, %43, !dbg !21 + %48 = fadd float %22, %44, !dbg !22 + %49 = fadd float %23, %45, !dbg !22 + %50 = fadd float %24, %46, !dbg !22 + %51 = fadd float %25, %47, !dbg !22 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8, !dbg !16 + %52 = icmp samesign ult i64 %indvars.iv, 120, !dbg !16 + br i1 %52, label %21, label %53, !dbg !16 + +53: ; preds = %21 + %54 = and i32 %9, 63, !dbg !9 + %55 = or disjoint i32 %8, %54, !dbg !10 + %56 = fadd float %48, %49, !dbg !23 + %57 = fadd float %50, %56, !dbg !23 + %58 = fadd float %51, %57, !dbg !23 + %59 = bitcast float %58 to i32, !dbg !26 + %60 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %59, i32 1, i32 31), !dbg !26 + %61 = bitcast i32 %60 to float, !dbg !26 + %62 = fadd float %58, %61, !dbg !23 + %63 = shl nuw nsw i32 %10, 1, !dbg !29 + %64 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %63, !dbg !29 + store float %62, ptr addrspace(3) %64, align 4, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !29 + %65 = shl nuw nsw i32 %54, 2, !dbg !29 + %66 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %65, !dbg !29 + %67 = load i32, ptr addrspace(3) %66, align 4, !dbg !29 + %68 = sext i32 %55 to i64, !dbg !30 + %69 = getelementptr float, ptr addrspace(1) %1, i64 %68, !dbg !30 + %70 = and i32 %9, 64, !dbg !31 + %71 = icmp eq i32 %70, 0, !dbg !31 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %67, ptr addrspace(1) %69, i1 %71) #4, !dbg !31 + ret void, !dbg !32 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm_view_1", linkageName: "triton_red_fused__fused_rms_norm_view_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 19, scope: !4) +!13 = !DILocation(line: 28, column: 19, scope: !4) +!14 = !DILocation(line: 38, column: 45, scope: !4) +!15 = !DILocation(line: 38, column: 56, scope: !4) +!16 = !DILocation(line: 32, column: 43, scope: !4) +!17 = !DILocation(line: 38, column: 50, scope: !4) +!18 = !DILocation(line: 38, column: 34, scope: !4) +!19 = !DILocation(line: 38, column: 61, scope: !4) +!20 = !DILocation(line: 38, column: 115, scope: !4) +!21 = !DILocation(line: 40, column: 22, scope: !4) +!22 = !DILocation(line: 42, column: 23, scope: !4) +!23 = !DILocation(line: 263, column: 15, scope: !24, inlinedAt: !26) +!24 = distinct !DILexicalBlockFile(scope: !4, file: !25, discriminator: 0) +!25 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!26 = !DILocation(line: 293, column: 36, scope: !24, inlinedAt: !27) +!27 = !DILocation(line: 44, column: 25, scope: !28) +!28 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0) +!29 = !DILocation(line: 44, column: 28, scope: !4) +!30 = !DILocation(line: 45, column: 25, scope: !4) +!31 = !DILocation(line: 45, column: 36, scope: !4) +!32 = !DILocation(line: 45, column: 4, scope: !4) diff --git a/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.ptx b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..9a23b53fb69f273a74c936c42a0d5c1c69b6691d --- /dev/null +++ b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.ptx @@ -0,0 +1,499 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused__fused_rms_norm_view_1 // -- Begin function triton_red_fused__fused_rms_norm_view_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__fused_rms_norm_view_1 +.visible .entry triton_red_fused__fused_rms_norm_view_1( + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_1, + .param .u32 triton_red_fused__fused_rms_norm_view_1_param_2, + .param .u32 triton_red_fused__fused_rms_norm_view_1_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_5 +) +.reqntid 128 +{ + .reg .pred %p<4>; + .reg .b16 %rs<5>; + .reg .b32 %r<45>; + .reg .b64 %rd<9>; + .loc 1 18 0 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:18:0 + +// %bb.0: + ld.param.b64 %rd3, [triton_red_fused__fused_rms_norm_view_1_param_1]; + ld.param.b64 %rd2, [triton_red_fused__fused_rms_norm_view_1_param_0]; +$L__tmp0: + .loc 1 23 28 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:23:28 + mov.u32 %r4, %ctaid.x; + .loc 1 23 33 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:23:33 + shl.b32 %r1, %r4, 6; + .loc 1 24 44 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:44 + mov.u32 %r2, %tid.x; + and.b32 %r3, %r2, 126; + bfe.u32 %r5, %r2, 1, 6; + .loc 1 24 23 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:23 + or.b32 %r6, %r5, %r1; + .loc 1 26 37 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:26:37 + shl.b32 %r7, %r2, 2; + and.b32 %r8, %r7, 4; + .loc 1 29 19 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:29:19 + bfe.s32 %r9, %r4, 25, 1; + shr.u32 %r10, %r9, 27; + add.s32 %r11, %r6, %r10; + shr.u32 %r12, %r11, 5; + .loc 1 32 43 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:32:43 + add.s32 %r13, %r4, %r12; + shl.b32 %r14, %r13, 13; + shl.b32 %r15, %r5, 7; + or.b32 %r16, %r14, %r15; + or.b32 %r17, %r16, %r8; + cvt.u64.u32 %rd1, %r17; + mov.b32 %r41, 0f00000000; + mov.b64 %rd8, -8; + mov.b32 %r42, %r41; + mov.b32 %r43, %r41; + mov.b32 %r44, %r41; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 38 34 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:34 + add.s64 %rd6, %rd1, %rd8; + cvt.u32.u64 %r21, %rd6; + add.s32 %r22, %r21, 8; + mad.wide.s32 %rd5, %r22, 2, %rd2; + .loc 1 38 61 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:61 + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd4, 1.0; + // end inline asm + mov.b32 %r20, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r18, %r20; + mov.u32 %r19, %r20; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r18, %r19 }, [ %rd5 + 0 ], %rd4; + // end inline asm + mov.b32 {%rs1, %rs2}, %r18; + mov.b32 {%rs3, %rs4}, %r19; + .loc 1 38 115 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:115 + cvt.f32.bf16 %r23, %rs1; + cvt.f32.bf16 %r24, %rs2; + cvt.f32.bf16 %r25, %rs3; + cvt.f32.bf16 %r26, %rs4; + .loc 1 42 23 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:42:23 + fma.rn.f32 %r41, %r23, %r23, %r41; + fma.rn.f32 %r42, %r24, %r24, %r42; + fma.rn.f32 %r43, %r25, %r25, %r43; + fma.rn.f32 %r44, %r26, %r26, %r44; + .loc 1 32 43 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:32:43 + add.s64 %rd8, %rd8, 8; + setp.lt.u64 %p2, %rd8, 120; + @%p2 bra $L__BB0_1; +// %bb.2: + .loc 1 24 44 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:44 + and.b32 %r28, %r2, 63; + .loc 1 24 23 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:23 + or.b32 %r29, %r1, %r28; +$L__tmp1: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + add.f32 %r30, %r41, %r42; + add.f32 %r31, %r43, %r30; + add.f32 %r32, %r44, %r31; +$L__tmp2: + .loc 2 293 36 // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] + shfl.sync.bfly.b32 %r33, %r32, 1, 31, -1; +$L__tmp3: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + add.f32 %r34, %r32, %r33; +$L__tmp4: + .loc 1 44 28 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:28 + shl.b32 %r35, %r3, 1; + mov.b32 %r36, global_smem; + add.s32 %r37, %r36, %r35; + st.shared.b32 [%r37], %r34; + bar.sync 0; + shl.b32 %r38, %r28, 2; + add.s32 %r39, %r36, %r38; + ld.shared.b32 %r27, [%r39]; + .loc 1 45 25 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:25 + mad.wide.s32 %rd7, %r29, 4, %rd3; + .loc 1 45 36 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:36 + and.b32 %r40, %r2, 64; + setp.eq.b32 %p3, %r40, 0; + // begin inline asm + @%p3 st.global.b32 [ %rd7 + 0 ], { %r27 }; + // end inline asm + .loc 1 45 4 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:4 + ret; +$L__tmp5: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 339 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x14c DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 113 +.b8 105 +.b8 116 +.b8 120 +.b8 53 +.b8 104 +.b8 119 +.b8 117 +.b8 112 +.b8 107 +.b8 98 +.b8 106 +.b8 109 +.b8 99 +.b8 115 +.b8 111 +.b8 121 +.b8 107 +.b8 113 +.b8 101 +.b8 112 +.b8 122 +.b8 113 +.b8 99 +.b8 55 +.b8 122 +.b8 99 +.b8 120 +.b8 106 +.b8 99 +.b8 98 +.b8 53 +.b8 97 +.b8 99 +.b8 113 +.b8 107 +.b8 105 +.b8 55 +.b8 122 +.b8 99 +.b8 115 +.b8 106 +.b8 105 +.b8 102 +.b8 114 +.b8 110 +.b8 114 +.b8 122 +.b8 99 +.b8 114 +.b8 114 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 113 +.b8 105 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x2a DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 114 +.b8 109 +.b8 115 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x10e:0x48 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x123:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 44 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x13b:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.source b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.source new file mode 100644 index 0000000000000000000000000000000000000000..4f065d51891ed493c167e6a6ca7939ec6c30d04e --- /dev/null +++ b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.source @@ -0,0 +1,167 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0) +#loc35 = loc(unknown) +#loc38 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0) +#loc42 = loc("in_ptr0"(#loc)) +#loc43 = loc("out_ptr0"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc74 = loc("input"(#loc33)) +#loc75 = loc("a"(#loc38)) +#loc76 = loc("b"(#loc38)) +module { + tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 65536 : i32 loc(#loc46) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc47) + %xoffset = tt.get_program_id x : i32 loc(#loc48) + %xoffset_2 = arith.constant 64 : i32 loc(#loc49) + %xoffset_3 = arith.constant 64 : i32 loc(#loc49) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc49) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc50) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc51) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc52) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc52) + %xmask = arith.constant true loc(#loc53) + %xmask_8 = arith.constant dense : tensor<64x8xi1> loc(#loc53) + %r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc54) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32> -> tensor<1x8xi32> loc(#loc55) + %x0 = arith.constant 32 : i32 loc(#loc56) + %x0_10 = arith.constant 32 : i32 loc(#loc56) + %x0_11 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc56) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc56) + %x1 = arith.constant 32 : i32 loc(#loc57) + %x1_13 = arith.constant 32 : i32 loc(#loc57) + %x1_14 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc57) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc57) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc58) + %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc58) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c8_i32 = arith.constant 8 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c8_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_19 = %_tmp4_16) -> (tensor<64x8xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x8xi32> loc(#loc60) + %r0_index_20 = arith.addi %r0_index, %r0_base_9 : tensor<1x8xi32> loc(#loc60) + %r0_mask = arith.constant dense<128> : tensor<1x8xi32> loc(#loc61) + %r0_mask_21 = arith.cmpi slt, %r0_index_20, %r0_mask : tensor<1x8xi32> loc(#loc61) + %tmp0 = arith.constant 128 : i32 loc(#loc62) + %tmp0_22 = arith.constant 128 : i32 loc(#loc62) + %tmp0_23 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc62) + %tmp0_24 = arith.muli %tmp0_23, %x0_12 : tensor<64x1xi32> loc(#loc62) + %tmp0_25 = tt.broadcast %r0_index_20 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc63) + %tmp0_26 = tt.broadcast %tmp0_24 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc63) + %tmp0_27 = arith.addi %tmp0_25, %tmp0_26 : tensor<64x8xi32> loc(#loc63) + %tmp0_28 = arith.constant 12288 : i32 loc(#loc64) + %tmp0_29 = arith.constant 12288 : i32 loc(#loc64) + %tmp0_30 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc64) + %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<64x1xi32> loc(#loc64) + %tmp0_32 = tt.broadcast %tmp0_31 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc65) + %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<64x8xi32> loc(#loc65) + %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc66) + %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc66) + %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc67) + %tmp0_37 = tt.broadcast %r0_mask_21 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc67) + %tmp0_38 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc67) + %tmp0_39 = arith.truncf %tmp0_38 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc67) + %tmp0_40 = tt.load %tmp0_35, %tmp0_37, %tmp0_39 evictionPolicy = evict_first : tensor<64x8x!tt.ptr> loc(#loc67) + %tmp0_41 = arith.extf %tmp0_40 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc68) + %tmp2 = arith.mulf %tmp0_41, %tmp0_41 : tensor<64x8xf32> loc(#loc69) + %tmp5 = arith.addf %_tmp4_19, %tmp2 : tensor<64x8xf32> loc(#loc70) + %_tmp4_42 = tt.broadcast %r0_mask_21 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc71) + %_tmp4_43 = arith.select %_tmp4_42, %tmp5, %_tmp4_19 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc71) + scf.yield %_tmp4_43 : tensor<64x8xf32> loc(#loc27) + } loc(#loc59) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc72) + %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc73) + %4 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc30) + %5 = tt.addptr %4, %xindex_7 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc30) + tt.store %5, %tmp4_18 : tensor<64x1x!tt.ptr> loc(#loc31) + tt.return loc(#loc32) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S64_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x8xf32> loc("input"(#loc33))) -> tensor<64xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc34) + tt.reduce.return %2 : f32 loc(#loc34) + }) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc34) + tt.return %0 : tensor<64xf32> loc(#loc36) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64xf32> loc(#loc37) + tt.return %1 : tensor<64xf32> loc(#loc37) + } loc(#loc33) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc38)), %b: f32 loc("b"(#loc38))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc39) + tt.return %0 : f32 loc(#loc40) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc41) + tt.return %1 : f32 loc(#loc41) + } loc(#loc38) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":25:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":30:43) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:8) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11) +#loc37 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4) +#loc39 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc40 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11) +#loc41 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4) +#loc46 = loc("xnumel"(#loc1)) +#loc47 = loc("r0_numel"(#loc2)) +#loc48 = loc("xoffset"(#loc3)) +#loc49 = loc("xoffset"(#loc4)) +#loc50 = loc("xindex"(#loc5)) +#loc51 = loc("xindex"(#loc6)) +#loc52 = loc("xindex"(#loc7)) +#loc53 = loc("xmask"(#loc8)) +#loc54 = loc("r0_base"(#loc9)) +#loc55 = loc("r0_base"(#loc10)) +#loc56 = loc("x0"(#loc11)) +#loc57 = loc("x1"(#loc12)) +#loc58 = loc("_tmp4"(#loc13)) +#loc59 = loc("_tmp4"(#loc14)) +#loc60 = loc("r0_index"(#loc15)) +#loc61 = loc("r0_mask"(#loc16)) +#loc62 = loc("tmp0"(#loc17)) +#loc63 = loc("tmp0"(#loc18)) +#loc64 = loc("tmp0"(#loc19)) +#loc65 = loc("tmp0"(#loc20)) +#loc66 = loc("tmp0"(#loc21)) +#loc67 = loc("tmp0"(#loc22)) +#loc68 = loc("tmp0"(#loc23)) +#loc69 = loc("tmp2"(#loc24)) +#loc70 = loc("tmp5"(#loc25)) +#loc71 = loc("_tmp4"(#loc26)) +#loc72 = loc("tmp4"(#loc28)) +#loc73 = loc("tmp4"(#loc29)) diff --git a/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.ttgir b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..d4a9e6bec47fe30fb029ebd51cb004904dd8e618 --- /dev/null +++ b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.ttgir @@ -0,0 +1,121 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0) +#loc1 = loc(unknown) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25) +#loc30 = loc("in_ptr0"(#loc)) +#loc31 = loc("out_ptr0"(#loc)) +#loc32 = loc("xnumel"(#loc)) +#loc33 = loc("r0_numel"(#loc)) +#loc54 = loc("tmp4"(#loc24)) +#loc57 = loc(callsite(#loc1 at #loc54)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x8xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<12288> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc34) + %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc35) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc36) + %xindex_6 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc36) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc36) + %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc36) + %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked> loc(#loc37) + %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc37) + %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<64x1xi32, #blocked> loc(#loc37) + %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<64x1xi32, #blocked1> loc(#loc37) + %r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc38) + %r0_base_13 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x8xi32, #blocked> loc(#loc38) + %x0 = arith.remsi %xindex_11, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc39) + %x1 = arith.divsi %xindex_11, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc40) + %tmp0 = arith.muli %x0, %cst_0 : tensor<64x1xi32, #blocked> loc(#loc41) + %tmp0_14 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked> -> tensor<64x8xi32, #blocked> loc(#loc42) + %tmp0_15 = arith.muli %x1, %cst_1 : tensor<64x1xi32, #blocked> loc(#loc43) + %tmp0_16 = tt.broadcast %tmp0_15 : tensor<64x1xi32, #blocked> -> tensor<64x8xi32, #blocked> loc(#loc44) + %tmp0_17 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x8x!tt.ptr, #blocked> loc(#loc45) + %_tmp4 = scf.for %_tmp4_20 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg5 = %cst_4) -> (tensor<64x8xf32, #blocked>) : i32 { + %r0_index = tt.splat %_tmp4_20 : i32 -> tensor<1x8xi32, #blocked> loc(#loc47) + %r0_index_21 = arith.addi %r0_index, %r0_base_13 : tensor<1x8xi32, #blocked> loc(#loc47) + %r0_mask = arith.cmpi slt, %r0_index_21, %cst : tensor<1x8xi32, #blocked> loc(#loc48) + %tmp0_22 = tt.broadcast %r0_index_21 : tensor<1x8xi32, #blocked> -> tensor<64x8xi32, #blocked> loc(#loc42) + %tmp0_23 = arith.addi %tmp0_22, %tmp0_14 : tensor<64x8xi32, #blocked> loc(#loc42) + %tmp0_24 = arith.addi %tmp0_23, %tmp0_16 : tensor<64x8xi32, #blocked> loc(#loc44) + %tmp0_25 = tt.addptr %tmp0_17, %tmp0_24 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi32, #blocked> loc(#loc45) + %tmp0_26 = tt.broadcast %r0_mask : tensor<1x8xi1, #blocked> -> tensor<64x8xi1, #blocked> loc(#loc49) + %tmp0_27 = tt.load %tmp0_25, %tmp0_26, %cst_3 evictionPolicy = evict_first : tensor<64x8x!tt.ptr, #blocked> loc(#loc49) + %tmp0_28 = arith.extf %tmp0_27 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> loc(#loc50) + %tmp2 = arith.mulf %tmp0_28, %tmp0_28 : tensor<64x8xf32, #blocked> loc(#loc51) + %tmp5 = arith.addf %arg5, %tmp2 : tensor<64x8xf32, #blocked> loc(#loc52) + %_tmp4_29 = arith.select %tmp0_26, %tmp5, %arg5 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked> loc(#loc53) + scf.yield %_tmp4_29 : tensor<64x8xf32, #blocked> loc(#loc22) + } loc(#loc46) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_20: f32 loc(callsite(#loc1 at #loc54)), %tmp4_21: f32 loc(callsite(#loc1 at #loc54))): + %tmp4_22 = arith.addf %tmp4_20, %tmp4_21 : f32 loc(#loc58) + tt.reduce.return %tmp4_22 : f32 loc(#loc56) + }) : (tensor<64x8xf32, #blocked>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc56) + %tmp4_18 = ttg.convert_layout %tmp4 : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc55) + %tmp4_19 = tt.expand_dims %tmp4_18 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc55) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked1> loc(#loc27) + %1 = tt.addptr %0, %xindex_12 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc27) + tt.store %1, %tmp4_19 : tensor<64x1x!tt.ptr, #blocked1> loc(#loc28) + tt.return loc(#loc29) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:8) +#loc23 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4) +#loc34 = loc("xoffset"(#loc2)) +#loc35 = loc("xoffset"(#loc3)) +#loc36 = loc("xindex"(#loc4)) +#loc37 = loc("xindex"(#loc5)) +#loc38 = loc("r0_base"(#loc6)) +#loc39 = loc("x0"(#loc7)) +#loc40 = loc("x1"(#loc8)) +#loc41 = loc("tmp0"(#loc9)) +#loc42 = loc("tmp0"(#loc10)) +#loc43 = loc("tmp0"(#loc11)) +#loc44 = loc("tmp0"(#loc12)) +#loc45 = loc("tmp0"(#loc13)) +#loc46 = loc("_tmp4"(#loc14)) +#loc47 = loc("r0_index"(#loc15)) +#loc48 = loc("r0_mask"(#loc16)) +#loc49 = loc("tmp0"(#loc17)) +#loc50 = loc("tmp0"(#loc18)) +#loc51 = loc("tmp2"(#loc19)) +#loc52 = loc("tmp5"(#loc20)) +#loc53 = loc("_tmp4"(#loc21)) +#loc55 = loc("tmp4"(#loc26)) +#loc56 = loc(callsite(#loc23 at #loc54)) +#loc58 = loc(callsite(#loc25 at #loc56)) diff --git a/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.ttir b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..407429e35d55d7116140391b571e3ab6691d167b --- /dev/null +++ b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.ttir @@ -0,0 +1,118 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0) +#loc1 = loc(unknown) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25) +#loc32 = loc("in_ptr0"(#loc)) +#loc33 = loc("out_ptr0"(#loc)) +#loc34 = loc("xnumel"(#loc)) +#loc35 = loc("r0_numel"(#loc)) +#loc58 = loc("tmp4"(#loc26)) +#loc61 = loc(callsite(#loc1 at #loc58)) +module { + tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<64x8xbf16> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc2) + %c128_i32 = arith.constant 128 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %cst_0 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc1) + %cst_1 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<1x8xi32> loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc1) + %cst_4 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc36) + %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc37) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc38) + %xindex_6 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc39) + %xindex_7 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32> loc(#loc40) + %xindex_8 = arith.addi %xindex_7, %xindex_6 : tensor<64x1xi32> loc(#loc40) + %r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc41) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32> -> tensor<1x8xi32> loc(#loc42) + %x0 = arith.remsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc43) + %x1 = arith.divsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc44) + %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%_tmp4_11 = %cst_3) -> (tensor<64x8xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x8xi32> loc(#loc46) + %r0_index_12 = arith.addi %r0_index, %r0_base_9 : tensor<1x8xi32> loc(#loc46) + %r0_mask = arith.cmpi slt, %r0_index_12, %cst_2 : tensor<1x8xi32> loc(#loc47) + %tmp0 = arith.muli %x0, %cst_1 : tensor<64x1xi32> loc(#loc48) + %tmp0_13 = tt.broadcast %r0_index_12 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc49) + %tmp0_14 = tt.broadcast %tmp0 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc49) + %tmp0_15 = arith.addi %tmp0_13, %tmp0_14 : tensor<64x8xi32> loc(#loc49) + %tmp0_16 = arith.muli %x1, %cst_0 : tensor<64x1xi32> loc(#loc50) + %tmp0_17 = tt.broadcast %tmp0_16 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc51) + %tmp0_18 = arith.addi %tmp0_15, %tmp0_17 : tensor<64x8xi32> loc(#loc51) + %tmp0_19 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc52) + %tmp0_20 = tt.addptr %tmp0_19, %tmp0_18 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc52) + %tmp0_21 = tt.broadcast %r0_mask : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc53) + %tmp0_22 = tt.load %tmp0_20, %tmp0_21, %cst evictionPolicy = evict_first : tensor<64x8x!tt.ptr> loc(#loc53) + %tmp0_23 = arith.extf %tmp0_22 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc54) + %tmp2 = arith.mulf %tmp0_23, %tmp0_23 : tensor<64x8xf32> loc(#loc55) + %tmp5 = arith.addf %_tmp4_11, %tmp2 : tensor<64x8xf32> loc(#loc56) + %_tmp4_24 = arith.select %tmp0_21, %tmp5, %_tmp4_11 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc57) + scf.yield %_tmp4_24 : tensor<64x8xf32> loc(#loc24) + } loc(#loc45) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_11: f32 loc(callsite(#loc1 at #loc58)), %tmp4_12: f32 loc(callsite(#loc1 at #loc58))): + %tmp4_13 = arith.addf %tmp4_11, %tmp4_12 : f32 loc(#loc62) + tt.reduce.return %tmp4_13 : f32 loc(#loc60) + }) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc60) + %tmp4_10 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc59) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc29) + %1 = tt.addptr %0, %xindex_8 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc29) + tt.store %1, %tmp4_10 : tensor<64x1x!tt.ptr> loc(#loc30) + tt.return loc(#loc31) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":32:43) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:27) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":33:31) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:8) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc27 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4) +#loc36 = loc("xoffset"(#loc3)) +#loc37 = loc("xoffset"(#loc4)) +#loc38 = loc("xindex"(#loc5)) +#loc39 = loc("xindex"(#loc6)) +#loc40 = loc("xindex"(#loc7)) +#loc41 = loc("r0_base"(#loc8)) +#loc42 = loc("r0_base"(#loc9)) +#loc43 = loc("x0"(#loc10)) +#loc44 = loc("x1"(#loc11)) +#loc45 = loc("_tmp4"(#loc2)) +#loc46 = loc("r0_index"(#loc12)) +#loc47 = loc("r0_mask"(#loc13)) +#loc48 = loc("tmp0"(#loc14)) +#loc49 = loc("tmp0"(#loc15)) +#loc50 = loc("tmp0"(#loc16)) +#loc51 = loc("tmp0"(#loc17)) +#loc52 = loc("tmp0"(#loc18)) +#loc53 = loc("tmp0"(#loc19)) +#loc54 = loc("tmp0"(#loc20)) +#loc55 = loc("tmp2"(#loc21)) +#loc56 = loc("tmp5"(#loc22)) +#loc57 = loc("_tmp4"(#loc23)) +#loc59 = loc("tmp4"(#loc28)) +#loc60 = loc(callsite(#loc25 at #loc58)) +#loc62 = loc(callsite(#loc27 at #loc60)) diff --git a/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/__grp__triton_red_fused__fused_rms_norm_view_0.json b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/__grp__triton_red_fused__fused_rms_norm_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..aa72ff5194566353c399d6f451f35fd9ce02e11c --- /dev/null +++ b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/__grp__triton_red_fused__fused_rms_norm_view_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__fused_rms_norm_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.source", "triton_red_fused__fused_rms_norm_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.ttir", "triton_red_fused__fused_rms_norm_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.ttgir", "triton_red_fused__fused_rms_norm_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.llir", "triton_red_fused__fused_rms_norm_view_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.ptx", "triton_red_fused__fused_rms_norm_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.cubin", "triton_red_fused__fused_rms_norm_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.json"}} \ No newline at end of file diff --git a/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.cubin b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..221ae986331cf4f8808c1112afb0012189227783 Binary files /dev/null and b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.cubin differ diff --git a/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.json b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..142d604f0dcd72975c1451ac2f22c38bfb06fe36 --- /dev/null +++ b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.json @@ -0,0 +1 @@ +{"hash": "dc06617612a5a0f6f63752ef76fdf496216dbb553b4f3e1b52ab156f05ed066b", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 16, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm_view_0"} \ No newline at end of file diff --git a/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.llir b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..f7f4c1002e9e214125923b9c68c6cdbaaebe6748 --- /dev/null +++ b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.llir @@ -0,0 +1,136 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__fused_rms_norm_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = shl i32 %7, 2, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = and i32 %9, 96, !dbg !9 + %11 = lshr exact i32 %10, 5, !dbg !9 + %12 = and i32 %9, 3, !dbg !9 + %13 = or disjoint i32 %11, %8, !dbg !10 + %14 = or disjoint i32 %8, %12, !dbg !10 + %15 = shl nuw nsw i32 %9, 2, !dbg !11 + %16 = and i32 %15, 124, !dbg !11 + %17 = sdiv i32 %13, 32, !dbg !12 + %18 = mul i32 %17, 32, !dbg !13 + %.decomposed = sub i32 %13, %18, !dbg !13 + %19 = shl nsw i32 %.decomposed, 7, !dbg !14 + %20 = or disjoint i32 %19, %16, !dbg !15 + %21 = mul i32 %17, 12288, !dbg !16 + %22 = add i32 %20, %21, !dbg !17 + %23 = sext i32 %22 to i64, !dbg !18 + %24 = getelementptr bfloat, ptr addrspace(1) %0, i64 %23, !dbg !18 + %25 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %26 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %24, i64 %25, i1 true) #4, !dbg !19 + %27 = extractvalue { i32, i32 } %26, 0, !dbg !19 + %28 = bitcast i32 %27 to <2 x bfloat>, !dbg !19 + %29 = extractvalue { i32, i32 } %26, 1, !dbg !19 + %30 = bitcast i32 %29 to <2 x bfloat>, !dbg !19 + %31 = extractelement <2 x bfloat> %28, i64 0, !dbg !19 + %32 = extractelement <2 x bfloat> %28, i64 1, !dbg !19 + %33 = extractelement <2 x bfloat> %30, i64 0, !dbg !19 + %34 = extractelement <2 x bfloat> %30, i64 1, !dbg !19 + %35 = fpext bfloat %31 to float, !dbg !20 + %36 = fpext bfloat %32 to float, !dbg !20 + %37 = fpext bfloat %33 to float, !dbg !20 + %38 = fpext bfloat %34 to float, !dbg !20 + %39 = fmul float %35, %35, !dbg !21 + %40 = fmul float %36, %36, !dbg !21 + %41 = fmul float %37, %37, !dbg !21 + %42 = fmul float %38, %38, !dbg !21 + %43 = fadd float %39, %40, !dbg !22 + %44 = fadd float %41, %43, !dbg !22 + %45 = fadd float %42, %44, !dbg !22 + %46 = bitcast float %45 to i32, !dbg !25 + %47 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %46, i32 16, i32 31), !dbg !25 + %48 = bitcast i32 %47 to float, !dbg !25 + %49 = fadd float %45, %48, !dbg !22 + %50 = bitcast float %49 to i32, !dbg !25 + %51 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %50, i32 8, i32 31), !dbg !25 + %52 = bitcast i32 %51 to float, !dbg !25 + %53 = fadd float %49, %52, !dbg !22 + %54 = bitcast float %53 to i32, !dbg !25 + %55 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %54, i32 4, i32 31), !dbg !25 + %56 = bitcast i32 %55 to float, !dbg !25 + %57 = fadd float %53, %56, !dbg !22 + %58 = bitcast float %57 to i32, !dbg !25 + %59 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %58, i32 2, i32 31), !dbg !25 + %60 = bitcast i32 %59 to float, !dbg !25 + %61 = fadd float %57, %60, !dbg !22 + %62 = bitcast float %61 to i32, !dbg !25 + %63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %62, i32 1, i32 31), !dbg !25 + %64 = bitcast i32 %63 to float, !dbg !25 + %65 = fadd float %61, %64, !dbg !22 + %66 = lshr exact i32 %10, 3, !dbg !28 + %67 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %66, !dbg !28 + store float %65, ptr addrspace(3) %67, align 4, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %68 = shl nuw nsw i32 %12, 2, !dbg !28 + %69 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %68, !dbg !28 + %70 = load i32, ptr addrspace(3) %69, align 4, !dbg !28 + %71 = sext i32 %14 to i64, !dbg !29 + %72 = getelementptr float, ptr addrspace(1) %1, i64 %71, !dbg !29 + %73 = and i32 %9, 124, !dbg !30 + %74 = icmp eq i32 %73, 0, !dbg !30 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %70, ptr addrspace(1) %72, i1 %74) #4, !dbg !30 + ret void, !dbg !31 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm_view_0", linkageName: "triton_red_fused__fused_rms_norm_view_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 19, scope: !4) +!13 = !DILocation(line: 28, column: 19, scope: !4) +!14 = !DILocation(line: 38, column: 45, scope: !4) +!15 = !DILocation(line: 38, column: 41, scope: !4) +!16 = !DILocation(line: 38, column: 56, scope: !4) +!17 = !DILocation(line: 38, column: 50, scope: !4) +!18 = !DILocation(line: 38, column: 34, scope: !4) +!19 = !DILocation(line: 38, column: 61, scope: !4) +!20 = !DILocation(line: 38, column: 115, scope: !4) +!21 = !DILocation(line: 40, column: 22, scope: !4) +!22 = !DILocation(line: 263, column: 15, scope: !23, inlinedAt: !25) +!23 = distinct !DILexicalBlockFile(scope: !4, file: !24, discriminator: 0) +!24 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!25 = !DILocation(line: 293, column: 36, scope: !23, inlinedAt: !26) +!26 = !DILocation(line: 44, column: 25, scope: !27) +!27 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0) +!28 = !DILocation(line: 44, column: 28, scope: !4) +!29 = !DILocation(line: 45, column: 25, scope: !4) +!30 = !DILocation(line: 45, column: 36, scope: !4) +!31 = !DILocation(line: 45, column: 4, scope: !4) diff --git a/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.ptx b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..0433910d2804f7bb4ec0f8846c28cdd91cffe67a --- /dev/null +++ b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.ptx @@ -0,0 +1,506 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused__fused_rms_norm_view_0 // -- Begin function triton_red_fused__fused_rms_norm_view_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__fused_rms_norm_view_0 +.visible .entry triton_red_fused__fused_rms_norm_view_0( + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_1, + .param .u32 triton_red_fused__fused_rms_norm_view_0_param_2, + .param .u32 triton_red_fused__fused_rms_norm_view_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_5 +) +.reqntid 128 +{ + .reg .pred %p<3>; + .reg .b16 %rs<5>; + .reg .b32 %r<48>; + .reg .b64 %rd<6>; + .loc 1 18 0 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:18:0 + +// %bb.0: + ld.param.b64 %rd4, [triton_red_fused__fused_rms_norm_view_0_param_0]; + ld.param.b64 %rd5, [triton_red_fused__fused_rms_norm_view_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:23:28 + mov.u32 %r5, %ctaid.x; + .loc 1 23 33 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:23:33 + shl.b32 %r6, %r5, 2; + .loc 1 24 44 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:44 + mov.u32 %r7, %tid.x; + and.b32 %r8, %r7, 96; + bfe.u32 %r9, %r7, 5, 2; + and.b32 %r10, %r7, 3; + .loc 1 24 23 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:23 + or.b32 %r11, %r9, %r6; + or.b32 %r12, %r6, %r10; + .loc 1 26 37 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:26:37 + shl.b32 %r13, %r7, 2; + and.b32 %r14, %r13, 124; + .loc 1 29 19 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:29:19 + bfe.s32 %r15, %r5, 29, 1; + shr.u32 %r16, %r15, 27; + add.s32 %r17, %r11, %r16; + shr.u32 %r18, %r17, 5; + .loc 1 28 19 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:28:19 + and.b32 %r19, %r17, 33554400; + sub.s32 %r20, %r11, %r19; + .loc 1 38 45 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:45 + shl.b32 %r21, %r20, 7; + .loc 1 38 41 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:41 + or.b32 %r22, %r21, %r14; + .loc 1 38 50 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:50 + mad.lo.s32 %r23, %r18, 12288, %r22; + .loc 1 38 34 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:34 + mad.wide.s32 %rd1, %r23, 2, %rd4; + .loc 1 38 61 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:61 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r3, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r1, %r3; + mov.u32 %r2, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2; + // end inline asm + mov.b32 {%rs1, %rs2}, %r1; + mov.b32 {%rs3, %rs4}, %r2; + .loc 1 38 115 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115 + cvt.f32.bf16 %r24, %rs1; + cvt.f32.bf16 %r25, %rs2; + cvt.f32.bf16 %r26, %rs3; + cvt.f32.bf16 %r27, %rs4; + .loc 1 40 22 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:40:22 + mul.f32 %r28, %r25, %r25; +$L__tmp1: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + fma.rn.f32 %r29, %r24, %r24, %r28; + fma.rn.f32 %r30, %r26, %r26, %r29; + fma.rn.f32 %r31, %r27, %r27, %r30; +$L__tmp2: + .loc 2 293 36 // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] + shfl.sync.bfly.b32 %r32, %r31, 16, 31, -1; +$L__tmp3: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + add.f32 %r33, %r31, %r32; +$L__tmp4: + .loc 2 293 36 // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] + shfl.sync.bfly.b32 %r34, %r33, 8, 31, -1; +$L__tmp5: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + add.f32 %r35, %r33, %r34; +$L__tmp6: + .loc 2 293 36 // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] + shfl.sync.bfly.b32 %r36, %r35, 4, 31, -1; +$L__tmp7: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + add.f32 %r37, %r35, %r36; +$L__tmp8: + .loc 2 293 36 // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] + shfl.sync.bfly.b32 %r38, %r37, 2, 31, -1; +$L__tmp9: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + add.f32 %r39, %r37, %r38; +$L__tmp10: + .loc 2 293 36 // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] + shfl.sync.bfly.b32 %r40, %r39, 1, 31, -1; +$L__tmp11: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + add.f32 %r41, %r39, %r40; +$L__tmp12: + .loc 1 44 28 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:28 + shr.u32 %r42, %r8, 3; + mov.b32 %r43, global_smem; + add.s32 %r44, %r43, %r42; + st.shared.b32 [%r44], %r41; + bar.sync 0; + shl.b32 %r45, %r10, 2; + add.s32 %r46, %r43, %r45; + ld.shared.b32 %r4, [%r46]; + .loc 1 45 25 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:25 + mad.wide.s32 %rd3, %r12, 4, %rd5; + .loc 1 45 36 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:36 + and.b32 %r47, %r7, 124; + setp.eq.b32 %p2, %r47, 0; + // begin inline asm + @%p2 st.global.b32 [ %rd3 + 0 ], { %r4 }; + // end inline asm + .loc 1 45 4 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:4 + ret; +$L__tmp13: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 339 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x14c DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 119 +.b8 118 +.b8 121 +.b8 116 +.b8 52 +.b8 50 +.b8 55 +.b8 51 +.b8 105 +.b8 117 +.b8 51 +.b8 51 +.b8 109 +.b8 112 +.b8 101 +.b8 101 +.b8 55 +.b8 104 +.b8 98 +.b8 101 +.b8 116 +.b8 53 +.b8 106 +.b8 53 +.b8 101 +.b8 113 +.b8 52 +.b8 52 +.b8 100 +.b8 54 +.b8 102 +.b8 115 +.b8 104 +.b8 103 +.b8 119 +.b8 107 +.b8 121 +.b8 120 +.b8 107 +.b8 110 +.b8 53 +.b8 50 +.b8 103 +.b8 103 +.b8 103 +.b8 107 +.b8 105 +.b8 113 +.b8 104 +.b8 106 +.b8 53 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 119 +.b8 118 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x2a DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 114 +.b8 109 +.b8 115 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x10e:0x48 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x123:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 44 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x13b:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.source b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.source new file mode 100644 index 0000000000000000000000000000000000000000..11bcf95e8dcbe462f3479afecf407c7871e0022c --- /dev/null +++ b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.source @@ -0,0 +1,167 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0) +#loc35 = loc(unknown) +#loc38 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0) +#loc42 = loc("in_ptr0"(#loc)) +#loc43 = loc("out_ptr0"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc74 = loc("input"(#loc33)) +#loc75 = loc("a"(#loc38)) +#loc76 = loc("b"(#loc38)) +module { + tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 8192 : i32 loc(#loc46) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc47) + %xoffset = tt.get_program_id x : i32 loc(#loc48) + %xoffset_2 = arith.constant 4 : i32 loc(#loc49) + %xoffset_3 = arith.constant 4 : i32 loc(#loc49) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc49) + %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc50) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc51) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<4x1xi32> loc(#loc52) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<4x1xi32> loc(#loc52) + %xmask = arith.constant true loc(#loc53) + %xmask_8 = arith.constant dense : tensor<4x128xi1> loc(#loc53) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc54) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc55) + %x0 = arith.constant 32 : i32 loc(#loc56) + %x0_10 = arith.constant 32 : i32 loc(#loc56) + %x0_11 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc56) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<4x1xi32> loc(#loc56) + %x1 = arith.constant 32 : i32 loc(#loc57) + %x1_13 = arith.constant 32 : i32 loc(#loc57) + %x1_14 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc57) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<4x1xi32> loc(#loc57) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc58) + %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc58) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c128_i32 = arith.constant 128 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_19 = %_tmp4_16) -> (tensor<4x128xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc60) + %r0_index_20 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc60) + %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc61) + %r0_mask_21 = arith.cmpi slt, %r0_index_20, %r0_mask : tensor<1x128xi32> loc(#loc61) + %tmp0 = arith.constant 128 : i32 loc(#loc62) + %tmp0_22 = arith.constant 128 : i32 loc(#loc62) + %tmp0_23 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc62) + %tmp0_24 = arith.muli %tmp0_23, %x0_12 : tensor<4x1xi32> loc(#loc62) + %tmp0_25 = tt.broadcast %r0_index_20 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc63) + %tmp0_26 = tt.broadcast %tmp0_24 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc63) + %tmp0_27 = arith.addi %tmp0_25, %tmp0_26 : tensor<4x128xi32> loc(#loc63) + %tmp0_28 = arith.constant 12288 : i32 loc(#loc64) + %tmp0_29 = arith.constant 12288 : i32 loc(#loc64) + %tmp0_30 = arith.constant dense<12288> : tensor<4x1xi32> loc(#loc64) + %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<4x1xi32> loc(#loc64) + %tmp0_32 = tt.broadcast %tmp0_31 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc65) + %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<4x128xi32> loc(#loc65) + %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc66) + %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc66) + %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc67) + %tmp0_37 = tt.broadcast %r0_mask_21 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc67) + %tmp0_38 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc67) + %tmp0_39 = arith.truncf %tmp0_38 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc67) + %tmp0_40 = tt.load %tmp0_35, %tmp0_37, %tmp0_39 evictionPolicy = evict_first : tensor<4x128x!tt.ptr> loc(#loc67) + %tmp0_41 = arith.extf %tmp0_40 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc68) + %tmp2 = arith.mulf %tmp0_41, %tmp0_41 : tensor<4x128xf32> loc(#loc69) + %tmp5 = arith.addf %_tmp4_19, %tmp2 : tensor<4x128xf32> loc(#loc70) + %_tmp4_42 = tt.broadcast %r0_mask_21 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc71) + %_tmp4_43 = arith.select %_tmp4_42, %tmp5, %_tmp4_19 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc71) + scf.yield %_tmp4_43 : tensor<4x128xf32> loc(#loc27) + } loc(#loc59) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S4_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc72) + %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc73) + %4 = tt.splat %out_ptr0 : !tt.ptr -> tensor<4x1x!tt.ptr> loc(#loc30) + %5 = tt.addptr %4, %xindex_7 : tensor<4x1x!tt.ptr>, tensor<4x1xi32> loc(#loc30) + tt.store %5, %tmp4_18 : tensor<4x1x!tt.ptr> loc(#loc31) + tt.return loc(#loc32) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S4_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<4x128xf32> loc("input"(#loc33))) -> tensor<4xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc34) + tt.reduce.return %2 : f32 loc(#loc34) + }) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc34) + tt.return %0 : tensor<4xf32> loc(#loc36) + ^bb1: // no predecessors + %1 = ub.poison : tensor<4xf32> loc(#loc37) + tt.return %1 : tensor<4xf32> loc(#loc37) + } loc(#loc33) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc38)), %b: f32 loc("b"(#loc38))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc39) + tt.return %0 : f32 loc(#loc40) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc41) + tt.return %1 : f32 loc(#loc41) + } loc(#loc38) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":25:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":30:43) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:8) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11) +#loc37 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4) +#loc39 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc40 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11) +#loc41 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4) +#loc46 = loc("xnumel"(#loc1)) +#loc47 = loc("r0_numel"(#loc2)) +#loc48 = loc("xoffset"(#loc3)) +#loc49 = loc("xoffset"(#loc4)) +#loc50 = loc("xindex"(#loc5)) +#loc51 = loc("xindex"(#loc6)) +#loc52 = loc("xindex"(#loc7)) +#loc53 = loc("xmask"(#loc8)) +#loc54 = loc("r0_base"(#loc9)) +#loc55 = loc("r0_base"(#loc10)) +#loc56 = loc("x0"(#loc11)) +#loc57 = loc("x1"(#loc12)) +#loc58 = loc("_tmp4"(#loc13)) +#loc59 = loc("_tmp4"(#loc14)) +#loc60 = loc("r0_index"(#loc15)) +#loc61 = loc("r0_mask"(#loc16)) +#loc62 = loc("tmp0"(#loc17)) +#loc63 = loc("tmp0"(#loc18)) +#loc64 = loc("tmp0"(#loc19)) +#loc65 = loc("tmp0"(#loc20)) +#loc66 = loc("tmp0"(#loc21)) +#loc67 = loc("tmp0"(#loc22)) +#loc68 = loc("tmp0"(#loc23)) +#loc69 = loc("tmp2"(#loc24)) +#loc70 = loc("tmp5"(#loc25)) +#loc71 = loc("_tmp4"(#loc26)) +#loc72 = loc("tmp4"(#loc28)) +#loc73 = loc("tmp4"(#loc29)) diff --git a/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.ttgir b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..fa653a4402fd3bb1e45f866eca545bb4cba8c417 --- /dev/null +++ b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.ttgir @@ -0,0 +1,108 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 4], order = [0, 1]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0) +#loc1 = loc(unknown) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25) +#loc27 = loc("in_ptr0"(#loc)) +#loc28 = loc("out_ptr0"(#loc)) +#loc29 = loc("xnumel"(#loc)) +#loc30 = loc("r0_numel"(#loc)) +#loc49 = loc("tmp4"(#loc21)) +#loc52 = loc(callsite(#loc1 at #loc49)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<12288> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<32> : tensor<4x1xi32, #blocked> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<4x128xbf16, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<4x128xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc31) + %xoffset_5 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc32) + %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc33) + %xindex_6 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc33) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4x1xi32, #blocked> loc(#loc33) + %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc33) + %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<4x1xi32, #blocked> loc(#loc34) + %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<4x1xi32, #blocked1> loc(#loc34) + %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<4x1xi32, #blocked> loc(#loc34) + %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<4x1xi32, #blocked1> loc(#loc34) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc35) + %r0_base_13 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc35) + %x0 = arith.remsi %xindex_11, %cst_2 : tensor<4x1xi32, #blocked> loc(#loc36) + %x1 = arith.divsi %xindex_11, %cst_2 : tensor<4x1xi32, #blocked> loc(#loc37) + %r0_mask = arith.cmpi slt, %r0_base_13, %cst : tensor<1x128xi32, #blocked> loc(#loc38) + %tmp0 = arith.muli %x0, %cst_0 : tensor<4x1xi32, #blocked> loc(#loc39) + %tmp0_14 = tt.broadcast %r0_base_13 : tensor<1x128xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc40) + %tmp0_15 = tt.broadcast %tmp0 : tensor<4x1xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc40) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<4x128xi32, #blocked> loc(#loc40) + %tmp0_17 = arith.muli %x1, %cst_1 : tensor<4x1xi32, #blocked> loc(#loc41) + %tmp0_18 = tt.broadcast %tmp0_17 : tensor<4x1xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc42) + %tmp0_19 = arith.addi %tmp0_16, %tmp0_18 : tensor<4x128xi32, #blocked> loc(#loc42) + %tmp0_20 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr, #blocked> loc(#loc43) + %tmp0_21 = tt.addptr %tmp0_20, %tmp0_19 : tensor<4x128x!tt.ptr, #blocked>, tensor<4x128xi32, #blocked> loc(#loc43) + %tmp0_22 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked> -> tensor<4x128xi1, #blocked> loc(#loc44) + %tmp0_23 = tt.load %tmp0_21, %tmp0_22, %cst_3 evictionPolicy = evict_first : tensor<4x128x!tt.ptr, #blocked> loc(#loc44) + %tmp0_24 = arith.extf %tmp0_23 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc45) + %tmp2 = arith.mulf %tmp0_24, %tmp0_24 : tensor<4x128xf32, #blocked> loc(#loc46) + %tmp5 = arith.addf %tmp2, %cst_4 : tensor<4x128xf32, #blocked> loc(#loc47) + %_tmp4 = arith.select %tmp0_22, %tmp5, %cst_4 : tensor<4x128xi1, #blocked>, tensor<4x128xf32, #blocked> loc(#loc48) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_27: f32 loc(callsite(#loc1 at #loc49)), %tmp4_28: f32 loc(callsite(#loc1 at #loc49))): + %tmp4_29 = arith.addf %tmp4_27, %tmp4_28 : f32 loc(#loc53) + tt.reduce.return %tmp4_29 : f32 loc(#loc51) + }) : (tensor<4x128xf32, #blocked>) -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc51) + %tmp4_25 = ttg.convert_layout %tmp4 : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc50) + %tmp4_26 = tt.expand_dims %tmp4_25 {axis = 1 : i32} : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xf32, #blocked1> loc(#loc50) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<4x1x!tt.ptr, #blocked1> loc(#loc24) + %1 = tt.addptr %0, %xindex_12 : tensor<4x1x!tt.ptr, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc24) + tt.store %1, %tmp4_26 : tensor<4x1x!tt.ptr, #blocked1> loc(#loc25) + tt.return loc(#loc26) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4) +#loc31 = loc("xoffset"(#loc2)) +#loc32 = loc("xoffset"(#loc3)) +#loc33 = loc("xindex"(#loc4)) +#loc34 = loc("xindex"(#loc5)) +#loc35 = loc("r0_base"(#loc6)) +#loc36 = loc("x0"(#loc7)) +#loc37 = loc("x1"(#loc8)) +#loc38 = loc("r0_mask"(#loc9)) +#loc39 = loc("tmp0"(#loc10)) +#loc40 = loc("tmp0"(#loc11)) +#loc41 = loc("tmp0"(#loc12)) +#loc42 = loc("tmp0"(#loc13)) +#loc43 = loc("tmp0"(#loc14)) +#loc44 = loc("tmp0"(#loc15)) +#loc45 = loc("tmp0"(#loc16)) +#loc46 = loc("tmp2"(#loc17)) +#loc47 = loc("tmp5"(#loc18)) +#loc48 = loc("_tmp4"(#loc19)) +#loc50 = loc("tmp4"(#loc23)) +#loc51 = loc(callsite(#loc20 at #loc49)) +#loc53 = loc(callsite(#loc22 at #loc51)) diff --git a/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.ttir b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..08d5b5e90a3b6a75505d5fc4ca7ec67469f9f082 --- /dev/null +++ b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.ttir @@ -0,0 +1,105 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0) +#loc2 = loc(unknown) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25) +#loc29 = loc("in_ptr0"(#loc)) +#loc30 = loc("out_ptr0"(#loc)) +#loc31 = loc("xnumel"(#loc)) +#loc32 = loc("r0_numel"(#loc)) +#loc53 = loc("tmp4"(#loc23)) +#loc56 = loc(callsite(#loc2 at #loc53)) +module { + tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %tmp0 = arith.constant dense<0.000000e+00> : tensor<4x128xbf16> loc(#loc33) + %cst = arith.constant dense<12288> : tensor<4x1xi32> loc(#loc2) + %cst_0 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc2) + %cst_1 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc2) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc2) + %cst_3 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc2) + %c4_i32 = arith.constant 4 : i32 loc(#loc2) + %xoffset = tt.get_program_id x : i32 loc(#loc34) + %xoffset_4 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc35) + %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc36) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc37) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<4x1xi32> loc(#loc38) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<4x1xi32> loc(#loc38) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc39) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc40) + %x0 = arith.remsi %xindex_7, %cst_3 : tensor<4x1xi32> loc(#loc41) + %x1 = arith.divsi %xindex_7, %cst_3 : tensor<4x1xi32> loc(#loc42) + %r0_mask = arith.cmpi slt, %r0_base_8, %cst_1 : tensor<1x128xi32> loc(#loc43) + %tmp0_9 = arith.muli %x0, %cst_0 : tensor<4x1xi32> loc(#loc44) + %tmp0_10 = tt.broadcast %r0_base_8 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc45) + %tmp0_11 = tt.broadcast %tmp0_9 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc45) + %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<4x128xi32> loc(#loc45) + %tmp0_13 = arith.muli %x1, %cst : tensor<4x1xi32> loc(#loc46) + %tmp0_14 = tt.broadcast %tmp0_13 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc47) + %tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<4x128xi32> loc(#loc47) + %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc48) + %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc48) + %tmp0_18 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc33) + %tmp0_19 = tt.load %tmp0_17, %tmp0_18, %tmp0 evictionPolicy = evict_first : tensor<4x128x!tt.ptr> loc(#loc33) + %tmp0_20 = arith.extf %tmp0_19 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc49) + %tmp2 = arith.mulf %tmp0_20, %tmp0_20 : tensor<4x128xf32> loc(#loc50) + %tmp5 = arith.addf %tmp2, %cst_2 : tensor<4x128xf32> loc(#loc51) + %_tmp4 = arith.select %tmp0_18, %tmp5, %cst_2 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc52) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_22: f32 loc(callsite(#loc2 at #loc53)), %tmp4_23: f32 loc(callsite(#loc2 at #loc53))): + %tmp4_24 = arith.addf %tmp4_22, %tmp4_23 : f32 loc(#loc57) + tt.reduce.return %tmp4_24 : f32 loc(#loc55) + }) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc55) + %tmp4_21 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc54) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<4x1x!tt.ptr> loc(#loc26) + %1 = tt.addptr %0, %xindex_7 : tensor<4x1x!tt.ptr>, tensor<4x1xi32> loc(#loc26) + tt.store %1, %tmp4_21 : tensor<4x1x!tt.ptr> loc(#loc27) + tt.return loc(#loc28) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:27) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4) +#loc33 = loc("tmp0"(#loc1)) +#loc34 = loc("xoffset"(#loc3)) +#loc35 = loc("xoffset"(#loc4)) +#loc36 = loc("xindex"(#loc5)) +#loc37 = loc("xindex"(#loc6)) +#loc38 = loc("xindex"(#loc7)) +#loc39 = loc("r0_base"(#loc8)) +#loc40 = loc("r0_base"(#loc9)) +#loc41 = loc("x0"(#loc10)) +#loc42 = loc("x1"(#loc11)) +#loc43 = loc("r0_mask"(#loc12)) +#loc44 = loc("tmp0"(#loc13)) +#loc45 = loc("tmp0"(#loc14)) +#loc46 = loc("tmp0"(#loc15)) +#loc47 = loc("tmp0"(#loc16)) +#loc48 = loc("tmp0"(#loc17)) +#loc49 = loc("tmp0"(#loc18)) +#loc50 = loc("tmp2"(#loc19)) +#loc51 = loc("tmp5"(#loc20)) +#loc52 = loc("_tmp4"(#loc21)) +#loc54 = loc("tmp4"(#loc25)) +#loc55 = loc(callsite(#loc22 at #loc53)) +#loc57 = loc(callsite(#loc24 at #loc55)) diff --git a/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..25318e6400bd1c4b0e1aaf27589233ff37ad1a92 --- /dev/null +++ b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json"}} \ No newline at end of file diff --git a/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..2b27b3f9e66b184f1be571893b39afec38640c1e Binary files /dev/null and b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin differ diff --git a/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..90135d9a1efcdd6bf9b0b02cbdf870abfafe1b07 --- /dev/null +++ b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json @@ -0,0 +1 @@ +{"hash": "dc21aaa9e2fe6753d9008bbdbf3b92b903834509ab1bb88d548bc60a117f9b01", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 1024, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0"} \ No newline at end of file diff --git a/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..16ab18a45fdcb649f4934668d4f5300ccfc3a884 --- /dev/null +++ b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir @@ -0,0 +1,464 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %13 = shl i32 %12, 6, !dbg !9 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %15 = and i32 %14, 252, !dbg !10 + %16 = lshr exact i32 %15, 2, !dbg !10 + %17 = or disjoint i32 %16, %13, !dbg !11 + %18 = and i32 %14, 3, !dbg !12 + %19 = sdiv i32 %17, 32, !dbg !13 + %20 = shl i32 %17, 7 + %21 = shl i32 %19, 15 + %22 = add i32 %21, %20 + %23 = add i32 %22, 4096 + %24 = zext nneg i32 %18 to i64, !dbg !14 + br label %25, !dbg !14 + +25: ; preds = %11, %25 + %indvars.iv = phi i64 [ 0, %11 ], [ %indvars.iv.next, %25 ] + %26 = phi float [ 0.000000e+00, %11 ], [ %47, %25 ] + %27 = phi float [ 0.000000e+00, %11 ], [ %45, %25 ] + %28 = trunc nuw nsw i64 %indvars.iv to i32, !dbg !15 + %29 = or disjoint i32 %18, %28, !dbg !15 + %30 = add i32 %23, %29, !dbg !15 + %31 = sext i32 %30 to i64, !dbg !16 + %32 = getelementptr bfloat, ptr addrspace(1) %2, i64 %31, !dbg !16 + %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17 + %34 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %32, i64 %33, i1 true) #6, !dbg !17 + %35 = bitcast i16 %34 to bfloat, !dbg !17 + %36 = fpext bfloat %35 to float, !dbg !18 + %37 = add i32 %22, %29, !dbg !19 + %38 = sext i32 %37 to i64, !dbg !20 + %39 = getelementptr bfloat, ptr addrspace(1) %2, i64 %38, !dbg !20 + %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !21 + %41 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %39, i64 %40, i1 true) #6, !dbg !21 + %42 = bitcast i16 %41 to bfloat, !dbg !21 + %43 = fpext bfloat %42 to float, !dbg !22 + %44 = fmul float %36, %36, !dbg !23 + %45 = fadd float %27, %44, !dbg !24 + %46 = fmul float %43, %43, !dbg !25 + %47 = fadd float %26, %46, !dbg !26 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4, !dbg !14 + %48 = icmp samesign ult i64 %indvars.iv, 124, !dbg !14 + br i1 %48, label %25, label %49, !dbg !14 + +49: ; preds = %25 + %50 = and i32 %14, 63, !dbg !10 + %51 = or disjoint i32 %13, %50, !dbg !11 + %52 = and i32 %14, 192, !dbg !12 + %53 = lshr exact i32 %52, 6, !dbg !12 + %54 = sdiv i32 %51, 32, !dbg !13 + %55 = bitcast float %45 to i32, !dbg !27 + %56 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %55, i32 2, i32 31), !dbg !27 + %57 = bitcast i32 %56 to float, !dbg !27 + %58 = fadd float %45, %57, !dbg !32 + %59 = bitcast float %58 to i32, !dbg !27 + %60 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %59, i32 1, i32 31), !dbg !27 + %61 = bitcast i32 %60 to float, !dbg !27 + %62 = fadd float %58, %61, !dbg !32 + %63 = bitcast float %47 to i32, !dbg !33 + %64 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 2, i32 31), !dbg !33 + %65 = bitcast i32 %64 to float, !dbg !33 + %66 = fadd float %47, %65, !dbg !35 + %67 = bitcast float %66 to i32, !dbg !33 + %68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 1, i32 31), !dbg !33 + %69 = bitcast i32 %68 to float, !dbg !33 + %70 = fadd float %66, %69, !dbg !35 + %71 = shl i32 %19, 7, !dbg !36 + %72 = tail call float @llvm.nvvm.div.full(float %70, float 1.280000e+02), !dbg !37 + %73 = fadd float %72, 0x3EB0C6F7A0000000, !dbg !38 + %74 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !39 + %.not.i = icmp eq i32 %74, 0, !dbg !39 + br i1 %.not.i, label %77, label %75, !dbg !39 + +75: ; preds = %49 + %76 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %73), !dbg !39 + br label %__nv_rsqrtf.exit, !dbg !39 + +77: ; preds = %49 + %78 = tail call float @llvm.nvvm.rsqrt.approx.f(float %73), !dbg !39 + br label %__nv_rsqrtf.exit, !dbg !39 + +__nv_rsqrtf.exit: ; preds = %75, %77 + %.0.i = phi float [ %76, %75 ], [ %78, %77 ], !dbg !39 + %79 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %15, !dbg !40 + store float %.0.i, ptr addrspace(3) %79, align 4, !dbg !40 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !40 + %80 = shl nuw nsw i32 %50, 2, !dbg !40 + %81 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %80, !dbg !40 + %82 = load float, ptr addrspace(3) %81, align 4, !dbg !40 + %83 = tail call float @llvm.nvvm.div.full(float %62, float 1.280000e+02), !dbg !41 + %84 = fadd float %83, 0x3EB0C6F7A0000000, !dbg !42 + %85 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !43 + %.not.i2 = icmp eq i32 %85, 0, !dbg !43 + br i1 %.not.i2, label %88, label %86, !dbg !43 + +86: ; preds = %__nv_rsqrtf.exit + %87 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %84), !dbg !43 + br label %__nv_rsqrtf.exit4, !dbg !43 + +88: ; preds = %__nv_rsqrtf.exit + %89 = tail call float @llvm.nvvm.rsqrt.approx.f(float %84), !dbg !43 + br label %__nv_rsqrtf.exit4, !dbg !43 + +__nv_rsqrtf.exit4: ; preds = %86, %88 + %.0.i3 = phi float [ %87, %86 ], [ %89, %88 ], !dbg !43 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !44 + store float %.0.i3, ptr addrspace(3) %79, align 4, !dbg !44 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !44 + %90 = load float, ptr addrspace(3) %81, align 4, !dbg !44 + %91 = shl i32 %17, 7, !dbg !45 + %92 = and i32 %53, 1 + %.masked = and i32 %53, 2 + %93 = shl nuw nsw i32 %18, 5 + %94 = and i32 %14, 96 + %95 = shl nuw nsw i32 %94, 3 + %96 = or disjoint i32 %93, %95 + %97 = xor i32 %96, %15 + %98 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %97 + %99 = and i32 %14, 24 + %100 = shl nuw nsw i32 %99, 5 + %101 = lshr exact i32 %52, 1 + %102 = or disjoint i32 %100, %80 + %103 = xor i32 %102, %101 + %104 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %103 + %105 = icmp eq i32 %92, 0 + %106 = shl i32 %51, 7 + %107 = shl i32 %54, 15 + %108 = add i32 %107, %106 + %109 = icmp ne i32 %92, 0 + %110 = add i32 %108, 4097 + %111 = add i32 %108, 4096 + %112 = shl nuw nsw i32 %99, 4 + %113 = shl nuw nsw i32 %14, 2 + %114 = and i32 %113, 124 + %115 = lshr i32 %14, 4 + %116 = and i32 %115, 2 + %117 = or disjoint i32 %112, %114 + %118 = xor i32 %117, %101 + %119 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %116 + %120 = getelementptr inbounds nuw i8, ptr addrspace(3) %119, i32 %118 + %121 = shl nuw nsw i32 %94, 2 + %122 = and i32 %14, 124 + %123 = lshr i32 %14, 6 + %124 = and i32 %123, 2 + %125 = or disjoint i32 %93, %121 + %126 = xor i32 %125, %122 + %127 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %124 + %128 = getelementptr inbounds nuw i8, ptr addrspace(3) %127, i32 %126 + %129 = zext nneg i32 %.masked to i64, !dbg !46 + %130 = sext i32 %71 to i64, !dbg !46 + %131 = sext i32 %91 to i64, !dbg !46 + br label %132, !dbg !46 + +132: ; preds = %__nv_rsqrtf.exit4, %132 + %indvars.iv7 = phi i64 [ 0, %__nv_rsqrtf.exit4 ], [ %indvars.iv.next8, %132 ] + %133 = or disjoint i64 %indvars.iv7, %24, !dbg !47 + %134 = or disjoint i64 %indvars.iv7, %129, !dbg !48 + %135 = trunc nuw nsw i64 %133 to i32, !dbg !49 + %136 = add i32 %22, %135, !dbg !49 + %137 = sext i32 %136 to i64, !dbg !50 + %138 = getelementptr bfloat, ptr addrspace(1) %2, i64 %137, !dbg !50 + %139 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !51 + %140 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %138, i64 %139, i1 true) #6, !dbg !51 + %141 = bitcast i16 %140 to bfloat, !dbg !51 + %142 = fpext bfloat %141 to float, !dbg !52 + %143 = getelementptr bfloat, ptr addrspace(1) %3, i64 %133, !dbg !53 + %144 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !54 + %145 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %143, i64 %144, i1 true) #6, !dbg !54 + %146 = bitcast i16 %145 to bfloat, !dbg !54 + %147 = fpext bfloat %146 to float, !dbg !55 + %148 = add nuw nsw i64 %133, %130, !dbg !56 + %149 = getelementptr float, ptr addrspace(1) %4, i64 %148, !dbg !57 + %150 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58 + %151 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %149, i64 %150, i1 true) #6, !dbg !58 + %152 = bitcast i32 %151 to float, !dbg !58 + %153 = getelementptr float, ptr addrspace(1) %5, i64 %148, !dbg !59 + %154 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !60 + %155 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %153, i64 %154, i1 true) #6, !dbg !60 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !60 + %156 = insertelement <1 x i32> poison, i32 %155, i64 0, !dbg !60 + store <1 x i32> %156, ptr addrspace(3) %98, align 4, !dbg !60 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !60 + %157 = load float, ptr addrspace(3) %104, align 4, !dbg !60 + %158 = add i32 %23, %135, !dbg !61 + %159 = sext i32 %158 to i64, !dbg !62 + %160 = getelementptr bfloat, ptr addrspace(1) %2, i64 %159, !dbg !62 + %161 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !63 + %162 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %160, i64 %161, i1 true) #6, !dbg !63 + %163 = bitcast i16 %162 to bfloat, !dbg !63 + %164 = fpext bfloat %163 to float, !dbg !64 + %165 = getelementptr bfloat, ptr addrspace(1) %6, i64 %133, !dbg !65 + %166 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !66 + %167 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %165, i64 %166, i1 true) #6, !dbg !66 + %168 = bitcast i16 %167 to bfloat, !dbg !66 + %169 = fpext bfloat %168 to float, !dbg !67 + %170 = or disjoint i64 %134, 1, !dbg !68 + %171 = trunc nuw nsw i64 %170 to i32, !dbg !69 + %172 = add i32 %108, %171, !dbg !69 + %173 = sext i32 %172 to i64, !dbg !70 + %174 = getelementptr bfloat, ptr addrspace(1) %2, i64 %173, !dbg !70 + %175 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !71 + %176 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %174, i64 %175, i1 %105) #6, !dbg !71 + %177 = bitcast i16 %176 to bfloat, !dbg !71 + %178 = fpext bfloat %177 to float, !dbg !72 + %179 = fmul float %82, %178, !dbg !40 + %180 = getelementptr bfloat, ptr addrspace(1) %3, i64 %170, !dbg !73 + %181 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !74 + %182 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %180, i64 %181, i1 %105) #6, !dbg !74 + %183 = bitcast i16 %182 to bfloat, !dbg !74 + %184 = fpext bfloat %183 to float, !dbg !75 + %185 = fmul float %179, %184, !dbg !76 + %186 = fsub float 0.000000e+00, %185, !dbg !77 + %187 = trunc nuw nsw i64 %134 to i32, !dbg !78 + %188 = add i32 %108, %187, !dbg !78 + %189 = sext i32 %188 to i64, !dbg !79 + %190 = getelementptr bfloat, ptr addrspace(1) %2, i64 %189, !dbg !79 + %191 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !80 + %192 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %190, i64 %191, i1 %109) #6, !dbg !80 + %193 = bitcast i16 %192 to bfloat, !dbg !80 + %194 = fpext bfloat %193 to float, !dbg !81 + %195 = fmul float %82, %194, !dbg !82 + %196 = getelementptr bfloat, ptr addrspace(1) %3, i64 %134, !dbg !83 + %197 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !84 + %198 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %196, i64 %197, i1 %109) #6, !dbg !84 + %199 = bitcast i16 %198 to bfloat, !dbg !84 + %200 = fpext bfloat %199 to float, !dbg !85 + %201 = fmul float %195, %200, !dbg !86 + %202 = select i1 %105, float %186, float %201, !dbg !87 + %203 = fmul float %.0.i, %142, !dbg !88 + %204 = fmul float %203, %147, !dbg !89 + %205 = fmul float %204, %152, !dbg !90 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !90 + store float %205, ptr addrspace(3) %98, align 4, !dbg !90 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !90 + %206 = load float, ptr addrspace(3) %104, align 4, !dbg !90 + %207 = fmul float %157, %202, !dbg !91 + %208 = fadd float %206, %207, !dbg !92 + %209 = add i32 %110, %187, !dbg !93 + %210 = sext i32 %209 to i64, !dbg !94 + %211 = getelementptr bfloat, ptr addrspace(1) %2, i64 %210, !dbg !94 + %212 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !95 + %213 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %211, i64 %212, i1 %105) #6, !dbg !95 + %214 = bitcast i16 %213 to bfloat, !dbg !95 + %215 = fpext bfloat %214 to float, !dbg !96 + %216 = fmul float %90, %215, !dbg !44 + %217 = getelementptr bfloat, ptr addrspace(1) %6, i64 %170, !dbg !97 + %218 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !98 + %219 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %217, i64 %218, i1 %105) #6, !dbg !98 + %220 = bitcast i16 %219 to bfloat, !dbg !98 + %221 = fpext bfloat %220 to float, !dbg !99 + %222 = fmul float %216, %221, !dbg !100 + %223 = fsub float 0.000000e+00, %222, !dbg !101 + %224 = add i32 %111, %187, !dbg !102 + %225 = sext i32 %224 to i64, !dbg !103 + %226 = getelementptr bfloat, ptr addrspace(1) %2, i64 %225, !dbg !103 + %227 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !104 + %228 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %226, i64 %227, i1 %109) #6, !dbg !104 + %229 = bitcast i16 %228 to bfloat, !dbg !104 + %230 = fpext bfloat %229 to float, !dbg !105 + %231 = fmul float %90, %230, !dbg !106 + %232 = getelementptr bfloat, ptr addrspace(1) %6, i64 %134, !dbg !107 + %233 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !108 + %234 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %232, i64 %233, i1 %109) #6, !dbg !108 + %235 = bitcast i16 %234 to bfloat, !dbg !108 + %236 = fpext bfloat %235 to float, !dbg !109 + %237 = fmul float %231, %236, !dbg !110 + %238 = select i1 %105, float %223, float %237, !dbg !87 + %239 = fmul float %.0.i3, %164, !dbg !111 + %240 = fmul float %239, %169, !dbg !112 + %241 = fmul float %240, %152, !dbg !113 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !113 + store float %241, ptr addrspace(3) %98, align 4, !dbg !113 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !113 + %242 = load float, ptr addrspace(3) %104, align 4, !dbg !113 + %243 = fmul float %157, %238, !dbg !114 + %244 = fadd float %242, %243, !dbg !115 + %245 = add nuw nsw i64 %133, %131, !dbg !116 + %246 = getelementptr bfloat, ptr addrspace(1) %0, i64 %245, !dbg !117 + %247 = fptrunc float %208 to bfloat, !dbg !118 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !118 + store bfloat %247, ptr addrspace(3) %120, align 2, !dbg !118 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !118 + %248 = load i16, ptr addrspace(3) %128, align 2, !dbg !118 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %248, ptr addrspace(1) %246, i1 true) #6, !dbg !118 + %249 = getelementptr bfloat, ptr addrspace(1) %1, i64 %245, !dbg !119 + %250 = fptrunc float %244 to bfloat, !dbg !120 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !120 + store bfloat %250, ptr addrspace(3) %120, align 2, !dbg !120 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !120 + %251 = load i16, ptr addrspace(3) %128, align 2, !dbg !120 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %251, ptr addrspace(1) %249, i1 true) #6, !dbg !120 + %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 4, !dbg !46 + %252 = icmp samesign ult i64 %indvars.iv7, 124, !dbg !46 + br i1 %252, label %132, label %253, !dbg !46 + +253: ; preds = %132 + ret void, !dbg !121 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", linkageName: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 23, column: 33, scope: !5) +!10 = !DILocation(line: 24, column: 44, scope: !5) +!11 = !DILocation(line: 24, column: 23, scope: !5) +!12 = !DILocation(line: 26, column: 37, scope: !5) +!13 = !DILocation(line: 29, column: 19, scope: !5) +!14 = !DILocation(line: 33, column: 43, scope: !5) +!15 = !DILocation(line: 39, column: 57, scope: !5) +!16 = !DILocation(line: 39, column: 34, scope: !5) +!17 = !DILocation(line: 39, column: 68, scope: !5) +!18 = !DILocation(line: 39, column: 121, scope: !5) +!19 = !DILocation(line: 40, column: 50, scope: !5) +!20 = !DILocation(line: 40, column: 34, scope: !5) +!21 = !DILocation(line: 40, column: 61, scope: !5) +!22 = !DILocation(line: 40, column: 114, scope: !5) +!23 = !DILocation(line: 42, column: 22, scope: !5) +!24 = !DILocation(line: 44, column: 23, scope: !5) +!25 = !DILocation(line: 47, column: 22, scope: !5) +!26 = !DILocation(line: 49, column: 25, scope: !5) +!27 = !DILocation(line: 293, column: 36, scope: !28, inlinedAt: !30) +!28 = distinct !DILexicalBlockFile(scope: !5, file: !29, discriminator: 0) +!29 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!30 = !DILocation(line: 51, column: 25, scope: !31) +!31 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!32 = !DILocation(line: 263, column: 15, scope: !28, inlinedAt: !27) +!33 = !DILocation(line: 293, column: 36, scope: !28, inlinedAt: !34) +!34 = !DILocation(line: 52, column: 27, scope: !31) +!35 = !DILocation(line: 263, column: 15, scope: !28, inlinedAt: !33) +!36 = !DILocation(line: 63, column: 46, scope: !5) +!37 = !DILocation(line: 75, column: 25, scope: !5) +!38 = !DILocation(line: 77, column: 24, scope: !5) +!39 = !DILocation(line: 78, column: 32, scope: !5) +!40 = !DILocation(line: 79, column: 24, scope: !5) +!41 = !DILocation(line: 123, column: 24, scope: !5) +!42 = !DILocation(line: 124, column: 24, scope: !5) +!43 = !DILocation(line: 125, column: 32, scope: !5) +!44 = !DILocation(line: 126, column: 24, scope: !5) +!45 = !DILocation(line: 161, column: 43, scope: !5) +!46 = !DILocation(line: 53, column: 43, scope: !5) +!47 = !DILocation(line: 54, column: 31, scope: !5) +!48 = !DILocation(line: 72, column: 41, scope: !5) +!49 = !DILocation(line: 61, column: 51, scope: !5) +!50 = !DILocation(line: 61, column: 35, scope: !5) +!51 = !DILocation(line: 61, column: 62, scope: !5) +!52 = !DILocation(line: 61, column: 115, scope: !5) +!53 = !DILocation(line: 62, column: 35, scope: !5) +!54 = !DILocation(line: 62, column: 42, scope: !5) +!55 = !DILocation(line: 62, column: 95, scope: !5) +!56 = !DILocation(line: 63, column: 42, scope: !5) +!57 = !DILocation(line: 63, column: 35, scope: !5) +!58 = !DILocation(line: 63, column: 51, scope: !5) +!59 = !DILocation(line: 64, column: 35, scope: !5) +!60 = !DILocation(line: 64, column: 51, scope: !5) +!61 = !DILocation(line: 65, column: 58, scope: !5) +!62 = !DILocation(line: 65, column: 35, scope: !5) +!63 = !DILocation(line: 65, column: 69, scope: !5) +!64 = !DILocation(line: 65, column: 123, scope: !5) +!65 = !DILocation(line: 66, column: 36, scope: !5) +!66 = !DILocation(line: 66, column: 43, scope: !5) +!67 = !DILocation(line: 66, column: 96, scope: !5) +!68 = !DILocation(line: 72, column: 39, scope: !5) +!69 = !DILocation(line: 72, column: 57, scope: !5) +!70 = !DILocation(line: 72, column: 35, scope: !5) +!71 = !DILocation(line: 72, column: 68, scope: !5) +!72 = !DILocation(line: 72, column: 129, scope: !5) +!73 = !DILocation(line: 80, column: 35, scope: !5) +!74 = !DILocation(line: 80, column: 85, scope: !5) +!75 = !DILocation(line: 80, column: 146, scope: !5) +!76 = !DILocation(line: 82, column: 24, scope: !5) +!77 = !DILocation(line: 84, column: 17, scope: !5) +!78 = !DILocation(line: 90, column: 53, scope: !5) +!79 = !DILocation(line: 90, column: 35, scope: !5) +!80 = !DILocation(line: 90, column: 64, scope: !5) +!81 = !DILocation(line: 90, column: 125, scope: !5) +!82 = !DILocation(line: 97, column: 24, scope: !5) +!83 = !DILocation(line: 98, column: 35, scope: !5) +!84 = !DILocation(line: 98, column: 81, scope: !5) +!85 = !DILocation(line: 98, column: 142, scope: !5) +!86 = !DILocation(line: 100, column: 24, scope: !5) +!87 = !DILocation(line: 0, scope: !5) +!88 = !DILocation(line: 111, column: 24, scope: !5) +!89 = !DILocation(line: 113, column: 24, scope: !5) +!90 = !DILocation(line: 116, column: 24, scope: !5) +!91 = !DILocation(line: 118, column: 24, scope: !5) +!92 = !DILocation(line: 119, column: 24, scope: !5) +!93 = !DILocation(line: 121, column: 60, scope: !5) +!94 = !DILocation(line: 121, column: 35, scope: !5) +!95 = !DILocation(line: 121, column: 71, scope: !5) +!96 = !DILocation(line: 121, column: 132, scope: !5) +!97 = !DILocation(line: 127, column: 35, scope: !5) +!98 = !DILocation(line: 127, column: 85, scope: !5) +!99 = !DILocation(line: 127, column: 146, scope: !5) +!100 = !DILocation(line: 129, column: 24, scope: !5) +!101 = !DILocation(line: 131, column: 17, scope: !5) +!102 = !DILocation(line: 134, column: 60, scope: !5) +!103 = !DILocation(line: 134, column: 35, scope: !5) +!104 = !DILocation(line: 134, column: 71, scope: !5) +!105 = !DILocation(line: 134, column: 132, scope: !5) +!106 = !DILocation(line: 139, column: 24, scope: !5) +!107 = !DILocation(line: 140, column: 35, scope: !5) +!108 = !DILocation(line: 140, column: 81, scope: !5) +!109 = !DILocation(line: 140, column: 142, scope: !5) +!110 = !DILocation(line: 142, column: 24, scope: !5) +!111 = !DILocation(line: 151, column: 25, scope: !5) +!112 = !DILocation(line: 153, column: 26, scope: !5) +!113 = !DILocation(line: 156, column: 26, scope: !5) +!114 = !DILocation(line: 158, column: 26, scope: !5) +!115 = !DILocation(line: 159, column: 26, scope: !5) +!116 = !DILocation(line: 161, column: 39, scope: !5) +!117 = !DILocation(line: 161, column: 32, scope: !5) +!118 = !DILocation(line: 161, column: 55, scope: !5) +!119 = !DILocation(line: 162, column: 32, scope: !5) +!120 = !DILocation(line: 162, column: 56, scope: !5) +!121 = !DILocation(line: 53, column: 4, scope: !5) diff --git a/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..c99b13cafe49c2d9fe1eb750434114ea01398a2b --- /dev/null +++ b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx @@ -0,0 +1,956 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 // -- Begin function triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 +.visible .entry triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0( + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6, + .param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_7, + .param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_8, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_9, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_10 +) +.reqntid 256 +{ + .reg .pred %p<7>; + .reg .b16 %rs<21>; + .reg .b32 %r<139>; + .reg .b64 %rd<67>; + .loc 1 18 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0 + +// %bb.0: + ld.param.b64 %rd16, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6]; + ld.param.b64 %rd15, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5]; + ld.param.b64 %rd14, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4]; + ld.param.b64 %rd13, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3]; + ld.param.b64 %rd12, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2]; + ld.param.b64 %rd11, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1]; + ld.param.b64 %rd10, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0]; +$L__tmp0: + .loc 1 23 28 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:28 + mov.u32 %r16, %ctaid.x; + .loc 1 23 33 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:33 + shl.b32 %r1, %r16, 6; + .loc 1 24 44 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44 + mov.u32 %r2, %tid.x; + and.b32 %r3, %r2, 252; + bfe.u32 %r17, %r2, 2, 6; + .loc 1 24 23 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23 + or.b32 %r18, %r17, %r1; + .loc 1 26 37 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37 + and.b32 %r19, %r2, 3; + .loc 1 29 19 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19 + bfe.s32 %r20, %r16, 25, 1; + shr.u32 %r21, %r20, 27; + add.s32 %r22, %r18, %r21; + shr.s32 %r4, %r22, 5; + shl.b32 %r23, %r4, 15; + .loc 1 33 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:33:43 + cvt.u64.u32 %rd1, %r19; + shl.b32 %r5, %r16, 13; + add.s32 %r24, %r23, %r5; + shl.b32 %r6, %r17, 7; + or.b32 %r25, %r24, %r6; + or.b32 %r26, %r25, %r19; + cvt.u64.u32 %rd2, %r26; + mov.b32 %r137, 0f00000000; + mov.b64 %rd62, -4; + mov.b32 %r138, %r137; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 39 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34 + add.s64 %rd21, %rd2, %rd62; + cvt.u32.u64 %r27, %rd21; + add.s32 %r28, %r27, 4100; + mad.wide.s32 %rd18, %r28, 2, %rd12; + .loc 1 39 68 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68 + // begin inline asm + mov.u64 %rd17, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0; + // end inline asm + mov.b16 %rs2, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u16 %rs1, %rs2; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs1 }, [ %rd18 + 0 ], %rd17; + // end inline asm + .loc 1 39 121 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:121 + cvt.f32.bf16 %r29, %rs1; + add.s32 %r30, %r27, 4; + .loc 1 40 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34 + mad.wide.s32 %rd20, %r30, 2, %rd12; + .loc 1 40 61 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61 + // begin inline asm + mov.u64 %rd19, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd19, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs3, %rs2; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs3 }, [ %rd20 + 0 ], %rd19; + // end inline asm + .loc 1 40 114 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114 + cvt.f32.bf16 %r31, %rs3; + .loc 1 44 23 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:44:23 + fma.rn.f32 %r138, %r29, %r29, %r138; + .loc 1 49 25 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:49:25 + fma.rn.f32 %r137, %r31, %r31, %r137; + .loc 1 33 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:33:43 + add.s64 %rd62, %rd62, 4; + setp.lt.u64 %p2, %rd62, 124; + @%p2 bra $L__BB0_1; +// %bb.2: // %__nv_rsqrtf.exit + .loc 1 0 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0:43 + cvt.u32.u64 %r32, %rd1; + .loc 1 24 44 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44 + and.b32 %r33, %r2, 63; + .loc 1 24 23 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23 + or.b32 %r34, %r1, %r33; + .loc 1 26 37 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37 + and.b32 %r35, %r2, 192; + .loc 1 29 19 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19 + shr.s32 %r36, %r1, 31; + shr.u32 %r37, %r36, 27; + add.s32 %r38, %r34, %r37; +$L__tmp1: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r39, %r138, 2, 31, -1; +$L__tmp2: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r40, %r138, %r39; +$L__tmp3: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r41, %r40, 1, 31, -1; +$L__tmp4: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r42, %r40, %r41; +$L__tmp5: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r43, %r137, 2, 31, -1; +$L__tmp6: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r44, %r137, %r43; +$L__tmp7: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r45, %r44, 1, 31, -1; +$L__tmp8: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r46, %r44, %r45; +$L__tmp9: + .loc 1 63 46 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:46 + shl.b32 %r47, %r4, 7; + mov.b32 %r48, 0f43000000; + .loc 1 75 25 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:75:25 + div.full.f32 %r49, %r46, %r48; + .loc 1 77 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:77:24 + add.f32 %r50, %r49, 0f358637BD; + .loc 1 78 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:78:32 + rsqrt.approx.ftz.f32 %r7, %r50; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + mov.b32 %r51, global_smem; + add.s32 %r52, %r51, %r3; + st.shared.b32 [%r52], %r7; + bar.sync 0; + shl.b32 %r53, %r33, 2; + add.s32 %r54, %r51, %r53; + ld.shared.b32 %r8, [%r54]; + .loc 1 123 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:123:24 + div.full.f32 %r55, %r42, %r48; + .loc 1 124 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:124:24 + add.f32 %r56, %r55, 0f358637BD; + .loc 1 125 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:125:32 + rsqrt.approx.ftz.f32 %r9, %r56; + .loc 1 126 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24 + bar.sync 0; + st.shared.b32 [%r52], %r9; + bar.sync 0; + ld.shared.b32 %r10, [%r54]; + bfe.u32 %r11, %r35, 6, 1; + shl.b32 %r57, %r32, 5; + and.b32 %r58, %r2, 96; + shl.b32 %r59, %r58, 3; + or.b32 %r60, %r57, %r59; + xor.b32 %r61, %r60, %r3; + add.s32 %r12, %r51, %r61; + and.b32 %r62, %r2, 24; + shl.b32 %r63, %r62, 5; + shr.u32 %r64, %r35, 1; + or.b32 %r65, %r63, %r53; + xor.b32 %r66, %r65, %r64; + add.s32 %r13, %r51, %r66; + shl.b32 %r67, %r38, 10; + and.b32 %r68, %r67, -32768; + shl.b32 %r69, %r62, 4; + shl.b32 %r70, %r2, 2; + and.b32 %r71, %r70, 124; + shr.u32 %r72, %r2, 4; + and.b32 %r73, %r72, 2; + or.b32 %r74, %r69, %r71; + xor.b32 %r75, %r74, %r64; + add.s32 %r76, %r51, %r73; + add.s32 %r14, %r76, %r75; + shl.b32 %r77, %r58, 2; + and.b32 %r78, %r2, 124; + shr.u32 %r79, %r2, 6; + and.b32 %r80, %r79, 2; + or.b32 %r81, %r57, %r77; + xor.b32 %r82, %r81, %r78; + add.s32 %r83, %r51, %r80; + add.s32 %r15, %r83, %r82; + .loc 1 53 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43 + cvt.s64.s32 %rd22, %r47; + cvt.u64.u32 %rd23, %r79; + bfe.u64 %rd24, %rd23, 1, 1; + shl.b64 %rd25, %rd24, 2; + add.s64 %rd3, %rd16, %rd25; + add.s64 %rd4, %rd13, %rd25; + add.s32 %r84, %r68, %r5; + shl.b32 %r85, %r33, 7; + add.s32 %r86, %r84, %r85; + cvt.u32.u64 %r87, %rd24; + shl.b32 %r88, %r87, 1; + add.s32 %r89, %r86, %r88; + cvt.u64.u32 %rd5, %r89; + add.s32 %r90, %r5, %r6; + cvt.s64.s32 %rd26, %r90; + add.s64 %rd27, %rd26, %rd1; + shl.b64 %rd28, %rd27, 1; + add.s64 %rd6, %rd11, %rd28; + add.s64 %rd7, %rd10, %rd28; + shl.b64 %rd29, %rd1, 1; + add.s64 %rd8, %rd16, %rd29; + or.b64 %rd30, %rd22, %rd1; + shl.b64 %rd31, %rd30, 2; + add.s64 %rd64, %rd15, %rd31; + add.s64 %rd63, %rd14, %rd31; + add.s64 %rd9, %rd13, %rd29; + mov.b64 %rd66, -4; + mov.b64 %rd65, 0; + setp.ne.b32 %p5, %r11, 0; + setp.eq.b32 %p4, %r11, 0; +$L__BB0_3: // =>This Inner Loop Header: Depth=1 + .loc 1 61 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:35 + add.s64 %rd60, %rd2, %rd66; + cvt.u32.u64 %r94, %rd60; + add.s32 %r95, %r94, 4; + mad.wide.s32 %rd33, %r95, 2, %rd12; + .loc 1 61 62 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:62 + // begin inline asm + mov.u64 %rd32, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd32, 1.0; + // end inline asm + mov.b16 %rs5, 0; + mov.pred %p3, -1; + // begin inline asm + mov.u16 %rs4, %rs5; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs4 }, [ %rd33 + 0 ], %rd32; + // end inline asm + .loc 1 61 115 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:115 + cvt.f32.bf16 %r96, %rs4; + .loc 1 62 42 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:42 + add.s64 %rd35, %rd9, %rd65; + // begin inline asm + mov.u64 %rd34, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd34, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs6, %rs5; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs6 }, [ %rd35 + 0 ], %rd34; + // end inline asm + .loc 1 62 95 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:95 + cvt.f32.bf16 %r97, %rs6; + .loc 1 63 51 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:51 + // begin inline asm + mov.u64 %rd36, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd36, 1.0; + // end inline asm + mov.b32 %r92, 0; + // begin inline asm + mov.u32 %r91, %r92; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b32 { %r91 }, [ %rd63 + 0 ], %rd36; + // end inline asm + .loc 1 64 51 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:51 + // begin inline asm + mov.u64 %rd37, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd37, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r93, %r92; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b32 { %r93 }, [ %rd64 + 0 ], %rd37; + // end inline asm + bar.sync 0; + st.shared.b32 [%r12], %r93; + bar.sync 0; + ld.shared.b32 %r98, [%r13]; + add.s32 %r99, %r94, 4100; + .loc 1 65 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:35 + mad.wide.s32 %rd39, %r99, 2, %rd12; + .loc 1 65 69 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69 + // begin inline asm + mov.u64 %rd38, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd38, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs7, %rs5; + @%p3 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs7 }, [ %rd39 + 0 ], %rd38; + // end inline asm + .loc 1 65 123 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:123 + cvt.f32.bf16 %r100, %rs7; + .loc 1 66 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:43 + add.s64 %rd41, %rd8, %rd65; + // begin inline asm + mov.u64 %rd40, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd40, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs8, %rs5; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs8 }, [ %rd41 + 0 ], %rd40; + // end inline asm + .loc 1 66 96 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:96 + cvt.f32.bf16 %r101, %rs8; + .loc 1 72 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35 + add.s64 %rd61, %rd5, %rd66; + cvt.u32.u64 %r102, %rd61; + add.s32 %r103, %r102, 5; + mad.wide.s32 %rd43, %r103, 2, %rd12; + .loc 1 72 68 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:68 + // begin inline asm + mov.u64 %rd42, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd42, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs9, %rs5; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd43 + 0 ], %rd42; + // end inline asm + .loc 1 72 129 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129 + cvt.f32.bf16 %r104, %rs9; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + mul.f32 %r105, %r8, %r104; + .loc 1 80 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:35 + add.s64 %rd49, %rd4, %rd65; + .loc 1 80 85 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:85 + add.s64 %rd45, %rd49, 2; + // begin inline asm + mov.u64 %rd44, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd44, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs10, %rs5; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs10 }, [ %rd45 + 0 ], %rd44; + // end inline asm + .loc 1 80 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146 + cvt.f32.bf16 %r106, %rs10; + .loc 1 84 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17 + neg.f32 %r107, %r105; + fma.rn.f32 %r108, %r107, %r106, 0f00000000; + add.s32 %r109, %r102, 4; + .loc 1 90 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:35 + mad.wide.s32 %rd47, %r109, 2, %rd12; + .loc 1 90 64 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:64 + // begin inline asm + mov.u64 %rd46, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd46, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs11, %rs5; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd47 + 0 ], %rd46; + // end inline asm + .loc 1 90 125 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125 + cvt.f32.bf16 %r110, %rs11; + .loc 1 97 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24 + mul.f32 %r111, %r8, %r110; + .loc 1 98 81 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:81 + // begin inline asm + mov.u64 %rd48, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd48, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs12, %rs5; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd49 + 0 ], %rd48; + // end inline asm + .loc 1 98 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142 + cvt.f32.bf16 %r112, %rs12; + .loc 1 100 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24 + mul.f32 %r113, %r111, %r112; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r114, %r108, %r113, %p4; + .loc 1 111 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:111:24 + mul.f32 %r115, %r7, %r96; + .loc 1 113 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:113:24 + mul.f32 %r116, %r115, %r97; + .loc 1 116 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:116:24 + mul.f32 %r117, %r116, %r91; + bar.sync 0; + st.shared.b32 [%r12], %r117; + bar.sync 0; + ld.shared.b32 %r118, [%r13]; + .loc 1 119 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24 + fma.rn.f32 %r119, %r98, %r114, %r118; + add.s32 %r120, %r102, 4101; + .loc 1 121 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:35 + mad.wide.s32 %rd51, %r120, 2, %rd12; + .loc 1 121 71 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:71 + // begin inline asm + mov.u64 %rd50, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd50, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs13, %rs5; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd51 + 0 ], %rd50; + // end inline asm + .loc 1 121 132 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:132 + cvt.f32.bf16 %r121, %rs13; + .loc 1 126 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24 + mul.f32 %r122, %r10, %r121; + .loc 1 127 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:35 + add.s64 %rd57, %rd3, %rd65; + .loc 1 127 85 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:85 + add.s64 %rd53, %rd57, 2; + // begin inline asm + mov.u64 %rd52, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd52, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs14, %rs5; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs14 }, [ %rd53 + 0 ], %rd52; + // end inline asm + .loc 1 127 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:146 + cvt.f32.bf16 %r123, %rs14; + .loc 1 131 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:131:17 + neg.f32 %r124, %r122; + fma.rn.f32 %r125, %r124, %r123, 0f00000000; + add.s32 %r126, %r102, 4100; + .loc 1 134 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:35 + mad.wide.s32 %rd55, %r126, 2, %rd12; + .loc 1 134 71 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:71 + // begin inline asm + mov.u64 %rd54, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd54, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs15, %rs5; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd55 + 0 ], %rd54; + // end inline asm + .loc 1 134 132 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:132 + cvt.f32.bf16 %r127, %rs15; + .loc 1 139 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:139:24 + mul.f32 %r128, %r10, %r127; + .loc 1 140 81 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:81 + // begin inline asm + mov.u64 %rd56, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd56, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs16, %rs5; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs16 }, [ %rd57 + 0 ], %rd56; + // end inline asm + .loc 1 140 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:142 + cvt.f32.bf16 %r129, %rs16; + .loc 1 142 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:142:24 + mul.f32 %r130, %r128, %r129; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r131, %r125, %r130, %p4; + .loc 1 151 25 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:151:25 + mul.f32 %r132, %r9, %r100; + .loc 1 153 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:153:26 + mul.f32 %r133, %r132, %r101; + .loc 1 156 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26 + mul.f32 %r134, %r133, %r91; + bar.sync 0; + st.shared.b32 [%r12], %r134; + bar.sync 0; + ld.shared.b32 %r135, [%r13]; + .loc 1 159 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:159:26 + fma.rn.f32 %r136, %r98, %r131, %r135; + .loc 1 161 55 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:55 + add.s64 %rd58, %rd7, %rd65; + cvt.rn.bf16.f32 %rs19, %r119; + bar.sync 0; + st.shared.b16 [%r14], %rs19; + bar.sync 0; + ld.shared.b16 %rs17, [%r15]; + // begin inline asm + @%p3 st.global.b16 [ %rd58 + 0 ], { %rs17 }; + // end inline asm + .loc 1 162 56 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:56 + add.s64 %rd59, %rd6, %rd65; + cvt.rn.bf16.f32 %rs20, %r136; + bar.sync 0; + st.shared.b16 [%r14], %rs20; + bar.sync 0; + ld.shared.b16 %rs18, [%r15]; + // begin inline asm + @%p3 st.global.b16 [ %rd59 + 0 ], { %rs18 }; + // end inline asm + .loc 1 53 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43 + add.s64 %rd66, %rd66, 4; + add.s64 %rd65, %rd65, 8; + add.s64 %rd64, %rd64, 16; + add.s64 %rd63, %rd63, 16; + setp.lt.u64 %p6, %rd66, 124; + @%p6 bra $L__BB0_3; +// %bb.4: + .loc 1 53 4 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:4 + ret; +$L__tmp10: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 456 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x1c1 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 98 +.b8 118 +.b8 113 +.b8 104 +.b8 106 +.b8 116 +.b8 121 +.b8 103 +.b8 55 +.b8 102 +.b8 118 +.b8 120 +.b8 122 +.b8 119 +.b8 116 +.b8 98 +.b8 116 +.b8 116 +.b8 52 +.b8 118 +.b8 114 +.b8 100 +.b8 107 +.b8 98 +.b8 110 +.b8 98 +.b8 54 +.b8 110 +.b8 51 +.b8 50 +.b8 102 +.b8 110 +.b8 114 +.b8 105 +.b8 106 +.b8 106 +.b8 112 +.b8 108 +.b8 51 +.b8 118 +.b8 118 +.b8 52 +.b8 99 +.b8 102 +.b8 113 +.b8 100 +.b8 52 +.b8 109 +.b8 122 +.b8 110 +.b8 114 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 98 +.b8 118 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x6d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 114 +.b8 109 +.b8 115 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 100 +.b8 100 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 110 +.b8 101 +.b8 103 +.b8 95 +.b8 115 +.b8 112 +.b8 108 +.b8 105 +.b8 116 +.b8 95 +.b8 115 +.b8 112 +.b8 108 +.b8 105 +.b8 116 +.b8 95 +.b8 119 +.b8 105 +.b8 116 +.b8 104 +.b8 95 +.b8 115 +.b8 105 +.b8 122 +.b8 101 +.b8 115 +.b8 95 +.b8 115 +.b8 116 +.b8 97 +.b8 99 +.b8 107 +.b8 95 +.b8 117 +.b8 110 +.b8 98 +.b8 105 +.b8 110 +.b8 100 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x151:0x7a DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x166:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp5 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 51 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x17e:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp2 // DW_AT_low_pc +.b64 $L__tmp5 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 4 // Abbrev [4] 0x198:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp5 // DW_AT_low_pc +.b64 $L__tmp9 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 52 // DW_AT_call_line +.b8 27 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x1b0:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp6 // DW_AT_low_pc +.b64 $L__tmp9 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source new file mode 100644 index 0000000000000000000000000000000000000000..37e71ceadfc771e88cdc1d056801274caec02071 --- /dev/null +++ b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source @@ -0,0 +1,972 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc213 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0) +#loc215 = loc(unknown) +#loc218 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0) +#loc222 = loc("in_out_ptr0"(#loc)) +#loc223 = loc("in_out_ptr1"(#loc)) +#loc224 = loc("in_ptr0"(#loc)) +#loc225 = loc("in_ptr1"(#loc)) +#loc226 = loc("in_ptr2"(#loc)) +#loc227 = loc("in_ptr3"(#loc)) +#loc228 = loc("in_ptr4"(#loc)) +#loc229 = loc("xnumel"(#loc)) +#loc230 = loc("r0_numel"(#loc)) +#loc432 = loc("input"(#loc213)) +#loc433 = loc("a"(#loc218)) +#loc434 = loc("b"(#loc218)) +module { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 73728 : i32 loc(#loc231) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc232) + %xoffset = tt.get_program_id x : i32 loc(#loc233) + %xoffset_2 = arith.constant 64 : i32 loc(#loc234) + %xoffset_3 = arith.constant 64 : i32 loc(#loc234) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc234) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc235) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc236) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc237) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc237) + %xmask = arith.constant true loc(#loc238) + %xmask_8 = arith.constant dense : tensor<64x4xi1> loc(#loc238) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc239) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc240) + %x0 = arith.constant 32 : i32 loc(#loc241) + %x0_10 = arith.constant 32 : i32 loc(#loc241) + %x0_11 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc241) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc241) + %x1 = arith.constant 32 : i32 loc(#loc242) + %x1_13 = arith.constant 32 : i32 loc(#loc242) + %x1_14 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc242) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc242) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc243) + %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc243) + %_tmp10 = arith.constant 0.000000e+00 : f32 loc(#loc244) + %_tmp10_17 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc244) + %c0_i32 = arith.constant 0 : i32 loc(#loc15) + %c4_i32 = arith.constant 4 : i32 loc(#loc15) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15) + %2 = arith.bitcast %c4_i32 : i32 to i32 loc(#loc15) + %3 = ub.poison : i32 loc(#loc15) + %_tmp10_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_16, %_tmp10_24 = %_tmp10_17) -> (tensor<64x4xf32>, tensor<64x4xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc246) + %r0_index_25 = arith.addi %r0_index, %r0_base_9 : tensor<1x4xi32> loc(#loc246) + %r0_mask = arith.constant dense<128> : tensor<1x4xi32> loc(#loc247) + %r0_mask_26 = arith.cmpi slt, %r0_index_25, %r0_mask : tensor<1x4xi32> loc(#loc247) + %tmp0 = arith.constant 4096 : i32 loc(#loc248) + %tmp0_27 = arith.constant 4096 : i32 loc(#loc248) + %tmp0_28 = arith.constant dense<4096> : tensor<1x4xi32> loc(#loc248) + %tmp0_29 = arith.addi %tmp0_28, %r0_index_25 : tensor<1x4xi32> loc(#loc248) + %tmp0_30 = arith.constant 128 : i32 loc(#loc249) + %tmp0_31 = arith.constant 128 : i32 loc(#loc249) + %tmp0_32 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc249) + %tmp0_33 = arith.muli %tmp0_32, %x0_12 : tensor<64x1xi32> loc(#loc249) + %tmp0_34 = tt.broadcast %tmp0_29 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc250) + %tmp0_35 = tt.broadcast %tmp0_33 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc250) + %tmp0_36 = arith.addi %tmp0_34, %tmp0_35 : tensor<64x4xi32> loc(#loc250) + %tmp0_37 = arith.constant 36864 : i32 loc(#loc251) + %tmp0_38 = arith.constant 36864 : i32 loc(#loc251) + %tmp0_39 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc251) + %tmp0_40 = arith.muli %tmp0_39, %x1_15 : tensor<64x1xi32> loc(#loc251) + %tmp0_41 = tt.broadcast %tmp0_40 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc252) + %tmp0_42 = arith.addi %tmp0_36, %tmp0_41 : tensor<64x4xi32> loc(#loc252) + %tmp0_43 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc253) + %tmp0_44 = tt.addptr %tmp0_43, %tmp0_42 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc253) + %tmp0_45 = arith.constant 0.000000e+00 : f32 loc(#loc254) + %tmp0_46 = tt.broadcast %r0_mask_26 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc254) + %tmp0_47 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc254) + %tmp0_48 = arith.truncf %tmp0_47 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc254) + %tmp0_49 = tt.load %tmp0_44, %tmp0_46, %tmp0_48 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc254) + %tmp0_50 = arith.extf %tmp0_49 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc255) + %tmp6 = arith.constant 128 : i32 loc(#loc256) + %tmp6_51 = arith.constant 128 : i32 loc(#loc256) + %tmp6_52 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc256) + %tmp6_53 = arith.muli %tmp6_52, %x0_12 : tensor<64x1xi32> loc(#loc256) + %tmp6_54 = tt.broadcast %r0_index_25 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc257) + %tmp6_55 = tt.broadcast %tmp6_53 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc257) + %tmp6_56 = arith.addi %tmp6_54, %tmp6_55 : tensor<64x4xi32> loc(#loc257) + %tmp6_57 = arith.constant 36864 : i32 loc(#loc258) + %tmp6_58 = arith.constant 36864 : i32 loc(#loc258) + %tmp6_59 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc258) + %tmp6_60 = arith.muli %tmp6_59, %x1_15 : tensor<64x1xi32> loc(#loc258) + %tmp6_61 = tt.broadcast %tmp6_60 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc259) + %tmp6_62 = arith.addi %tmp6_56, %tmp6_61 : tensor<64x4xi32> loc(#loc259) + %tmp6_63 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc260) + %tmp6_64 = tt.addptr %tmp6_63, %tmp6_62 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc260) + %tmp6_65 = arith.constant 0.000000e+00 : f32 loc(#loc261) + %tmp6_66 = tt.broadcast %r0_mask_26 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc261) + %tmp6_67 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc261) + %tmp6_68 = arith.truncf %tmp6_67 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc261) + %tmp6_69 = tt.load %tmp6_64, %tmp6_66, %tmp6_68 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc261) + %tmp6_70 = arith.extf %tmp6_69 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc262) + %tmp2 = arith.mulf %tmp0_50, %tmp0_50 : tensor<64x4xf32> loc(#loc263) + %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<64x4xf32> loc(#loc264) + %_tmp4_71 = tt.broadcast %r0_mask_26 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc265) + %_tmp4_72 = arith.select %_tmp4_71, %tmp5, %_tmp4_23 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc265) + %tmp8 = arith.mulf %tmp6_70, %tmp6_70 : tensor<64x4xf32> loc(#loc266) + %tmp11 = arith.addf %_tmp10_24, %tmp8 : tensor<64x4xf32> loc(#loc267) + %_tmp10_73 = tt.broadcast %r0_mask_26 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc268) + %_tmp10_74 = arith.select %_tmp10_73, %tmp11, %_tmp10_24 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc268) + scf.yield %_tmp4_72, %_tmp10_74 : tensor<64x4xf32>, tensor<64x4xf32> loc(#loc39) + } loc(#loc435) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#0) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc269) + %tmp4_19 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc270) + %tmp10 = tt.call @"triton.language.standard.sum__fp32S64_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#1) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc271) + %tmp10_20 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc272) + %c0_i32_21 = arith.constant 0 : i32 loc(#loc44) + %c4_i32_22 = arith.constant 4 : i32 loc(#loc44) + %4 = arith.bitcast %c0_i32_21 : i32 to i32 loc(#loc44) + %5 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc44) + %6 = arith.bitcast %c4_i32_22 : i32 to i32 loc(#loc44) + %7 = ub.poison : i32 loc(#loc44) + scf.for %r0_offset = %4 to %5 step %6 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc273) + %r0_index_23 = arith.addi %r0_index, %r0_base_9 : tensor<1x4xi32> loc(#loc273) + %r0_mask = arith.constant dense<128> : tensor<1x4xi32> loc(#loc274) + %r0_mask_24 = arith.cmpi slt, %r0_index_23, %r0_mask : tensor<1x4xi32> loc(#loc274) + %r0_3 = arith.constant 2 : i32 loc(#loc275) + %r0_3_25 = arith.constant 2 : i32 loc(#loc275) + %r0_3_26 = arith.constant dense<2> : tensor<1x4xi32> loc(#loc275) + %r0_3_27 = arith.remsi %r0_index_23, %r0_3_26 : tensor<1x4xi32> loc(#loc275) + %r0_4 = arith.constant 2 : i32 loc(#loc276) + %r0_4_28 = arith.constant 2 : i32 loc(#loc276) + %r0_4_29 = arith.constant dense<2> : tensor<1x4xi32> loc(#loc276) + %r0_4_30 = arith.divsi %r0_index_23, %r0_4_29 : tensor<1x4xi32> loc(#loc276) + %tmp50 = arith.constant 128 : i32 loc(#loc277) + %tmp50_31 = arith.constant 128 : i32 loc(#loc277) + %tmp50_32 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc277) + %tmp50_33 = arith.muli %tmp50_32, %x0_12 : tensor<64x1xi32> loc(#loc277) + %tmp50_34 = tt.broadcast %r0_index_23 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc278) + %tmp50_35 = tt.broadcast %tmp50_33 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc278) + %tmp50_36 = arith.addi %tmp50_34, %tmp50_35 : tensor<64x4xi32> loc(#loc278) + %tmp50_37 = arith.constant 36864 : i32 loc(#loc279) + %tmp50_38 = arith.constant 36864 : i32 loc(#loc279) + %tmp50_39 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc279) + %tmp50_40 = arith.muli %tmp50_39, %x1_15 : tensor<64x1xi32> loc(#loc279) + %tmp50_41 = tt.broadcast %tmp50_40 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc280) + %tmp50_42 = arith.addi %tmp50_36, %tmp50_41 : tensor<64x4xi32> loc(#loc280) + %tmp50_43 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc281) + %tmp50_44 = tt.addptr %tmp50_43, %tmp50_42 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc281) + %tmp50_45 = arith.constant 0.000000e+00 : f32 loc(#loc282) + %tmp50_46 = tt.broadcast %r0_mask_24 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc282) + %tmp50_47 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc282) + %tmp50_48 = arith.truncf %tmp50_47 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc282) + %tmp50_49 = tt.load %tmp50_44, %tmp50_46, %tmp50_48 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc282) + %tmp50_50 = arith.extf %tmp50_49 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc283) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x4x!tt.ptr> loc(#loc284) + %tmp58_51 = tt.addptr %tmp58, %r0_index_23 : tensor<1x4x!tt.ptr>, tensor<1x4xi32> loc(#loc284) + %tmp58_52 = arith.constant 0.000000e+00 : f32 loc(#loc285) + %tmp58_53 = arith.constant dense<0.000000e+00> : tensor<1x4xf32> loc(#loc285) + %tmp58_54 = arith.truncf %tmp58_53 : tensor<1x4xf32> to tensor<1x4xbf16> loc(#loc285) + %tmp58_55 = tt.load %tmp58_51, %r0_mask_24, %tmp58_54 evictionPolicy = evict_last : tensor<1x4x!tt.ptr> loc(#loc285) + %tmp58_56 = arith.extf %tmp58_55 : tensor<1x4xbf16> to tensor<1x4xf32> loc(#loc286) + %tmp63 = arith.constant 128 : i32 loc(#loc287) + %tmp63_57 = arith.constant 128 : i32 loc(#loc287) + %tmp63_58 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc287) + %tmp63_59 = arith.muli %tmp63_58, %x1_15 : tensor<64x1xi32> loc(#loc287) + %tmp63_60 = tt.broadcast %r0_index_23 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc288) + %tmp63_61 = tt.broadcast %tmp63_59 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc288) + %tmp63_62 = arith.addi %tmp63_60, %tmp63_61 : tensor<64x4xi32> loc(#loc288) + %tmp63_63 = tt.splat %in_ptr2 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc289) + %tmp63_64 = tt.addptr %tmp63_63, %tmp63_62 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc289) + %tmp63_65 = arith.constant 0.000000e+00 : f32 loc(#loc290) + %tmp63_66 = tt.broadcast %r0_mask_24 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc290) + %tmp63_67 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc290) + %tmp63_68 = tt.load %tmp63_64, %tmp63_66, %tmp63_67 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc290) + %tmp66 = arith.constant 128 : i32 loc(#loc291) + %tmp66_69 = arith.constant 128 : i32 loc(#loc291) + %tmp66_70 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc291) + %tmp66_71 = arith.muli %tmp66_70, %x1_15 : tensor<64x1xi32> loc(#loc291) + %tmp66_72 = tt.broadcast %r0_index_23 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc292) + %tmp66_73 = tt.broadcast %tmp66_71 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc292) + %tmp66_74 = arith.addi %tmp66_72, %tmp66_73 : tensor<64x4xi32> loc(#loc292) + %tmp66_75 = tt.splat %in_ptr3 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc293) + %tmp66_76 = tt.addptr %tmp66_75, %tmp66_74 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc293) + %tmp66_77 = arith.constant 0.000000e+00 : f32 loc(#loc294) + %tmp66_78 = tt.broadcast %r0_mask_24 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc294) + %tmp66_79 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc294) + %tmp66_80 = tt.load %tmp66_76, %tmp66_78, %tmp66_79 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc294) + %tmp96 = arith.constant 4096 : i32 loc(#loc295) + %tmp96_81 = arith.constant 4096 : i32 loc(#loc295) + %tmp96_82 = arith.constant dense<4096> : tensor<1x4xi32> loc(#loc295) + %tmp96_83 = arith.addi %tmp96_82, %r0_index_23 : tensor<1x4xi32> loc(#loc295) + %tmp96_84 = arith.constant 128 : i32 loc(#loc296) + %tmp96_85 = arith.constant 128 : i32 loc(#loc296) + %tmp96_86 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc296) + %tmp96_87 = arith.muli %tmp96_86, %x0_12 : tensor<64x1xi32> loc(#loc296) + %tmp96_88 = tt.broadcast %tmp96_83 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc297) + %tmp96_89 = tt.broadcast %tmp96_87 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc297) + %tmp96_90 = arith.addi %tmp96_88, %tmp96_89 : tensor<64x4xi32> loc(#loc297) + %tmp96_91 = arith.constant 36864 : i32 loc(#loc298) + %tmp96_92 = arith.constant 36864 : i32 loc(#loc298) + %tmp96_93 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc298) + %tmp96_94 = arith.muli %tmp96_93, %x1_15 : tensor<64x1xi32> loc(#loc298) + %tmp96_95 = tt.broadcast %tmp96_94 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc299) + %tmp96_96 = arith.addi %tmp96_90, %tmp96_95 : tensor<64x4xi32> loc(#loc299) + %tmp96_97 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc300) + %tmp96_98 = tt.addptr %tmp96_97, %tmp96_96 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc300) + %tmp96_99 = arith.constant 0.000000e+00 : f32 loc(#loc301) + %tmp96_100 = tt.broadcast %r0_mask_24 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc301) + %tmp96_101 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc301) + %tmp96_102 = arith.truncf %tmp96_101 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc301) + %tmp96_103 = tt.load %tmp96_98, %tmp96_100, %tmp96_102 evictionPolicy = evict_first : tensor<64x4x!tt.ptr> loc(#loc301) + %tmp96_104 = arith.extf %tmp96_103 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc302) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x4x!tt.ptr> loc(#loc303) + %tmp102_105 = tt.addptr %tmp102, %r0_index_23 : tensor<1x4x!tt.ptr>, tensor<1x4xi32> loc(#loc303) + %tmp102_106 = arith.constant 0.000000e+00 : f32 loc(#loc304) + %tmp102_107 = arith.constant dense<0.000000e+00> : tensor<1x4xf32> loc(#loc304) + %tmp102_108 = arith.truncf %tmp102_107 : tensor<1x4xf32> to tensor<1x4xbf16> loc(#loc304) + %tmp102_109 = tt.load %tmp102_105, %r0_mask_24, %tmp102_108 evictionPolicy = evict_last : tensor<1x4x!tt.ptr> loc(#loc304) + %tmp102_110 = arith.extf %tmp102_109 : tensor<1x4xbf16> to tensor<1x4xf32> loc(#loc305) + %tmp13 = arith.constant 0 : i64 loc(#loc306) + %tmp13_111 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc306) + %tmp14 = arith.extsi %r0_3_27 : tensor<1x4xi32> to tensor<1x4xi64> loc(#loc307) + %tmp14_112 = arith.constant dense<0> : tensor<1x4xi64> loc(#loc307) + %tmp14_113 = arith.cmpi sge, %tmp14, %tmp14_112 : tensor<1x4xi64> loc(#loc307) + %tmp15 = arith.constant 1 : i64 loc(#loc308) + %tmp15_114 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc308) + %tmp16 = arith.extsi %r0_3_27 : tensor<1x4xi32> to tensor<1x4xi64> loc(#loc309) + %tmp16_115 = arith.constant dense<1> : tensor<1x4xi64> loc(#loc309) + %tmp16_116 = arith.cmpi slt, %tmp16, %tmp16_115 : tensor<1x4xi64> loc(#loc309) + %tmp17 = arith.constant 2 : i32 loc(#loc310) + %tmp17_117 = arith.constant 2 : i32 loc(#loc310) + %tmp17_118 = arith.constant dense<2> : tensor<1x4xi32> loc(#loc310) + %tmp17_119 = arith.muli %tmp17_118, %r0_4_30 : tensor<1x4xi32> loc(#loc310) + %tmp17_120 = arith.constant 1 : i32 loc(#loc311) + %tmp17_121 = arith.constant 1 : i32 loc(#loc311) + %tmp17_122 = arith.constant dense<1> : tensor<1x4xi32> loc(#loc311) + %tmp17_123 = arith.addi %tmp17_122, %tmp17_119 : tensor<1x4xi32> loc(#loc311) + %tmp17_124 = arith.constant 128 : i32 loc(#loc312) + %tmp17_125 = arith.constant 128 : i32 loc(#loc312) + %tmp17_126 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc312) + %tmp17_127 = arith.muli %tmp17_126, %x0_12 : tensor<64x1xi32> loc(#loc312) + %tmp17_128 = tt.broadcast %tmp17_123 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc313) + %tmp17_129 = tt.broadcast %tmp17_127 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc313) + %tmp17_130 = arith.addi %tmp17_128, %tmp17_129 : tensor<64x4xi32> loc(#loc313) + %tmp17_131 = arith.constant 36864 : i32 loc(#loc314) + %tmp17_132 = arith.constant 36864 : i32 loc(#loc314) + %tmp17_133 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc314) + %tmp17_134 = arith.muli %tmp17_133, %x1_15 : tensor<64x1xi32> loc(#loc314) + %tmp17_135 = tt.broadcast %tmp17_134 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc315) + %tmp17_136 = arith.addi %tmp17_130, %tmp17_135 : tensor<64x4xi32> loc(#loc315) + %tmp17_137 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc316) + %tmp17_138 = tt.addptr %tmp17_137, %tmp17_136 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc316) + %tmp17_139 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x4xi1> loc(#loc317) + %tmp17_140 = arith.constant 0.000000e+00 : f32 loc(#loc318) + %tmp17_141 = tt.broadcast %tmp17_139 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc318) + %tmp17_142 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc318) + %tmp17_143 = arith.truncf %tmp17_142 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc318) + %tmp17_144 = tt.load %tmp17_138, %tmp17_141, %tmp17_143 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc318) + %tmp17_145 = arith.extf %tmp17_144 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc319) + %tmp19 = arith.constant 1.280000e+02 : f32 loc(#loc320) + %tmp20 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc321) + %tmp20_146 = arith.divf %tmp10_20, %tmp20 : tensor<64x1xf32> loc(#loc321) + %tmp21 = arith.constant 9.99999997E-7 : f32 loc(#loc322) + %tmp22 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc323) + %tmp22_147 = arith.addf %tmp20_146, %tmp22 : tensor<64x1xf32> loc(#loc323) + %tmp23 = tt.extern_elementwise %tmp22_147 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc324) + %tmp24 = tt.broadcast %tmp23 : tensor<64x1xf32> -> tensor<64x4xf32> loc(#loc325) + %tmp24_148 = arith.mulf %tmp17_145, %tmp24 : tensor<64x4xf32> loc(#loc325) + %tmp25 = arith.constant 2 : i32 loc(#loc326) + %tmp25_149 = arith.constant 2 : i32 loc(#loc326) + %tmp25_150 = arith.constant dense<2> : tensor<1x4xi32> loc(#loc326) + %tmp25_151 = arith.muli %tmp25_150, %r0_4_30 : tensor<1x4xi32> loc(#loc326) + %tmp25_152 = arith.constant 1 : i32 loc(#loc327) + %tmp25_153 = arith.constant 1 : i32 loc(#loc327) + %tmp25_154 = arith.constant dense<1> : tensor<1x4xi32> loc(#loc327) + %tmp25_155 = arith.addi %tmp25_154, %tmp25_151 : tensor<1x4xi32> loc(#loc327) + %tmp25_156 = tt.broadcast %tmp25_155 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc328) + %tmp25_157 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc329) + %tmp25_158 = tt.addptr %tmp25_157, %tmp25_156 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc329) + %tmp25_159 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x4xi1> loc(#loc330) + %tmp25_160 = arith.constant 0.000000e+00 : f32 loc(#loc331) + %tmp25_161 = tt.broadcast %tmp25_159 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc331) + %tmp25_162 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc331) + %tmp25_163 = arith.truncf %tmp25_162 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc331) + %tmp25_164 = tt.load %tmp25_158, %tmp25_161, %tmp25_163 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc331) + %tmp25_165 = arith.extf %tmp25_164 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc332) + %tmp27 = arith.mulf %tmp24_148, %tmp25_165 : tensor<64x4xf32> loc(#loc333) + %tmp29 = arith.constant 0.000000e+00 : f32 loc(#loc334) + %tmp29_166 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc334) + %tmp29_167 = arith.subf %tmp29_166, %tmp27 : tensor<64x4xf32> loc(#loc334) + %tmp30 = arith.constant 0.000000e+00 : f32 loc(#loc335) + %tmp30_168 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc335) + %tmp31 = tt.broadcast %tmp16_116 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc336) + %tmp31_169 = arith.select %tmp31, %tmp29_167, %tmp30_168 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc336) + %tmp32 = arith.extsi %r0_3_27 : tensor<1x4xi32> to tensor<1x4xi64> loc(#loc337) + %tmp32_170 = arith.constant dense<1> : tensor<1x4xi64> loc(#loc337) + %tmp32_171 = arith.cmpi sge, %tmp32, %tmp32_170 : tensor<1x4xi64> loc(#loc337) + %tmp33 = arith.constant 2 : i64 loc(#loc338) + %tmp33_172 = arith.constant dense<2> : tensor<1x1xi64> loc(#loc338) + %tmp34 = arith.extsi %r0_3_27 : tensor<1x4xi32> to tensor<1x4xi64> loc(#loc339) + %tmp34_173 = arith.constant dense<2> : tensor<1x4xi64> loc(#loc339) + %tmp34_174 = arith.cmpi slt, %tmp34, %tmp34_173 : tensor<1x4xi64> loc(#loc339) + %tmp35 = arith.constant 2 : i32 loc(#loc340) + %tmp35_175 = arith.constant 2 : i32 loc(#loc340) + %tmp35_176 = arith.constant dense<2> : tensor<1x4xi32> loc(#loc340) + %tmp35_177 = arith.muli %tmp35_176, %r0_4_30 : tensor<1x4xi32> loc(#loc340) + %tmp35_178 = arith.constant 128 : i32 loc(#loc341) + %tmp35_179 = arith.constant 128 : i32 loc(#loc341) + %tmp35_180 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc341) + %tmp35_181 = arith.muli %tmp35_180, %x0_12 : tensor<64x1xi32> loc(#loc341) + %tmp35_182 = tt.broadcast %tmp35_177 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc342) + %tmp35_183 = tt.broadcast %tmp35_181 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc342) + %tmp35_184 = arith.addi %tmp35_182, %tmp35_183 : tensor<64x4xi32> loc(#loc342) + %tmp35_185 = arith.constant 36864 : i32 loc(#loc343) + %tmp35_186 = arith.constant 36864 : i32 loc(#loc343) + %tmp35_187 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc343) + %tmp35_188 = arith.muli %tmp35_187, %x1_15 : tensor<64x1xi32> loc(#loc343) + %tmp35_189 = tt.broadcast %tmp35_188 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc344) + %tmp35_190 = arith.addi %tmp35_184, %tmp35_189 : tensor<64x4xi32> loc(#loc344) + %tmp35_191 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc345) + %tmp35_192 = tt.addptr %tmp35_191, %tmp35_190 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc345) + %tmp35_193 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x4xi1> loc(#loc346) + %tmp35_194 = arith.constant 0.000000e+00 : f32 loc(#loc347) + %tmp35_195 = tt.broadcast %tmp35_193 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc347) + %tmp35_196 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc347) + %tmp35_197 = arith.truncf %tmp35_196 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc347) + %tmp35_198 = tt.load %tmp35_192, %tmp35_195, %tmp35_197 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc347) + %tmp35_199 = arith.extf %tmp35_198 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc348) + %tmp37 = arith.constant 1.280000e+02 : f32 loc(#loc349) + %tmp38 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc350) + %tmp38_200 = arith.divf %tmp10_20, %tmp38 : tensor<64x1xf32> loc(#loc350) + %tmp39 = arith.constant 9.99999997E-7 : f32 loc(#loc351) + %tmp40 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc352) + %tmp40_201 = arith.addf %tmp38_200, %tmp40 : tensor<64x1xf32> loc(#loc352) + %tmp41 = tt.extern_elementwise %tmp40_201 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc353) + %tmp42 = tt.broadcast %tmp41 : tensor<64x1xf32> -> tensor<64x4xf32> loc(#loc354) + %tmp42_202 = arith.mulf %tmp35_199, %tmp42 : tensor<64x4xf32> loc(#loc354) + %tmp43 = arith.constant 2 : i32 loc(#loc355) + %tmp43_203 = arith.constant 2 : i32 loc(#loc355) + %tmp43_204 = arith.constant dense<2> : tensor<1x4xi32> loc(#loc355) + %tmp43_205 = arith.muli %tmp43_204, %r0_4_30 : tensor<1x4xi32> loc(#loc355) + %tmp43_206 = tt.broadcast %tmp43_205 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc356) + %tmp43_207 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc357) + %tmp43_208 = tt.addptr %tmp43_207, %tmp43_206 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc357) + %tmp43_209 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x4xi1> loc(#loc358) + %tmp43_210 = arith.constant 0.000000e+00 : f32 loc(#loc359) + %tmp43_211 = tt.broadcast %tmp43_209 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc359) + %tmp43_212 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc359) + %tmp43_213 = arith.truncf %tmp43_212 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc359) + %tmp43_214 = tt.load %tmp43_208, %tmp43_211, %tmp43_213 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc359) + %tmp43_215 = arith.extf %tmp43_214 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc360) + %tmp45 = arith.mulf %tmp42_202, %tmp43_215 : tensor<64x4xf32> loc(#loc361) + %tmp47 = arith.constant 0.000000e+00 : f32 loc(#loc362) + %tmp47_216 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc362) + %tmp48 = tt.broadcast %tmp32_171 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc363) + %tmp48_217 = arith.select %tmp48, %tmp45, %tmp47_216 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc363) + %tmp49 = tt.broadcast %tmp16_116 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc364) + %tmp49_218 = arith.select %tmp49, %tmp31_169, %tmp48_217 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc364) + %tmp52 = arith.constant 1.280000e+02 : f32 loc(#loc365) + %tmp53 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc366) + %tmp53_219 = arith.divf %tmp10_20, %tmp53 : tensor<64x1xf32> loc(#loc366) + %tmp54 = arith.constant 9.99999997E-7 : f32 loc(#loc367) + %tmp55 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc368) + %tmp55_220 = arith.addf %tmp53_219, %tmp55 : tensor<64x1xf32> loc(#loc368) + %tmp56 = tt.extern_elementwise %tmp55_220 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc369) + %tmp57 = tt.broadcast %tmp56 : tensor<64x1xf32> -> tensor<64x4xf32> loc(#loc370) + %tmp57_221 = arith.mulf %tmp50_50, %tmp57 : tensor<64x4xf32> loc(#loc370) + %tmp60 = tt.broadcast %tmp58_56 : tensor<1x4xf32> -> tensor<64x4xf32> loc(#loc371) + %tmp60_222 = arith.mulf %tmp57_221, %tmp60 : tensor<64x4xf32> loc(#loc371) + %tmp64 = arith.mulf %tmp60_222, %tmp63_68 : tensor<64x4xf32> loc(#loc372) + %tmp67 = arith.mulf %tmp49_218, %tmp66_80 : tensor<64x4xf32> loc(#loc373) + %tmp68 = arith.addf %tmp64, %tmp67 : tensor<64x4xf32> loc(#loc374) + %tmp70 = arith.constant 2 : i32 loc(#loc375) + %tmp70_223 = arith.constant 2 : i32 loc(#loc375) + %tmp70_224 = arith.constant dense<2> : tensor<1x4xi32> loc(#loc375) + %tmp70_225 = arith.muli %tmp70_224, %r0_4_30 : tensor<1x4xi32> loc(#loc375) + %tmp70_226 = arith.constant 4097 : i32 loc(#loc376) + %tmp70_227 = arith.constant 4097 : i32 loc(#loc376) + %tmp70_228 = arith.constant dense<4097> : tensor<1x4xi32> loc(#loc376) + %tmp70_229 = arith.addi %tmp70_228, %tmp70_225 : tensor<1x4xi32> loc(#loc376) + %tmp70_230 = arith.constant 128 : i32 loc(#loc377) + %tmp70_231 = arith.constant 128 : i32 loc(#loc377) + %tmp70_232 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc377) + %tmp70_233 = arith.muli %tmp70_232, %x0_12 : tensor<64x1xi32> loc(#loc377) + %tmp70_234 = tt.broadcast %tmp70_229 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc378) + %tmp70_235 = tt.broadcast %tmp70_233 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc378) + %tmp70_236 = arith.addi %tmp70_234, %tmp70_235 : tensor<64x4xi32> loc(#loc378) + %tmp70_237 = arith.constant 36864 : i32 loc(#loc379) + %tmp70_238 = arith.constant 36864 : i32 loc(#loc379) + %tmp70_239 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc379) + %tmp70_240 = arith.muli %tmp70_239, %x1_15 : tensor<64x1xi32> loc(#loc379) + %tmp70_241 = tt.broadcast %tmp70_240 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc380) + %tmp70_242 = arith.addi %tmp70_236, %tmp70_241 : tensor<64x4xi32> loc(#loc380) + %tmp70_243 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc381) + %tmp70_244 = tt.addptr %tmp70_243, %tmp70_242 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc381) + %tmp70_245 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x4xi1> loc(#loc382) + %tmp70_246 = arith.constant 0.000000e+00 : f32 loc(#loc383) + %tmp70_247 = tt.broadcast %tmp70_245 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc383) + %tmp70_248 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc383) + %tmp70_249 = arith.truncf %tmp70_248 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc383) + %tmp70_250 = tt.load %tmp70_244, %tmp70_247, %tmp70_249 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc383) + %tmp70_251 = arith.extf %tmp70_250 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc384) + %tmp72 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc385) + %tmp72_252 = arith.divf %tmp4_19, %tmp72 : tensor<64x1xf32> loc(#loc385) + %tmp73 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc386) + %tmp73_253 = arith.addf %tmp72_252, %tmp73 : tensor<64x1xf32> loc(#loc386) + %tmp74 = tt.extern_elementwise %tmp73_253 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc387) + %tmp75 = tt.broadcast %tmp74 : tensor<64x1xf32> -> tensor<64x4xf32> loc(#loc388) + %tmp75_254 = arith.mulf %tmp70_251, %tmp75 : tensor<64x4xf32> loc(#loc388) + %tmp76 = arith.constant 2 : i32 loc(#loc389) + %tmp76_255 = arith.constant 2 : i32 loc(#loc389) + %tmp76_256 = arith.constant dense<2> : tensor<1x4xi32> loc(#loc389) + %tmp76_257 = arith.muli %tmp76_256, %r0_4_30 : tensor<1x4xi32> loc(#loc389) + %tmp76_258 = arith.constant 1 : i32 loc(#loc390) + %tmp76_259 = arith.constant 1 : i32 loc(#loc390) + %tmp76_260 = arith.constant dense<1> : tensor<1x4xi32> loc(#loc390) + %tmp76_261 = arith.addi %tmp76_260, %tmp76_257 : tensor<1x4xi32> loc(#loc390) + %tmp76_262 = tt.broadcast %tmp76_261 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc391) + %tmp76_263 = tt.splat %in_ptr4 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc392) + %tmp76_264 = tt.addptr %tmp76_263, %tmp76_262 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc392) + %tmp76_265 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x4xi1> loc(#loc393) + %tmp76_266 = arith.constant 0.000000e+00 : f32 loc(#loc394) + %tmp76_267 = tt.broadcast %tmp76_265 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc394) + %tmp76_268 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc394) + %tmp76_269 = arith.truncf %tmp76_268 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc394) + %tmp76_270 = tt.load %tmp76_264, %tmp76_267, %tmp76_269 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc394) + %tmp76_271 = arith.extf %tmp76_270 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc395) + %tmp78 = arith.mulf %tmp75_254, %tmp76_271 : tensor<64x4xf32> loc(#loc396) + %tmp80 = arith.constant 0.000000e+00 : f32 loc(#loc397) + %tmp80_272 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc397) + %tmp80_273 = arith.subf %tmp80_272, %tmp78 : tensor<64x4xf32> loc(#loc397) + %tmp81 = arith.constant 0.000000e+00 : f32 loc(#loc398) + %tmp81_274 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc398) + %tmp82 = tt.broadcast %tmp16_116 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc399) + %tmp82_275 = arith.select %tmp82, %tmp80_273, %tmp81_274 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc399) + %tmp83 = arith.constant 2 : i32 loc(#loc400) + %tmp83_276 = arith.constant 2 : i32 loc(#loc400) + %tmp83_277 = arith.constant dense<2> : tensor<1x4xi32> loc(#loc400) + %tmp83_278 = arith.muli %tmp83_277, %r0_4_30 : tensor<1x4xi32> loc(#loc400) + %tmp83_279 = arith.constant 4096 : i32 loc(#loc401) + %tmp83_280 = arith.constant 4096 : i32 loc(#loc401) + %tmp83_281 = arith.constant dense<4096> : tensor<1x4xi32> loc(#loc401) + %tmp83_282 = arith.addi %tmp83_281, %tmp83_278 : tensor<1x4xi32> loc(#loc401) + %tmp83_283 = arith.constant 128 : i32 loc(#loc402) + %tmp83_284 = arith.constant 128 : i32 loc(#loc402) + %tmp83_285 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc402) + %tmp83_286 = arith.muli %tmp83_285, %x0_12 : tensor<64x1xi32> loc(#loc402) + %tmp83_287 = tt.broadcast %tmp83_282 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc403) + %tmp83_288 = tt.broadcast %tmp83_286 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc403) + %tmp83_289 = arith.addi %tmp83_287, %tmp83_288 : tensor<64x4xi32> loc(#loc403) + %tmp83_290 = arith.constant 36864 : i32 loc(#loc404) + %tmp83_291 = arith.constant 36864 : i32 loc(#loc404) + %tmp83_292 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc404) + %tmp83_293 = arith.muli %tmp83_292, %x1_15 : tensor<64x1xi32> loc(#loc404) + %tmp83_294 = tt.broadcast %tmp83_293 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc405) + %tmp83_295 = arith.addi %tmp83_289, %tmp83_294 : tensor<64x4xi32> loc(#loc405) + %tmp83_296 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc406) + %tmp83_297 = tt.addptr %tmp83_296, %tmp83_295 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc406) + %tmp83_298 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x4xi1> loc(#loc407) + %tmp83_299 = arith.constant 0.000000e+00 : f32 loc(#loc408) + %tmp83_300 = tt.broadcast %tmp83_298 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc408) + %tmp83_301 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc408) + %tmp83_302 = arith.truncf %tmp83_301 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc408) + %tmp83_303 = tt.load %tmp83_297, %tmp83_300, %tmp83_302 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc408) + %tmp83_304 = arith.extf %tmp83_303 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc409) + %tmp85 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc410) + %tmp85_305 = arith.divf %tmp4_19, %tmp85 : tensor<64x1xf32> loc(#loc410) + %tmp86 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc411) + %tmp86_306 = arith.addf %tmp85_305, %tmp86 : tensor<64x1xf32> loc(#loc411) + %tmp87 = tt.extern_elementwise %tmp86_306 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc412) + %tmp88 = tt.broadcast %tmp87 : tensor<64x1xf32> -> tensor<64x4xf32> loc(#loc413) + %tmp88_307 = arith.mulf %tmp83_304, %tmp88 : tensor<64x4xf32> loc(#loc413) + %tmp89 = arith.constant 2 : i32 loc(#loc414) + %tmp89_308 = arith.constant 2 : i32 loc(#loc414) + %tmp89_309 = arith.constant dense<2> : tensor<1x4xi32> loc(#loc414) + %tmp89_310 = arith.muli %tmp89_309, %r0_4_30 : tensor<1x4xi32> loc(#loc414) + %tmp89_311 = tt.broadcast %tmp89_310 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc415) + %tmp89_312 = tt.splat %in_ptr4 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc416) + %tmp89_313 = tt.addptr %tmp89_312, %tmp89_311 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc416) + %tmp89_314 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x4xi1> loc(#loc417) + %tmp89_315 = arith.constant 0.000000e+00 : f32 loc(#loc418) + %tmp89_316 = tt.broadcast %tmp89_314 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc418) + %tmp89_317 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc418) + %tmp89_318 = arith.truncf %tmp89_317 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc418) + %tmp89_319 = tt.load %tmp89_313, %tmp89_316, %tmp89_318 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc418) + %tmp89_320 = arith.extf %tmp89_319 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc419) + %tmp91 = arith.mulf %tmp88_307, %tmp89_320 : tensor<64x4xf32> loc(#loc420) + %tmp93 = arith.constant 0.000000e+00 : f32 loc(#loc421) + %tmp93_321 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc421) + %tmp94 = tt.broadcast %tmp32_171 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc422) + %tmp94_322 = arith.select %tmp94, %tmp91, %tmp93_321 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc422) + %tmp95 = tt.broadcast %tmp16_116 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc423) + %tmp95_323 = arith.select %tmp95, %tmp82_275, %tmp94_322 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc423) + %tmp98 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc424) + %tmp98_324 = arith.divf %tmp4_19, %tmp98 : tensor<64x1xf32> loc(#loc424) + %tmp99 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc425) + %tmp99_325 = arith.addf %tmp98_324, %tmp99 : tensor<64x1xf32> loc(#loc425) + %tmp100 = tt.extern_elementwise %tmp99_325 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc426) + %tmp101 = tt.broadcast %tmp100 : tensor<64x1xf32> -> tensor<64x4xf32> loc(#loc427) + %tmp101_326 = arith.mulf %tmp96_104, %tmp101 : tensor<64x4xf32> loc(#loc427) + %tmp104 = tt.broadcast %tmp102_110 : tensor<1x4xf32> -> tensor<64x4xf32> loc(#loc428) + %tmp104_327 = arith.mulf %tmp101_326, %tmp104 : tensor<64x4xf32> loc(#loc428) + %tmp107 = arith.mulf %tmp104_327, %tmp63_68 : tensor<64x4xf32> loc(#loc429) + %tmp109 = arith.mulf %tmp95_323, %tmp66_80 : tensor<64x4xf32> loc(#loc430) + %tmp110 = arith.addf %tmp107, %tmp109 : tensor<64x4xf32> loc(#loc431) + %c128_i32 = arith.constant 128 : i32 loc(#loc204) + %c128_i32_328 = arith.constant 128 : i32 loc(#loc204) + %cst = arith.constant dense<128> : tensor<64x1xi32> loc(#loc204) + %8 = arith.muli %cst, %xindex_7 : tensor<64x1xi32> loc(#loc204) + %9 = tt.broadcast %r0_index_23 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc205) + %10 = tt.broadcast %8 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc205) + %11 = arith.addi %9, %10 : tensor<64x4xi32> loc(#loc205) + %12 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc206) + %13 = tt.addptr %12, %11 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc206) + %14 = tt.broadcast %r0_mask_24 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc207) + %15 = arith.truncf %tmp68 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc207) + tt.store %13, %15, %14 : tensor<64x4x!tt.ptr> loc(#loc207) + %c128_i32_329 = arith.constant 128 : i32 loc(#loc208) + %c128_i32_330 = arith.constant 128 : i32 loc(#loc208) + %cst_331 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc208) + %16 = arith.muli %cst_331, %xindex_7 : tensor<64x1xi32> loc(#loc208) + %17 = tt.broadcast %r0_index_23 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc209) + %18 = tt.broadcast %16 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc209) + %19 = arith.addi %17, %18 : tensor<64x4xi32> loc(#loc209) + %20 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc210) + %21 = tt.addptr %20, %19 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc210) + %22 = tt.broadcast %r0_mask_24 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc211) + %23 = arith.truncf %tmp110 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc211) + tt.store %21, %23, %22 : tensor<64x4x!tt.ptr> loc(#loc211) + } loc(#loc44) + tt.return loc(#loc212) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S64_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x4xf32> loc("input"(#loc213))) -> tensor<64xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc214) + tt.reduce.return %2 : f32 loc(#loc214) + }) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc214) + tt.return %0 : tensor<64xf32> loc(#loc216) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64xf32> loc(#loc217) + tt.return %1 : tensor<64xf32> loc(#loc217) + } loc(#loc213) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc218)), %b: f32 loc("b"(#loc218))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc219) + tt.return %0 : f32 loc(#loc220) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc221) + tt.return %1 : f32 loc(#loc221) + } loc(#loc218) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":25:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":30:43) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":32:44) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:45) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:56) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:46) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:42) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:53) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:64) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":68:35) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":69:25) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":70:35) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:52) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:63) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":74:16) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":76:16) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:57) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:55) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:63) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:95) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":85:42) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":88:35) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":89:24) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:37) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:48) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:59) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":92:16) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":93:25) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":94:16) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":95:24) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":96:32) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:53) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:59) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:91) +#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":102:42) +#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":106:16) +#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":107:25) +#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":108:16) +#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":109:24) +#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":110:32) +#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:44) +#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc149 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:55) +#loc150 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc151 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:66) +#loc152 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc153 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc154 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:81) +#loc155 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc156 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc157 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc158 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc159 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc160 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc161 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:57) +#loc162 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:55) +#loc163 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:63) +#loc164 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc165 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:95) +#loc166 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc167 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc168 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc169 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc170 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":132:42) +#loc171 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc172 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:44) +#loc173 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc174 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:55) +#loc175 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc176 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:66) +#loc177 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc178 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc179 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:81) +#loc180 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc181 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc182 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":136:24) +#loc183 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":137:24) +#loc184 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":138:32) +#loc185 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc186 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:53) +#loc187 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:59) +#loc188 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc189 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:91) +#loc190 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc191 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc192 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc193 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":144:42) +#loc194 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc195 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc196 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":148:24) +#loc197 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":149:24) +#loc198 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":150:33) +#loc199 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc200 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc201 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc202 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc203 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc204 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc205 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc206 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc207 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc208 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:43) +#loc209 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:39) +#loc210 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc211 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc212 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc214 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc216 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11) +#loc217 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4) +#loc219 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc220 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11) +#loc221 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4) +#loc231 = loc("xnumel"(#loc1)) +#loc232 = loc("r0_numel"(#loc2)) +#loc233 = loc("xoffset"(#loc3)) +#loc234 = loc("xoffset"(#loc4)) +#loc235 = loc("xindex"(#loc5)) +#loc236 = loc("xindex"(#loc6)) +#loc237 = loc("xindex"(#loc7)) +#loc238 = loc("xmask"(#loc8)) +#loc239 = loc("r0_base"(#loc9)) +#loc240 = loc("r0_base"(#loc10)) +#loc241 = loc("x0"(#loc11)) +#loc242 = loc("x1"(#loc12)) +#loc243 = loc("_tmp4"(#loc13)) +#loc244 = loc("_tmp10"(#loc14)) +#loc245 = loc("_tmp4"(#loc15)) +#loc246 = loc("r0_index"(#loc16)) +#loc247 = loc("r0_mask"(#loc17)) +#loc248 = loc("tmp0"(#loc18)) +#loc249 = loc("tmp0"(#loc19)) +#loc250 = loc("tmp0"(#loc20)) +#loc251 = loc("tmp0"(#loc21)) +#loc252 = loc("tmp0"(#loc22)) +#loc253 = loc("tmp0"(#loc23)) +#loc254 = loc("tmp0"(#loc24)) +#loc255 = loc("tmp0"(#loc25)) +#loc256 = loc("tmp6"(#loc26)) +#loc257 = loc("tmp6"(#loc27)) +#loc258 = loc("tmp6"(#loc28)) +#loc259 = loc("tmp6"(#loc29)) +#loc260 = loc("tmp6"(#loc30)) +#loc261 = loc("tmp6"(#loc31)) +#loc262 = loc("tmp6"(#loc32)) +#loc263 = loc("tmp2"(#loc33)) +#loc264 = loc("tmp5"(#loc34)) +#loc265 = loc("_tmp4"(#loc35)) +#loc266 = loc("tmp8"(#loc36)) +#loc267 = loc("tmp11"(#loc37)) +#loc268 = loc("_tmp10"(#loc38)) +#loc269 = loc("tmp4"(#loc40)) +#loc270 = loc("tmp4"(#loc41)) +#loc271 = loc("tmp10"(#loc42)) +#loc272 = loc("tmp10"(#loc43)) +#loc273 = loc("r0_index"(#loc45)) +#loc274 = loc("r0_mask"(#loc46)) +#loc275 = loc("r0_3"(#loc47)) +#loc276 = loc("r0_4"(#loc48)) +#loc277 = loc("tmp50"(#loc49)) +#loc278 = loc("tmp50"(#loc50)) +#loc279 = loc("tmp50"(#loc51)) +#loc280 = loc("tmp50"(#loc52)) +#loc281 = loc("tmp50"(#loc53)) +#loc282 = loc("tmp50"(#loc54)) +#loc283 = loc("tmp50"(#loc55)) +#loc284 = loc("tmp58"(#loc56)) +#loc285 = loc("tmp58"(#loc57)) +#loc286 = loc("tmp58"(#loc58)) +#loc287 = loc("tmp63"(#loc59)) +#loc288 = loc("tmp63"(#loc60)) +#loc289 = loc("tmp63"(#loc61)) +#loc290 = loc("tmp63"(#loc62)) +#loc291 = loc("tmp66"(#loc63)) +#loc292 = loc("tmp66"(#loc64)) +#loc293 = loc("tmp66"(#loc65)) +#loc294 = loc("tmp66"(#loc66)) +#loc295 = loc("tmp96"(#loc67)) +#loc296 = loc("tmp96"(#loc68)) +#loc297 = loc("tmp96"(#loc69)) +#loc298 = loc("tmp96"(#loc70)) +#loc299 = loc("tmp96"(#loc71)) +#loc300 = loc("tmp96"(#loc72)) +#loc301 = loc("tmp96"(#loc73)) +#loc302 = loc("tmp96"(#loc74)) +#loc303 = loc("tmp102"(#loc75)) +#loc304 = loc("tmp102"(#loc76)) +#loc305 = loc("tmp102"(#loc77)) +#loc306 = loc("tmp13"(#loc78)) +#loc307 = loc("tmp14"(#loc79)) +#loc308 = loc("tmp15"(#loc80)) +#loc309 = loc("tmp16"(#loc81)) +#loc310 = loc("tmp17"(#loc82)) +#loc311 = loc("tmp17"(#loc83)) +#loc312 = loc("tmp17"(#loc84)) +#loc313 = loc("tmp17"(#loc85)) +#loc314 = loc("tmp17"(#loc86)) +#loc315 = loc("tmp17"(#loc87)) +#loc316 = loc("tmp17"(#loc88)) +#loc317 = loc("tmp17"(#loc89)) +#loc318 = loc("tmp17"(#loc90)) +#loc319 = loc("tmp17"(#loc91)) +#loc320 = loc("tmp19"(#loc92)) +#loc321 = loc("tmp20"(#loc93)) +#loc322 = loc("tmp21"(#loc94)) +#loc323 = loc("tmp22"(#loc95)) +#loc324 = loc("tmp23"(#loc96)) +#loc325 = loc("tmp24"(#loc97)) +#loc326 = loc("tmp25"(#loc98)) +#loc327 = loc("tmp25"(#loc99)) +#loc328 = loc("tmp25"(#loc100)) +#loc329 = loc("tmp25"(#loc101)) +#loc330 = loc("tmp25"(#loc102)) +#loc331 = loc("tmp25"(#loc103)) +#loc332 = loc("tmp25"(#loc104)) +#loc333 = loc("tmp27"(#loc105)) +#loc334 = loc("tmp29"(#loc106)) +#loc335 = loc("tmp30"(#loc107)) +#loc336 = loc("tmp31"(#loc108)) +#loc337 = loc("tmp32"(#loc109)) +#loc338 = loc("tmp33"(#loc110)) +#loc339 = loc("tmp34"(#loc111)) +#loc340 = loc("tmp35"(#loc112)) +#loc341 = loc("tmp35"(#loc113)) +#loc342 = loc("tmp35"(#loc114)) +#loc343 = loc("tmp35"(#loc115)) +#loc344 = loc("tmp35"(#loc116)) +#loc345 = loc("tmp35"(#loc117)) +#loc346 = loc("tmp35"(#loc118)) +#loc347 = loc("tmp35"(#loc119)) +#loc348 = loc("tmp35"(#loc120)) +#loc349 = loc("tmp37"(#loc121)) +#loc350 = loc("tmp38"(#loc122)) +#loc351 = loc("tmp39"(#loc123)) +#loc352 = loc("tmp40"(#loc124)) +#loc353 = loc("tmp41"(#loc125)) +#loc354 = loc("tmp42"(#loc126)) +#loc355 = loc("tmp43"(#loc127)) +#loc356 = loc("tmp43"(#loc128)) +#loc357 = loc("tmp43"(#loc129)) +#loc358 = loc("tmp43"(#loc130)) +#loc359 = loc("tmp43"(#loc131)) +#loc360 = loc("tmp43"(#loc132)) +#loc361 = loc("tmp45"(#loc133)) +#loc362 = loc("tmp47"(#loc134)) +#loc363 = loc("tmp48"(#loc135)) +#loc364 = loc("tmp49"(#loc136)) +#loc365 = loc("tmp52"(#loc137)) +#loc366 = loc("tmp53"(#loc138)) +#loc367 = loc("tmp54"(#loc139)) +#loc368 = loc("tmp55"(#loc140)) +#loc369 = loc("tmp56"(#loc141)) +#loc370 = loc("tmp57"(#loc142)) +#loc371 = loc("tmp60"(#loc143)) +#loc372 = loc("tmp64"(#loc144)) +#loc373 = loc("tmp67"(#loc145)) +#loc374 = loc("tmp68"(#loc146)) +#loc375 = loc("tmp70"(#loc147)) +#loc376 = loc("tmp70"(#loc148)) +#loc377 = loc("tmp70"(#loc149)) +#loc378 = loc("tmp70"(#loc150)) +#loc379 = loc("tmp70"(#loc151)) +#loc380 = loc("tmp70"(#loc152)) +#loc381 = loc("tmp70"(#loc153)) +#loc382 = loc("tmp70"(#loc154)) +#loc383 = loc("tmp70"(#loc155)) +#loc384 = loc("tmp70"(#loc156)) +#loc385 = loc("tmp72"(#loc157)) +#loc386 = loc("tmp73"(#loc158)) +#loc387 = loc("tmp74"(#loc159)) +#loc388 = loc("tmp75"(#loc160)) +#loc389 = loc("tmp76"(#loc161)) +#loc390 = loc("tmp76"(#loc162)) +#loc391 = loc("tmp76"(#loc163)) +#loc392 = loc("tmp76"(#loc164)) +#loc393 = loc("tmp76"(#loc165)) +#loc394 = loc("tmp76"(#loc166)) +#loc395 = loc("tmp76"(#loc167)) +#loc396 = loc("tmp78"(#loc168)) +#loc397 = loc("tmp80"(#loc169)) +#loc398 = loc("tmp81"(#loc170)) +#loc399 = loc("tmp82"(#loc171)) +#loc400 = loc("tmp83"(#loc172)) +#loc401 = loc("tmp83"(#loc173)) +#loc402 = loc("tmp83"(#loc174)) +#loc403 = loc("tmp83"(#loc175)) +#loc404 = loc("tmp83"(#loc176)) +#loc405 = loc("tmp83"(#loc177)) +#loc406 = loc("tmp83"(#loc178)) +#loc407 = loc("tmp83"(#loc179)) +#loc408 = loc("tmp83"(#loc180)) +#loc409 = loc("tmp83"(#loc181)) +#loc410 = loc("tmp85"(#loc182)) +#loc411 = loc("tmp86"(#loc183)) +#loc412 = loc("tmp87"(#loc184)) +#loc413 = loc("tmp88"(#loc185)) +#loc414 = loc("tmp89"(#loc186)) +#loc415 = loc("tmp89"(#loc187)) +#loc416 = loc("tmp89"(#loc188)) +#loc417 = loc("tmp89"(#loc189)) +#loc418 = loc("tmp89"(#loc190)) +#loc419 = loc("tmp89"(#loc191)) +#loc420 = loc("tmp91"(#loc192)) +#loc421 = loc("tmp93"(#loc193)) +#loc422 = loc("tmp94"(#loc194)) +#loc423 = loc("tmp95"(#loc195)) +#loc424 = loc("tmp98"(#loc196)) +#loc425 = loc("tmp99"(#loc197)) +#loc426 = loc("tmp100"(#loc198)) +#loc427 = loc("tmp101"(#loc199)) +#loc428 = loc("tmp104"(#loc200)) +#loc429 = loc("tmp107"(#loc201)) +#loc430 = loc("tmp109"(#loc202)) +#loc431 = loc("tmp110"(#loc203)) +#loc435 = loc("_tmp10"(#loc245)) diff --git a/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..f19f7ff2cbbf93184ad759700860ccc97d900c65 --- /dev/null +++ b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir @@ -0,0 +1,547 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 4], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc1 = loc(unknown) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc147 = loc("in_out_ptr0"(#loc)) +#loc148 = loc("in_out_ptr1"(#loc)) +#loc149 = loc("in_ptr0"(#loc)) +#loc150 = loc("in_ptr1"(#loc)) +#loc151 = loc("in_ptr2"(#loc)) +#loc152 = loc("in_ptr3"(#loc)) +#loc153 = loc("in_ptr4"(#loc)) +#loc154 = loc("xnumel"(#loc)) +#loc155 = loc("r0_numel"(#loc)) +#loc185 = loc("tmp4"(#loc33)) +#loc187 = loc("tmp10"(#loc36)) +#loc292 = loc(callsite(#loc1 at #loc185)) +#loc294 = loc(callsite(#loc1 at #loc187)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4097> : tensor<1x4xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<1> : tensor<1x4xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1x4xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<2> : tensor<1x4xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<36864> : tensor<64x1xi32, #blocked1> loc(#loc1) + %cst_4 = arith.constant dense<36864> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<128> : tensor<64x1xi32, #blocked1> loc(#loc1) + %cst_6 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<4096> : tensor<1x4xi32, #blocked> loc(#loc1) + %cst_8 = arith.constant dense<128> : tensor<1x4xi32, #blocked> loc(#loc1) + %cst_9 = arith.constant dense<32> : tensor<64x1xi32, #blocked1> loc(#loc1) + %cst_10 = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<64x4xbf16, #blocked> loc(#loc1) + %cst_12 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked1> loc(#loc1) + %cst_13 = arith.constant dense<128> : tensor<1x4xi32, #blocked1> loc(#loc1) + %cst_14 = arith.constant dense<4096> : tensor<1x4xi32, #blocked1> loc(#loc1) + %cst_15 = arith.constant dense<0.000000e+00> : tensor<1x4xbf16, #blocked1> loc(#loc1) + %cst_16 = arith.constant dense<0.000000e+00> : tensor<64x4xbf16, #blocked1> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_17 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32, #blocked1> loc(#loc1) + %cst_18 = arith.constant dense<1.280000e+02> : tensor<64x1xf32, #blocked1> loc(#loc1) + %cst_19 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc156) + %xoffset_20 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc157) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc158) + %xindex_21 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc158) + %xindex_22 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc158) + %xindex_23 = tt.expand_dims %xindex_21 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc158) + %xindex_24 = tt.splat %xoffset_20 : i32 -> tensor<64x1xi32, #blocked> loc(#loc159) + %xindex_25 = tt.splat %xoffset_20 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc159) + %xindex_26 = arith.addi %xindex_24, %xindex_22 : tensor<64x1xi32, #blocked> loc(#loc159) + %xindex_27 = arith.addi %xindex_25, %xindex_23 : tensor<64x1xi32, #blocked1> loc(#loc159) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc160) + %r0_base_28 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc160) + %r0_base_29 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4xi32, #blocked> loc(#loc160) + %r0_base_30 = tt.expand_dims %r0_base_28 {axis = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x4xi32, #blocked1> loc(#loc160) + %x0 = arith.remsi %xindex_26, %cst_10 : tensor<64x1xi32, #blocked> loc(#loc161) + %x0_31 = arith.remsi %xindex_27, %cst_9 : tensor<64x1xi32, #blocked1> loc(#loc161) + %x1 = arith.divsi %xindex_26, %cst_10 : tensor<64x1xi32, #blocked> loc(#loc162) + %x1_32 = arith.divsi %xindex_27, %cst_9 : tensor<64x1xi32, #blocked1> loc(#loc162) + %tmp0 = arith.muli %x0_31, %cst_5 : tensor<64x1xi32, #blocked1> loc(#loc163) + %tmp0_33 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked1> -> tensor<64x4xi32, #blocked1> loc(#loc164) + %tmp0_34 = arith.muli %x1_32, %cst_3 : tensor<64x1xi32, #blocked1> loc(#loc165) + %tmp0_35 = tt.broadcast %tmp0_34 : tensor<64x1xi32, #blocked1> -> tensor<64x4xi32, #blocked1> loc(#loc166) + %tmp0_36 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr, #blocked1> loc(#loc167) + %_tmp10:2 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32 iter_args(%_tmp4 = %cst_12, %_tmp10_51 = %cst_12) -> (tensor<64x4xf32, #blocked1>, tensor<64x4xf32, #blocked1>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32, #blocked1> loc(#loc169) + %r0_index_52 = arith.addi %r0_index, %r0_base_30 : tensor<1x4xi32, #blocked1> loc(#loc169) + %r0_mask = arith.cmpi slt, %r0_index_52, %cst_13 : tensor<1x4xi32, #blocked1> loc(#loc170) + %tmp0_53 = arith.addi %r0_index_52, %cst_14 : tensor<1x4xi32, #blocked1> loc(#loc171) + %tmp0_54 = tt.broadcast %tmp0_53 : tensor<1x4xi32, #blocked1> -> tensor<64x4xi32, #blocked1> loc(#loc164) + %tmp0_55 = arith.addi %tmp0_54, %tmp0_33 : tensor<64x4xi32, #blocked1> loc(#loc164) + %tmp0_56 = arith.addi %tmp0_55, %tmp0_35 : tensor<64x4xi32, #blocked1> loc(#loc166) + %tmp0_57 = tt.addptr %tmp0_36, %tmp0_56 : tensor<64x4x!tt.ptr, #blocked1>, tensor<64x4xi32, #blocked1> loc(#loc167) + %tmp0_58 = tt.broadcast %r0_mask : tensor<1x4xi1, #blocked1> -> tensor<64x4xi1, #blocked1> loc(#loc172) + %tmp0_59 = tt.load %tmp0_57, %tmp0_58, %cst_16 evictionPolicy = evict_last : tensor<64x4x!tt.ptr, #blocked1> loc(#loc172) + %tmp0_60 = arith.extf %tmp0_59 : tensor<64x4xbf16, #blocked1> to tensor<64x4xf32, #blocked1> loc(#loc173) + %tmp6 = tt.broadcast %r0_index_52 : tensor<1x4xi32, #blocked1> -> tensor<64x4xi32, #blocked1> loc(#loc174) + %tmp6_61 = arith.addi %tmp6, %tmp0_33 : tensor<64x4xi32, #blocked1> loc(#loc174) + %tmp6_62 = arith.addi %tmp6_61, %tmp0_35 : tensor<64x4xi32, #blocked1> loc(#loc175) + %tmp6_63 = tt.addptr %tmp0_36, %tmp6_62 : tensor<64x4x!tt.ptr, #blocked1>, tensor<64x4xi32, #blocked1> loc(#loc176) + %tmp6_64 = tt.load %tmp6_63, %tmp0_58, %cst_16 evictionPolicy = evict_last : tensor<64x4x!tt.ptr, #blocked1> loc(#loc177) + %tmp6_65 = arith.extf %tmp6_64 : tensor<64x4xbf16, #blocked1> to tensor<64x4xf32, #blocked1> loc(#loc178) + %tmp2 = arith.mulf %tmp0_60, %tmp0_60 : tensor<64x4xf32, #blocked1> loc(#loc179) + %tmp5 = arith.addf %_tmp4, %tmp2 : tensor<64x4xf32, #blocked1> loc(#loc180) + %_tmp4_66 = arith.select %tmp0_58, %tmp5, %_tmp4 : tensor<64x4xi1, #blocked1>, tensor<64x4xf32, #blocked1> loc(#loc181) + %tmp8 = arith.mulf %tmp6_65, %tmp6_65 : tensor<64x4xf32, #blocked1> loc(#loc182) + %tmp11 = arith.addf %_tmp10_51, %tmp8 : tensor<64x4xf32, #blocked1> loc(#loc183) + %_tmp10_67 = arith.select %tmp0_58, %tmp11, %_tmp10_51 : tensor<64x4xi1, #blocked1>, tensor<64x4xf32, #blocked1> loc(#loc184) + scf.yield %_tmp4_66, %_tmp10_67 : tensor<64x4xf32, #blocked1>, tensor<64x4xf32, #blocked1> loc(#loc31) + } loc(#loc290) + %tmp4 = "tt.reduce"(%_tmp10#0) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_51: f32 loc(callsite(#loc1 at #loc185)), %tmp4_52: f32 loc(callsite(#loc1 at #loc185))): + %tmp4_53 = arith.addf %tmp4_51, %tmp4_52 : f32 loc(#loc297) + tt.reduce.return %tmp4_53 : f32 loc(#loc291) + }) : (tensor<64x4xf32, #blocked1>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc291) + %tmp4_37 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc186) + %tmp10 = "tt.reduce"(%_tmp10#1) <{axis = 1 : i32}> ({ + ^bb0(%tmp10_51: f32 loc(callsite(#loc1 at #loc187)), %tmp10_52: f32 loc(callsite(#loc1 at #loc187))): + %tmp10_53 = arith.addf %tmp10_51, %tmp10_52 : f32 loc(#loc298) + tt.reduce.return %tmp10_53 : f32 loc(#loc293) + }) : (tensor<64x4xf32, #blocked1>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc293) + %tmp10_38 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc188) + %tmp50 = arith.muli %x0, %cst_6 : tensor<64x1xi32, #blocked> loc(#loc189) + %tmp50_39 = tt.broadcast %tmp50 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc190) + %tmp50_40 = arith.muli %x1, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc191) + %tmp50_41 = tt.broadcast %tmp50_40 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc192) + %tmp50_42 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr, #blocked> loc(#loc193) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x4x!tt.ptr, #blocked> loc(#loc194) + %tmp58_43 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x4x!tt.ptr, #blocked1> loc(#loc194) + %tmp63 = arith.muli %x1_32, %cst_5 : tensor<64x1xi32, #blocked1> loc(#loc195) + %tmp63_44 = tt.broadcast %tmp63 : tensor<64x1xi32, #blocked1> -> tensor<64x4xi32, #blocked1> loc(#loc196) + %tmp63_45 = tt.splat %in_ptr2 : !tt.ptr -> tensor<64x4x!tt.ptr, #blocked1> loc(#loc197) + %tmp66 = tt.splat %in_ptr3 : !tt.ptr -> tensor<64x4x!tt.ptr, #blocked1> loc(#loc198) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x4x!tt.ptr, #blocked> loc(#loc199) + %tmp102_46 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x4x!tt.ptr, #blocked1> loc(#loc199) + %tmp20 = arith.divf %tmp10_38, %cst_18 : tensor<64x1xf32, #blocked1> loc(#loc200) + %tmp22 = arith.addf %tmp20, %cst_17 : tensor<64x1xf32, #blocked1> loc(#loc201) + %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked1>) -> tensor<64x1xf32, #blocked1> loc(#loc202) + %tmp24 = ttg.convert_layout %tmp23 : tensor<64x1xf32, #blocked1> -> tensor<64x1xf32, #blocked> loc(#loc203) + %tmp24_47 = tt.broadcast %tmp24 : tensor<64x1xf32, #blocked> -> tensor<64x4xf32, #blocked> loc(#loc203) + %tmp24_48 = tt.broadcast %tmp23 : tensor<64x1xf32, #blocked1> -> tensor<64x4xf32, #blocked1> loc(#loc203) + %tmp72 = arith.divf %tmp4_37, %cst_18 : tensor<64x1xf32, #blocked1> loc(#loc204) + %tmp73 = arith.addf %tmp72, %cst_17 : tensor<64x1xf32, #blocked1> loc(#loc205) + %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked1>) -> tensor<64x1xf32, #blocked1> loc(#loc206) + %tmp75 = ttg.convert_layout %tmp74 : tensor<64x1xf32, #blocked1> -> tensor<64x1xf32, #blocked> loc(#loc207) + %tmp75_49 = tt.broadcast %tmp75 : tensor<64x1xf32, #blocked> -> tensor<64x4xf32, #blocked> loc(#loc207) + %tmp75_50 = tt.broadcast %tmp74 : tensor<64x1xf32, #blocked1> -> tensor<64x4xf32, #blocked1> loc(#loc207) + %0 = arith.muli %xindex_27, %cst_5 : tensor<64x1xi32, #blocked1> loc(#loc57) + %1 = tt.broadcast %0 : tensor<64x1xi32, #blocked1> -> tensor<64x4xi32, #blocked1> loc(#loc58) + %2 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr, #blocked1> loc(#loc59) + %3 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<64x4x!tt.ptr, #blocked1> loc(#loc60) + scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32, #blocked> loc(#loc208) + %r0_index_51 = tt.splat %r0_offset : i32 -> tensor<1x4xi32, #blocked1> loc(#loc208) + %r0_index_52 = arith.addi %r0_index, %r0_base_29 : tensor<1x4xi32, #blocked> loc(#loc208) + %r0_index_53 = arith.addi %r0_index_51, %r0_base_30 : tensor<1x4xi32, #blocked1> loc(#loc208) + %r0_mask = arith.cmpi slt, %r0_index_52, %cst_8 : tensor<1x4xi32, #blocked> loc(#loc209) + %r0_mask_54 = arith.cmpi slt, %r0_index_53, %cst_13 : tensor<1x4xi32, #blocked1> loc(#loc209) + %r0_3 = arith.remsi %r0_index_52, %cst_2 : tensor<1x4xi32, #blocked> loc(#loc210) + %r0_4 = arith.divsi %r0_index_52, %cst_2 : tensor<1x4xi32, #blocked> loc(#loc211) + %tmp50_55 = tt.broadcast %r0_index_53 : tensor<1x4xi32, #blocked1> -> tensor<64x4xi32, #blocked1> loc(#loc190) + %tmp50_56 = arith.addi %tmp50_55, %tmp0_33 : tensor<64x4xi32, #blocked1> loc(#loc190) + %tmp50_57 = arith.addi %tmp50_56, %tmp0_35 : tensor<64x4xi32, #blocked1> loc(#loc192) + %tmp50_58 = tt.addptr %tmp0_36, %tmp50_57 : tensor<64x4x!tt.ptr, #blocked1>, tensor<64x4xi32, #blocked1> loc(#loc193) + %tmp50_59 = tt.broadcast %r0_mask_54 : tensor<1x4xi1, #blocked1> -> tensor<64x4xi1, #blocked1> loc(#loc212) + %tmp50_60 = tt.load %tmp50_58, %tmp50_59, %cst_16 evictionPolicy = evict_last : tensor<64x4x!tt.ptr, #blocked1> loc(#loc212) + %tmp50_61 = arith.extf %tmp50_60 : tensor<64x4xbf16, #blocked1> to tensor<64x4xf32, #blocked1> loc(#loc213) + %tmp58_62 = tt.addptr %tmp58_43, %r0_index_53 : tensor<1x4x!tt.ptr, #blocked1>, tensor<1x4xi32, #blocked1> loc(#loc194) + %tmp58_63 = tt.load %tmp58_62, %r0_mask_54, %cst_15 evictionPolicy = evict_last : tensor<1x4x!tt.ptr, #blocked1> loc(#loc214) + %tmp58_64 = arith.extf %tmp58_63 : tensor<1x4xbf16, #blocked1> to tensor<1x4xf32, #blocked1> loc(#loc215) + %tmp63_65 = arith.addi %tmp50_55, %tmp63_44 : tensor<64x4xi32, #blocked1> loc(#loc196) + %tmp63_66 = tt.addptr %tmp63_45, %tmp63_65 : tensor<64x4x!tt.ptr, #blocked1>, tensor<64x4xi32, #blocked1> loc(#loc197) + %tmp63_67 = tt.load %tmp63_66, %tmp50_59, %cst_12 evictionPolicy = evict_last : tensor<64x4x!tt.ptr, #blocked1> loc(#loc216) + %tmp66_68 = tt.addptr %tmp66, %tmp63_65 : tensor<64x4x!tt.ptr, #blocked1>, tensor<64x4xi32, #blocked1> loc(#loc198) + %tmp66_69 = tt.load %tmp66_68, %tmp50_59, %cst_12 evictionPolicy = evict_last : tensor<64x4x!tt.ptr, #blocked1> loc(#loc217) + %tmp66_70 = ttg.convert_layout %tmp66_69 : tensor<64x4xf32, #blocked1> -> tensor<64x4xf32, #blocked> loc(#loc217) + %tmp96 = arith.addi %r0_index_53, %cst_14 : tensor<1x4xi32, #blocked1> loc(#loc218) + %tmp96_71 = tt.broadcast %tmp96 : tensor<1x4xi32, #blocked1> -> tensor<64x4xi32, #blocked1> loc(#loc219) + %tmp96_72 = arith.addi %tmp96_71, %tmp0_33 : tensor<64x4xi32, #blocked1> loc(#loc219) + %tmp96_73 = arith.addi %tmp96_72, %tmp0_35 : tensor<64x4xi32, #blocked1> loc(#loc220) + %tmp96_74 = tt.addptr %tmp0_36, %tmp96_73 : tensor<64x4x!tt.ptr, #blocked1>, tensor<64x4xi32, #blocked1> loc(#loc221) + %tmp96_75 = tt.load %tmp96_74, %tmp50_59, %cst_16 evictionPolicy = evict_first : tensor<64x4x!tt.ptr, #blocked1> loc(#loc222) + %tmp96_76 = arith.extf %tmp96_75 : tensor<64x4xbf16, #blocked1> to tensor<64x4xf32, #blocked1> loc(#loc223) + %tmp102_77 = tt.addptr %tmp102_46, %r0_index_53 : tensor<1x4x!tt.ptr, #blocked1>, tensor<1x4xi32, #blocked1> loc(#loc199) + %tmp102_78 = tt.load %tmp102_77, %r0_mask_54, %cst_15 evictionPolicy = evict_last : tensor<1x4x!tt.ptr, #blocked1> loc(#loc224) + %tmp102_79 = arith.extf %tmp102_78 : tensor<1x4xbf16, #blocked1> to tensor<1x4xf32, #blocked1> loc(#loc225) + %tmp16 = arith.extsi %r0_3 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked> loc(#loc226) + %tmp16_80 = arith.cmpi slt, %tmp16, %cst_1 : tensor<1x4xi64, #blocked> loc(#loc226) + %tmp17 = arith.muli %r0_4, %cst_2 : tensor<1x4xi32, #blocked> loc(#loc227) + %tmp17_81 = arith.addi %tmp17, %cst_0 : tensor<1x4xi32, #blocked> loc(#loc228) + %tmp17_82 = tt.broadcast %tmp17_81 : tensor<1x4xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc229) + %tmp17_83 = arith.addi %tmp17_82, %tmp50_39 : tensor<64x4xi32, #blocked> loc(#loc229) + %tmp17_84 = arith.addi %tmp17_83, %tmp50_41 : tensor<64x4xi32, #blocked> loc(#loc230) + %tmp17_85 = tt.addptr %tmp50_42, %tmp17_84 : tensor<64x4x!tt.ptr, #blocked>, tensor<64x4xi32, #blocked> loc(#loc231) + %tmp17_86 = arith.andi %r0_mask, %tmp16_80 : tensor<1x4xi1, #blocked> loc(#loc232) + %tmp17_87 = tt.broadcast %tmp17_86 : tensor<1x4xi1, #blocked> -> tensor<64x4xi1, #blocked> loc(#loc233) + %tmp17_88 = tt.load %tmp17_85, %tmp17_87, %cst_11 evictionPolicy = evict_last : tensor<64x4x!tt.ptr, #blocked> loc(#loc233) + %tmp17_89 = arith.extf %tmp17_88 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc234) + %tmp24_90 = arith.mulf %tmp17_89, %tmp24_47 : tensor<64x4xf32, #blocked> loc(#loc203) + %tmp25 = tt.addptr %tmp58, %tmp17_81 : tensor<1x4x!tt.ptr, #blocked>, tensor<1x4xi32, #blocked> loc(#loc235) + %tmp25_91 = tt.broadcast %tmp25 : tensor<1x4x!tt.ptr, #blocked> -> tensor<64x4x!tt.ptr, #blocked> loc(#loc235) + %tmp25_92 = tt.load %tmp25_91, %tmp17_87, %cst_11 evictionPolicy = evict_last : tensor<64x4x!tt.ptr, #blocked> loc(#loc236) + %tmp25_93 = arith.extf %tmp25_92 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc237) + %tmp27 = arith.mulf %tmp24_90, %tmp25_93 : tensor<64x4xf32, #blocked> loc(#loc238) + %tmp29 = arith.subf %cst_19, %tmp27 : tensor<64x4xf32, #blocked> loc(#loc239) + %tmp31 = tt.broadcast %tmp16_80 : tensor<1x4xi1, #blocked> -> tensor<64x4xi1, #blocked> loc(#loc240) + %tmp32 = arith.cmpi sge, %tmp16, %cst_1 : tensor<1x4xi64, #blocked> loc(#loc241) + %tmp35 = tt.broadcast %tmp17 : tensor<1x4xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc242) + %tmp35_94 = arith.addi %tmp35, %tmp50_39 : tensor<64x4xi32, #blocked> loc(#loc242) + %tmp35_95 = arith.addi %tmp35_94, %tmp50_41 : tensor<64x4xi32, #blocked> loc(#loc243) + %tmp35_96 = tt.addptr %tmp50_42, %tmp35_95 : tensor<64x4x!tt.ptr, #blocked>, tensor<64x4xi32, #blocked> loc(#loc244) + %tmp35_97 = arith.andi %r0_mask, %tmp32 : tensor<1x4xi1, #blocked> loc(#loc245) + %tmp35_98 = tt.broadcast %tmp35_97 : tensor<1x4xi1, #blocked> -> tensor<64x4xi1, #blocked> loc(#loc246) + %tmp35_99 = tt.load %tmp35_96, %tmp35_98, %cst_11 evictionPolicy = evict_last : tensor<64x4x!tt.ptr, #blocked> loc(#loc246) + %tmp35_100 = arith.extf %tmp35_99 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc247) + %tmp42 = arith.mulf %tmp35_100, %tmp24_47 : tensor<64x4xf32, #blocked> loc(#loc248) + %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x4x!tt.ptr, #blocked>, tensor<1x4xi32, #blocked> loc(#loc249) + %tmp43_101 = tt.broadcast %tmp43 : tensor<1x4x!tt.ptr, #blocked> -> tensor<64x4x!tt.ptr, #blocked> loc(#loc249) + %tmp43_102 = tt.load %tmp43_101, %tmp35_98, %cst_11 evictionPolicy = evict_last : tensor<64x4x!tt.ptr, #blocked> loc(#loc250) + %tmp43_103 = arith.extf %tmp43_102 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc251) + %tmp45 = arith.mulf %tmp42, %tmp43_103 : tensor<64x4xf32, #blocked> loc(#loc252) + %tmp48 = tt.broadcast %tmp32 : tensor<1x4xi1, #blocked> -> tensor<64x4xi1, #blocked> loc(#loc253) + %tmp48_104 = arith.select %tmp48, %tmp45, %cst_19 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> loc(#loc253) + %tmp49 = arith.select %tmp31, %tmp29, %tmp48_104 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> loc(#loc295) + %tmp57 = arith.mulf %tmp50_61, %tmp24_48 : tensor<64x4xf32, #blocked1> loc(#loc255) + %tmp60 = tt.broadcast %tmp58_64 : tensor<1x4xf32, #blocked1> -> tensor<64x4xf32, #blocked1> loc(#loc256) + %tmp60_105 = arith.mulf %tmp57, %tmp60 : tensor<64x4xf32, #blocked1> loc(#loc256) + %tmp64 = arith.mulf %tmp60_105, %tmp63_67 : tensor<64x4xf32, #blocked1> loc(#loc257) + %tmp64_106 = ttg.convert_layout %tmp64 : tensor<64x4xf32, #blocked1> -> tensor<64x4xf32, #blocked> loc(#loc257) + %tmp67 = arith.mulf %tmp49, %tmp66_70 : tensor<64x4xf32, #blocked> loc(#loc258) + %tmp68 = arith.addf %tmp64_106, %tmp67 : tensor<64x4xf32, #blocked> loc(#loc259) + %tmp70 = arith.addi %tmp17, %cst : tensor<1x4xi32, #blocked> loc(#loc260) + %tmp70_107 = tt.broadcast %tmp70 : tensor<1x4xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc261) + %tmp70_108 = arith.addi %tmp70_107, %tmp50_39 : tensor<64x4xi32, #blocked> loc(#loc261) + %tmp70_109 = arith.addi %tmp70_108, %tmp50_41 : tensor<64x4xi32, #blocked> loc(#loc262) + %tmp70_110 = tt.addptr %tmp50_42, %tmp70_109 : tensor<64x4x!tt.ptr, #blocked>, tensor<64x4xi32, #blocked> loc(#loc263) + %tmp70_111 = tt.load %tmp70_110, %tmp17_87, %cst_11 evictionPolicy = evict_last : tensor<64x4x!tt.ptr, #blocked> loc(#loc264) + %tmp70_112 = arith.extf %tmp70_111 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc265) + %tmp75_113 = arith.mulf %tmp70_112, %tmp75_49 : tensor<64x4xf32, #blocked> loc(#loc207) + %tmp76 = tt.addptr %tmp102, %tmp17_81 : tensor<1x4x!tt.ptr, #blocked>, tensor<1x4xi32, #blocked> loc(#loc266) + %tmp76_114 = tt.broadcast %tmp76 : tensor<1x4x!tt.ptr, #blocked> -> tensor<64x4x!tt.ptr, #blocked> loc(#loc266) + %tmp76_115 = tt.load %tmp76_114, %tmp17_87, %cst_11 evictionPolicy = evict_last : tensor<64x4x!tt.ptr, #blocked> loc(#loc267) + %tmp76_116 = arith.extf %tmp76_115 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc268) + %tmp78 = arith.mulf %tmp75_113, %tmp76_116 : tensor<64x4xf32, #blocked> loc(#loc269) + %tmp80 = arith.subf %cst_19, %tmp78 : tensor<64x4xf32, #blocked> loc(#loc270) + %tmp83 = arith.addi %tmp17, %cst_7 : tensor<1x4xi32, #blocked> loc(#loc271) + %tmp83_117 = tt.broadcast %tmp83 : tensor<1x4xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc272) + %tmp83_118 = arith.addi %tmp83_117, %tmp50_39 : tensor<64x4xi32, #blocked> loc(#loc272) + %tmp83_119 = arith.addi %tmp83_118, %tmp50_41 : tensor<64x4xi32, #blocked> loc(#loc273) + %tmp83_120 = tt.addptr %tmp50_42, %tmp83_119 : tensor<64x4x!tt.ptr, #blocked>, tensor<64x4xi32, #blocked> loc(#loc274) + %tmp83_121 = tt.load %tmp83_120, %tmp35_98, %cst_11 evictionPolicy = evict_last : tensor<64x4x!tt.ptr, #blocked> loc(#loc275) + %tmp83_122 = arith.extf %tmp83_121 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc276) + %tmp88 = arith.mulf %tmp83_122, %tmp75_49 : tensor<64x4xf32, #blocked> loc(#loc277) + %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x4x!tt.ptr, #blocked>, tensor<1x4xi32, #blocked> loc(#loc278) + %tmp89_123 = tt.broadcast %tmp89 : tensor<1x4x!tt.ptr, #blocked> -> tensor<64x4x!tt.ptr, #blocked> loc(#loc278) + %tmp89_124 = tt.load %tmp89_123, %tmp35_98, %cst_11 evictionPolicy = evict_last : tensor<64x4x!tt.ptr, #blocked> loc(#loc279) + %tmp89_125 = arith.extf %tmp89_124 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc280) + %tmp91 = arith.mulf %tmp88, %tmp89_125 : tensor<64x4xf32, #blocked> loc(#loc281) + %tmp94 = arith.select %tmp48, %tmp91, %cst_19 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> loc(#loc282) + %tmp95 = arith.select %tmp31, %tmp80, %tmp94 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> loc(#loc296) + %tmp101 = arith.mulf %tmp96_76, %tmp75_50 : tensor<64x4xf32, #blocked1> loc(#loc285) + %tmp104 = tt.broadcast %tmp102_79 : tensor<1x4xf32, #blocked1> -> tensor<64x4xf32, #blocked1> loc(#loc286) + %tmp104_126 = arith.mulf %tmp101, %tmp104 : tensor<64x4xf32, #blocked1> loc(#loc286) + %tmp107 = arith.mulf %tmp104_126, %tmp63_67 : tensor<64x4xf32, #blocked1> loc(#loc287) + %tmp107_127 = ttg.convert_layout %tmp107 : tensor<64x4xf32, #blocked1> -> tensor<64x4xf32, #blocked> loc(#loc287) + %tmp109 = arith.mulf %tmp95, %tmp66_70 : tensor<64x4xf32, #blocked> loc(#loc288) + %tmp110 = arith.addf %tmp107_127, %tmp109 : tensor<64x4xf32, #blocked> loc(#loc289) + %4 = arith.addi %tmp50_55, %1 : tensor<64x4xi32, #blocked1> loc(#loc58) + %5 = tt.addptr %2, %4 : tensor<64x4x!tt.ptr, #blocked1>, tensor<64x4xi32, #blocked1> loc(#loc59) + %6 = arith.truncf %tmp68 : tensor<64x4xf32, #blocked> to tensor<64x4xbf16, #blocked> loc(#loc144) + %7 = ttg.convert_layout %6 : tensor<64x4xbf16, #blocked> -> tensor<64x4xbf16, #blocked1> loc(#loc144) + tt.store %5, %7, %tmp50_59 : tensor<64x4x!tt.ptr, #blocked1> loc(#loc144) + %8 = tt.addptr %3, %4 : tensor<64x4x!tt.ptr, #blocked1>, tensor<64x4xi32, #blocked1> loc(#loc60) + %9 = arith.truncf %tmp110 : tensor<64x4xf32, #blocked> to tensor<64x4xbf16, #blocked> loc(#loc145) + %10 = ttg.convert_layout %9 : tensor<64x4xbf16, #blocked> -> tensor<64x4xbf16, #blocked1> loc(#loc145) + tt.store %8, %10, %tmp50_59 : tensor<64x4x!tt.ptr, #blocked1> loc(#loc145) + } loc(#loc61) + tt.return loc(#loc146) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8) +#loc32 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc156 = loc("xoffset"(#loc2)) +#loc157 = loc("xoffset"(#loc3)) +#loc158 = loc("xindex"(#loc4)) +#loc159 = loc("xindex"(#loc5)) +#loc160 = loc("r0_base"(#loc6)) +#loc161 = loc("x0"(#loc7)) +#loc162 = loc("x1"(#loc8)) +#loc163 = loc("tmp0"(#loc9)) +#loc164 = loc("tmp0"(#loc10)) +#loc165 = loc("tmp0"(#loc11)) +#loc166 = loc("tmp0"(#loc12)) +#loc167 = loc("tmp0"(#loc13)) +#loc168 = loc("_tmp4"(#loc14)) +#loc169 = loc("r0_index"(#loc15)) +#loc170 = loc("r0_mask"(#loc16)) +#loc171 = loc("tmp0"(#loc17)) +#loc172 = loc("tmp0"(#loc18)) +#loc173 = loc("tmp0"(#loc19)) +#loc174 = loc("tmp6"(#loc20)) +#loc175 = loc("tmp6"(#loc21)) +#loc176 = loc("tmp6"(#loc22)) +#loc177 = loc("tmp6"(#loc23)) +#loc178 = loc("tmp6"(#loc24)) +#loc179 = loc("tmp2"(#loc25)) +#loc180 = loc("tmp5"(#loc26)) +#loc181 = loc("_tmp4"(#loc27)) +#loc182 = loc("tmp8"(#loc28)) +#loc183 = loc("tmp11"(#loc29)) +#loc184 = loc("_tmp10"(#loc30)) +#loc186 = loc("tmp4"(#loc35)) +#loc188 = loc("tmp10"(#loc37)) +#loc189 = loc("tmp50"(#loc38)) +#loc190 = loc("tmp50"(#loc39)) +#loc191 = loc("tmp50"(#loc40)) +#loc192 = loc("tmp50"(#loc41)) +#loc193 = loc("tmp50"(#loc42)) +#loc194 = loc("tmp58"(#loc43)) +#loc195 = loc("tmp63"(#loc44)) +#loc196 = loc("tmp63"(#loc45)) +#loc197 = loc("tmp63"(#loc46)) +#loc198 = loc("tmp66"(#loc47)) +#loc199 = loc("tmp102"(#loc48)) +#loc200 = loc("tmp20"(#loc49)) +#loc201 = loc("tmp22"(#loc50)) +#loc202 = loc("tmp23"(#loc51)) +#loc203 = loc("tmp24"(#loc52)) +#loc204 = loc("tmp72"(#loc53)) +#loc205 = loc("tmp73"(#loc54)) +#loc206 = loc("tmp74"(#loc55)) +#loc207 = loc("tmp75"(#loc56)) +#loc208 = loc("r0_index"(#loc62)) +#loc209 = loc("r0_mask"(#loc63)) +#loc210 = loc("r0_3"(#loc64)) +#loc211 = loc("r0_4"(#loc65)) +#loc212 = loc("tmp50"(#loc66)) +#loc213 = loc("tmp50"(#loc67)) +#loc214 = loc("tmp58"(#loc68)) +#loc215 = loc("tmp58"(#loc69)) +#loc216 = loc("tmp63"(#loc70)) +#loc217 = loc("tmp66"(#loc71)) +#loc218 = loc("tmp96"(#loc72)) +#loc219 = loc("tmp96"(#loc73)) +#loc220 = loc("tmp96"(#loc74)) +#loc221 = loc("tmp96"(#loc75)) +#loc222 = loc("tmp96"(#loc76)) +#loc223 = loc("tmp96"(#loc77)) +#loc224 = loc("tmp102"(#loc78)) +#loc225 = loc("tmp102"(#loc79)) +#loc226 = loc("tmp16"(#loc80)) +#loc227 = loc("tmp17"(#loc81)) +#loc228 = loc("tmp17"(#loc82)) +#loc229 = loc("tmp17"(#loc83)) +#loc230 = loc("tmp17"(#loc84)) +#loc231 = loc("tmp17"(#loc85)) +#loc232 = loc("tmp17"(#loc86)) +#loc233 = loc("tmp17"(#loc87)) +#loc234 = loc("tmp17"(#loc88)) +#loc235 = loc("tmp25"(#loc89)) +#loc236 = loc("tmp25"(#loc90)) +#loc237 = loc("tmp25"(#loc91)) +#loc238 = loc("tmp27"(#loc92)) +#loc239 = loc("tmp29"(#loc93)) +#loc240 = loc("tmp31"(#loc94)) +#loc241 = loc("tmp32"(#loc95)) +#loc242 = loc("tmp35"(#loc96)) +#loc243 = loc("tmp35"(#loc97)) +#loc244 = loc("tmp35"(#loc98)) +#loc245 = loc("tmp35"(#loc99)) +#loc246 = loc("tmp35"(#loc100)) +#loc247 = loc("tmp35"(#loc101)) +#loc248 = loc("tmp42"(#loc102)) +#loc249 = loc("tmp43"(#loc103)) +#loc250 = loc("tmp43"(#loc104)) +#loc251 = loc("tmp43"(#loc105)) +#loc252 = loc("tmp45"(#loc106)) +#loc253 = loc("tmp48"(#loc107)) +#loc254 = loc("tmp49"(#loc108)) +#loc255 = loc("tmp57"(#loc109)) +#loc256 = loc("tmp60"(#loc110)) +#loc257 = loc("tmp64"(#loc111)) +#loc258 = loc("tmp67"(#loc112)) +#loc259 = loc("tmp68"(#loc113)) +#loc260 = loc("tmp70"(#loc114)) +#loc261 = loc("tmp70"(#loc115)) +#loc262 = loc("tmp70"(#loc116)) +#loc263 = loc("tmp70"(#loc117)) +#loc264 = loc("tmp70"(#loc118)) +#loc265 = loc("tmp70"(#loc119)) +#loc266 = loc("tmp76"(#loc120)) +#loc267 = loc("tmp76"(#loc121)) +#loc268 = loc("tmp76"(#loc122)) +#loc269 = loc("tmp78"(#loc123)) +#loc270 = loc("tmp80"(#loc124)) +#loc271 = loc("tmp83"(#loc125)) +#loc272 = loc("tmp83"(#loc126)) +#loc273 = loc("tmp83"(#loc127)) +#loc274 = loc("tmp83"(#loc128)) +#loc275 = loc("tmp83"(#loc129)) +#loc276 = loc("tmp83"(#loc130)) +#loc277 = loc("tmp88"(#loc131)) +#loc278 = loc("tmp89"(#loc132)) +#loc279 = loc("tmp89"(#loc133)) +#loc280 = loc("tmp89"(#loc134)) +#loc281 = loc("tmp91"(#loc135)) +#loc282 = loc("tmp94"(#loc136)) +#loc283 = loc("tmp95"(#loc137)) +#loc284 = loc("tmp82"(#loc138)) +#loc285 = loc("tmp101"(#loc139)) +#loc286 = loc("tmp104"(#loc140)) +#loc287 = loc("tmp107"(#loc141)) +#loc288 = loc("tmp109"(#loc142)) +#loc289 = loc("tmp110"(#loc143)) +#loc290 = loc("_tmp10"(#loc168)) +#loc291 = loc(callsite(#loc32 at #loc185)) +#loc293 = loc(callsite(#loc32 at #loc187)) +#loc295 = loc(fused[#loc254, #loc240]) +#loc296 = loc(fused[#loc283, #loc284]) +#loc297 = loc(callsite(#loc34 at #loc291)) +#loc298 = loc(callsite(#loc34 at #loc293)) diff --git a/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..998230c6cf24ff4e180b20c65181e154eb0a780b --- /dev/null +++ b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir @@ -0,0 +1,520 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc1 = loc(unknown) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc149 = loc("in_out_ptr0"(#loc)) +#loc150 = loc("in_out_ptr1"(#loc)) +#loc151 = loc("in_ptr0"(#loc)) +#loc152 = loc("in_ptr1"(#loc)) +#loc153 = loc("in_ptr2"(#loc)) +#loc154 = loc("in_ptr3"(#loc)) +#loc155 = loc("in_ptr4"(#loc)) +#loc156 = loc("xnumel"(#loc)) +#loc157 = loc("r0_numel"(#loc)) +#loc189 = loc("tmp4"(#loc35)) +#loc191 = loc("tmp10"(#loc38)) +#loc296 = loc(callsite(#loc1 at #loc189)) +#loc298 = loc(callsite(#loc1 at #loc191)) +module { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<1x4xbf16> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x4xbf16> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_1 = arith.constant dense<4097> : tensor<1x4xi32> loc(#loc1) + %cst_2 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc1) + %cst_3 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<1x4xi32> loc(#loc1) + %cst_5 = arith.constant dense<1> : tensor<1x4xi64> loc(#loc1) + %cst_6 = arith.constant dense<2> : tensor<1x4xi32> loc(#loc1) + %cst_7 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc1) + %cst_8 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1) + %cst_9 = arith.constant dense<4096> : tensor<1x4xi32> loc(#loc1) + %cst_10 = arith.constant dense<128> : tensor<1x4xi32> loc(#loc1) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc1) + %cst_12 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc158) + %xoffset_13 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc159) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc160) + %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc161) + %xindex_15 = tt.splat %xoffset_13 : i32 -> tensor<64x1xi32> loc(#loc162) + %xindex_16 = arith.addi %xindex_15, %xindex_14 : tensor<64x1xi32> loc(#loc162) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc163) + %r0_base_17 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc164) + %x0 = arith.remsi %xindex_16, %cst_12 : tensor<64x1xi32> loc(#loc165) + %x1 = arith.divsi %xindex_16, %cst_12 : tensor<64x1xi32> loc(#loc166) + %_tmp10:2 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32 iter_args(%_tmp4 = %cst_11, %_tmp10_20 = %cst_11) -> (tensor<64x4xf32>, tensor<64x4xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc168) + %r0_index_21 = arith.addi %r0_index, %r0_base_17 : tensor<1x4xi32> loc(#loc168) + %r0_mask = arith.cmpi slt, %r0_index_21, %cst_10 : tensor<1x4xi32> loc(#loc169) + %tmp0 = arith.addi %r0_index_21, %cst_9 : tensor<1x4xi32> loc(#loc170) + %tmp0_22 = arith.muli %x0, %cst_8 : tensor<64x1xi32> loc(#loc171) + %tmp0_23 = tt.broadcast %tmp0 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc172) + %tmp0_24 = tt.broadcast %tmp0_22 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc172) + %tmp0_25 = arith.addi %tmp0_23, %tmp0_24 : tensor<64x4xi32> loc(#loc172) + %tmp0_26 = arith.muli %x1, %cst_7 : tensor<64x1xi32> loc(#loc173) + %tmp0_27 = tt.broadcast %tmp0_26 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc174) + %tmp0_28 = arith.addi %tmp0_25, %tmp0_27 : tensor<64x4xi32> loc(#loc174) + %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc175) + %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc175) + %tmp0_31 = tt.broadcast %r0_mask : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc176) + %tmp0_32 = tt.load %tmp0_30, %tmp0_31, %cst_0 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc176) + %tmp0_33 = arith.extf %tmp0_32 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc177) + %tmp6 = tt.broadcast %r0_index_21 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc178) + %tmp6_34 = arith.addi %tmp6, %tmp0_24 : tensor<64x4xi32> loc(#loc178) + %tmp6_35 = arith.addi %tmp6_34, %tmp0_27 : tensor<64x4xi32> loc(#loc179) + %tmp6_36 = tt.addptr %tmp0_29, %tmp6_35 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc180) + %tmp6_37 = tt.load %tmp6_36, %tmp0_31, %cst_0 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc181) + %tmp6_38 = arith.extf %tmp6_37 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc182) + %tmp2 = arith.mulf %tmp0_33, %tmp0_33 : tensor<64x4xf32> loc(#loc183) + %tmp5 = arith.addf %_tmp4, %tmp2 : tensor<64x4xf32> loc(#loc184) + %_tmp4_39 = arith.select %tmp0_31, %tmp5, %_tmp4 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc185) + %tmp8 = arith.mulf %tmp6_38, %tmp6_38 : tensor<64x4xf32> loc(#loc186) + %tmp11 = arith.addf %_tmp10_20, %tmp8 : tensor<64x4xf32> loc(#loc187) + %_tmp10_40 = arith.select %tmp0_31, %tmp11, %_tmp10_20 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc188) + scf.yield %_tmp4_39, %_tmp10_40 : tensor<64x4xf32>, tensor<64x4xf32> loc(#loc33) + } loc(#loc294) + %tmp4 = "tt.reduce"(%_tmp10#0) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_20: f32 loc(callsite(#loc1 at #loc189)), %tmp4_21: f32 loc(callsite(#loc1 at #loc189))): + %tmp4_22 = arith.addf %tmp4_20, %tmp4_21 : f32 loc(#loc299) + tt.reduce.return %tmp4_22 : f32 loc(#loc295) + }) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc295) + %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc190) + %tmp10 = "tt.reduce"(%_tmp10#1) <{axis = 1 : i32}> ({ + ^bb0(%tmp10_20: f32 loc(callsite(#loc1 at #loc191)), %tmp10_21: f32 loc(callsite(#loc1 at #loc191))): + %tmp10_22 = arith.addf %tmp10_20, %tmp10_21 : f32 loc(#loc300) + tt.reduce.return %tmp10_22 : f32 loc(#loc297) + }) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc297) + %tmp10_19 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc192) + scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc193) + %r0_index_20 = arith.addi %r0_index, %r0_base_17 : tensor<1x4xi32> loc(#loc193) + %r0_mask = arith.cmpi slt, %r0_index_20, %cst_10 : tensor<1x4xi32> loc(#loc194) + %r0_3 = arith.remsi %r0_index_20, %cst_6 : tensor<1x4xi32> loc(#loc195) + %r0_4 = arith.divsi %r0_index_20, %cst_6 : tensor<1x4xi32> loc(#loc196) + %tmp50 = arith.muli %x0, %cst_8 : tensor<64x1xi32> loc(#loc197) + %tmp50_21 = tt.broadcast %r0_index_20 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc198) + %tmp50_22 = tt.broadcast %tmp50 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc198) + %tmp50_23 = arith.addi %tmp50_21, %tmp50_22 : tensor<64x4xi32> loc(#loc198) + %tmp50_24 = arith.muli %x1, %cst_7 : tensor<64x1xi32> loc(#loc199) + %tmp50_25 = tt.broadcast %tmp50_24 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc200) + %tmp50_26 = arith.addi %tmp50_23, %tmp50_25 : tensor<64x4xi32> loc(#loc200) + %tmp50_27 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc201) + %tmp50_28 = tt.addptr %tmp50_27, %tmp50_26 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc201) + %tmp50_29 = tt.broadcast %r0_mask : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc202) + %tmp50_30 = tt.load %tmp50_28, %tmp50_29, %cst_0 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc202) + %tmp50_31 = arith.extf %tmp50_30 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc203) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x4x!tt.ptr> loc(#loc204) + %tmp58_32 = tt.addptr %tmp58, %r0_index_20 : tensor<1x4x!tt.ptr>, tensor<1x4xi32> loc(#loc204) + %tmp58_33 = tt.load %tmp58_32, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x4x!tt.ptr> loc(#loc205) + %tmp58_34 = arith.extf %tmp58_33 : tensor<1x4xbf16> to tensor<1x4xf32> loc(#loc206) + %tmp63 = arith.muli %x1, %cst_8 : tensor<64x1xi32> loc(#loc207) + %tmp63_35 = tt.broadcast %tmp63 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc208) + %tmp63_36 = arith.addi %tmp50_21, %tmp63_35 : tensor<64x4xi32> loc(#loc208) + %tmp63_37 = tt.splat %in_ptr2 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc209) + %tmp63_38 = tt.addptr %tmp63_37, %tmp63_36 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc209) + %tmp63_39 = tt.load %tmp63_38, %tmp50_29, %cst_11 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc210) + %tmp66 = tt.splat %in_ptr3 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc211) + %tmp66_40 = tt.addptr %tmp66, %tmp63_36 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc211) + %tmp66_41 = tt.load %tmp66_40, %tmp50_29, %cst_11 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc212) + %tmp96 = arith.addi %r0_index_20, %cst_9 : tensor<1x4xi32> loc(#loc213) + %tmp96_42 = tt.broadcast %tmp96 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc214) + %tmp96_43 = arith.addi %tmp96_42, %tmp50_22 : tensor<64x4xi32> loc(#loc214) + %tmp96_44 = arith.addi %tmp96_43, %tmp50_25 : tensor<64x4xi32> loc(#loc215) + %tmp96_45 = tt.addptr %tmp50_27, %tmp96_44 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc216) + %tmp96_46 = tt.load %tmp96_45, %tmp50_29, %cst_0 evictionPolicy = evict_first : tensor<64x4x!tt.ptr> loc(#loc217) + %tmp96_47 = arith.extf %tmp96_46 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc218) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x4x!tt.ptr> loc(#loc219) + %tmp102_48 = tt.addptr %tmp102, %r0_index_20 : tensor<1x4x!tt.ptr>, tensor<1x4xi32> loc(#loc219) + %tmp102_49 = tt.load %tmp102_48, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x4x!tt.ptr> loc(#loc220) + %tmp102_50 = arith.extf %tmp102_49 : tensor<1x4xbf16> to tensor<1x4xf32> loc(#loc221) + %tmp16 = arith.extsi %r0_3 : tensor<1x4xi32> to tensor<1x4xi64> loc(#loc222) + %tmp16_51 = arith.cmpi slt, %tmp16, %cst_5 : tensor<1x4xi64> loc(#loc222) + %tmp17 = arith.muli %r0_4, %cst_6 : tensor<1x4xi32> loc(#loc223) + %tmp17_52 = arith.addi %tmp17, %cst_4 : tensor<1x4xi32> loc(#loc224) + %tmp17_53 = tt.broadcast %tmp17_52 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc225) + %tmp17_54 = arith.addi %tmp17_53, %tmp50_22 : tensor<64x4xi32> loc(#loc225) + %tmp17_55 = arith.addi %tmp17_54, %tmp50_25 : tensor<64x4xi32> loc(#loc226) + %tmp17_56 = tt.addptr %tmp50_27, %tmp17_55 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc227) + %tmp17_57 = arith.andi %r0_mask, %tmp16_51 : tensor<1x4xi1> loc(#loc228) + %tmp17_58 = tt.broadcast %tmp17_57 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc229) + %tmp17_59 = tt.load %tmp17_56, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc229) + %tmp17_60 = arith.extf %tmp17_59 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc230) + %tmp20 = arith.divf %tmp10_19, %cst_3 : tensor<64x1xf32> loc(#loc231) + %tmp22 = arith.addf %tmp20, %cst_2 : tensor<64x1xf32> loc(#loc232) + %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc233) + %tmp24 = tt.broadcast %tmp23 : tensor<64x1xf32> -> tensor<64x4xf32> loc(#loc234) + %tmp24_61 = arith.mulf %tmp17_60, %tmp24 : tensor<64x4xf32> loc(#loc234) + %tmp25 = tt.addptr %tmp58, %tmp17_52 : tensor<1x4x!tt.ptr>, tensor<1x4xi32> loc(#loc235) + %tmp25_62 = tt.broadcast %tmp25 : tensor<1x4x!tt.ptr> -> tensor<64x4x!tt.ptr> loc(#loc235) + %tmp25_63 = tt.load %tmp25_62, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc236) + %tmp25_64 = arith.extf %tmp25_63 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc237) + %tmp27 = arith.mulf %tmp24_61, %tmp25_64 : tensor<64x4xf32> loc(#loc238) + %tmp29 = arith.subf %cst_11, %tmp27 : tensor<64x4xf32> loc(#loc239) + %tmp31 = tt.broadcast %tmp16_51 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc240) + %tmp31_65 = arith.select %tmp31, %tmp29, %cst_11 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc240) + %tmp32 = arith.cmpi sge, %tmp16, %cst_5 : tensor<1x4xi64> loc(#loc241) + %tmp35 = tt.broadcast %tmp17 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc242) + %tmp35_66 = arith.addi %tmp35, %tmp50_22 : tensor<64x4xi32> loc(#loc242) + %tmp35_67 = arith.addi %tmp35_66, %tmp50_25 : tensor<64x4xi32> loc(#loc243) + %tmp35_68 = tt.addptr %tmp50_27, %tmp35_67 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc244) + %tmp35_69 = arith.andi %r0_mask, %tmp32 : tensor<1x4xi1> loc(#loc245) + %tmp35_70 = tt.broadcast %tmp35_69 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc246) + %tmp35_71 = tt.load %tmp35_68, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc246) + %tmp35_72 = arith.extf %tmp35_71 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc247) + %tmp42 = arith.mulf %tmp35_72, %tmp24 : tensor<64x4xf32> loc(#loc248) + %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x4x!tt.ptr>, tensor<1x4xi32> loc(#loc249) + %tmp43_73 = tt.broadcast %tmp43 : tensor<1x4x!tt.ptr> -> tensor<64x4x!tt.ptr> loc(#loc249) + %tmp43_74 = tt.load %tmp43_73, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc250) + %tmp43_75 = arith.extf %tmp43_74 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc251) + %tmp45 = arith.mulf %tmp42, %tmp43_75 : tensor<64x4xf32> loc(#loc252) + %tmp48 = tt.broadcast %tmp32 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc253) + %tmp48_76 = arith.select %tmp48, %tmp45, %cst_11 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc253) + %tmp49 = arith.select %tmp31, %tmp31_65, %tmp48_76 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc254) + %tmp57 = arith.mulf %tmp50_31, %tmp24 : tensor<64x4xf32> loc(#loc255) + %tmp60 = tt.broadcast %tmp58_34 : tensor<1x4xf32> -> tensor<64x4xf32> loc(#loc256) + %tmp60_77 = arith.mulf %tmp57, %tmp60 : tensor<64x4xf32> loc(#loc256) + %tmp64 = arith.mulf %tmp60_77, %tmp63_39 : tensor<64x4xf32> loc(#loc257) + %tmp67 = arith.mulf %tmp49, %tmp66_41 : tensor<64x4xf32> loc(#loc258) + %tmp68 = arith.addf %tmp64, %tmp67 : tensor<64x4xf32> loc(#loc259) + %tmp70 = arith.addi %tmp17, %cst_1 : tensor<1x4xi32> loc(#loc260) + %tmp70_78 = tt.broadcast %tmp70 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc261) + %tmp70_79 = arith.addi %tmp70_78, %tmp50_22 : tensor<64x4xi32> loc(#loc261) + %tmp70_80 = arith.addi %tmp70_79, %tmp50_25 : tensor<64x4xi32> loc(#loc262) + %tmp70_81 = tt.addptr %tmp50_27, %tmp70_80 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc263) + %tmp70_82 = tt.load %tmp70_81, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc264) + %tmp70_83 = arith.extf %tmp70_82 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc265) + %tmp72 = arith.divf %tmp4_18, %cst_3 : tensor<64x1xf32> loc(#loc266) + %tmp73 = arith.addf %tmp72, %cst_2 : tensor<64x1xf32> loc(#loc267) + %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc268) + %tmp75 = tt.broadcast %tmp74 : tensor<64x1xf32> -> tensor<64x4xf32> loc(#loc269) + %tmp75_84 = arith.mulf %tmp70_83, %tmp75 : tensor<64x4xf32> loc(#loc269) + %tmp76 = tt.addptr %tmp102, %tmp17_52 : tensor<1x4x!tt.ptr>, tensor<1x4xi32> loc(#loc270) + %tmp76_85 = tt.broadcast %tmp76 : tensor<1x4x!tt.ptr> -> tensor<64x4x!tt.ptr> loc(#loc270) + %tmp76_86 = tt.load %tmp76_85, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc271) + %tmp76_87 = arith.extf %tmp76_86 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc272) + %tmp78 = arith.mulf %tmp75_84, %tmp76_87 : tensor<64x4xf32> loc(#loc273) + %tmp80 = arith.subf %cst_11, %tmp78 : tensor<64x4xf32> loc(#loc274) + %tmp82 = arith.select %tmp31, %tmp80, %cst_11 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc275) + %tmp83 = arith.addi %tmp17, %cst_9 : tensor<1x4xi32> loc(#loc276) + %tmp83_88 = tt.broadcast %tmp83 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc277) + %tmp83_89 = arith.addi %tmp83_88, %tmp50_22 : tensor<64x4xi32> loc(#loc277) + %tmp83_90 = arith.addi %tmp83_89, %tmp50_25 : tensor<64x4xi32> loc(#loc278) + %tmp83_91 = tt.addptr %tmp50_27, %tmp83_90 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc279) + %tmp83_92 = tt.load %tmp83_91, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc280) + %tmp83_93 = arith.extf %tmp83_92 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc281) + %tmp88 = arith.mulf %tmp83_93, %tmp75 : tensor<64x4xf32> loc(#loc282) + %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x4x!tt.ptr>, tensor<1x4xi32> loc(#loc283) + %tmp89_94 = tt.broadcast %tmp89 : tensor<1x4x!tt.ptr> -> tensor<64x4x!tt.ptr> loc(#loc283) + %tmp89_95 = tt.load %tmp89_94, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<64x4x!tt.ptr> loc(#loc284) + %tmp89_96 = arith.extf %tmp89_95 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc285) + %tmp91 = arith.mulf %tmp88, %tmp89_96 : tensor<64x4xf32> loc(#loc286) + %tmp94 = arith.select %tmp48, %tmp91, %cst_11 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc287) + %tmp95 = arith.select %tmp31, %tmp82, %tmp94 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc288) + %tmp101 = arith.mulf %tmp96_47, %tmp75 : tensor<64x4xf32> loc(#loc289) + %tmp104 = tt.broadcast %tmp102_50 : tensor<1x4xf32> -> tensor<64x4xf32> loc(#loc290) + %tmp104_97 = arith.mulf %tmp101, %tmp104 : tensor<64x4xf32> loc(#loc290) + %tmp107 = arith.mulf %tmp104_97, %tmp63_39 : tensor<64x4xf32> loc(#loc291) + %tmp109 = arith.mulf %tmp95, %tmp66_41 : tensor<64x4xf32> loc(#loc292) + %tmp110 = arith.addf %tmp107, %tmp109 : tensor<64x4xf32> loc(#loc293) + %0 = arith.muli %xindex_16, %cst_8 : tensor<64x1xi32> loc(#loc142) + %1 = tt.broadcast %0 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc143) + %2 = arith.addi %tmp50_21, %1 : tensor<64x4xi32> loc(#loc143) + %3 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc144) + %4 = tt.addptr %3, %2 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc144) + %5 = arith.truncf %tmp68 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc145) + tt.store %4, %5, %tmp50_29 : tensor<64x4x!tt.ptr> loc(#loc145) + %6 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc146) + %7 = tt.addptr %6, %2 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc146) + %8 = arith.truncf %tmp110 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc147) + tt.store %7, %8, %tmp50_29 : tensor<64x4x!tt.ptr> loc(#loc147) + } loc(#loc40) + tt.return loc(#loc148) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc158 = loc("xoffset"(#loc2)) +#loc159 = loc("xoffset"(#loc3)) +#loc160 = loc("xindex"(#loc4)) +#loc161 = loc("xindex"(#loc5)) +#loc162 = loc("xindex"(#loc6)) +#loc163 = loc("r0_base"(#loc7)) +#loc164 = loc("r0_base"(#loc8)) +#loc165 = loc("x0"(#loc9)) +#loc166 = loc("x1"(#loc10)) +#loc167 = loc("_tmp4"(#loc11)) +#loc168 = loc("r0_index"(#loc12)) +#loc169 = loc("r0_mask"(#loc13)) +#loc170 = loc("tmp0"(#loc14)) +#loc171 = loc("tmp0"(#loc15)) +#loc172 = loc("tmp0"(#loc16)) +#loc173 = loc("tmp0"(#loc17)) +#loc174 = loc("tmp0"(#loc18)) +#loc175 = loc("tmp0"(#loc19)) +#loc176 = loc("tmp0"(#loc20)) +#loc177 = loc("tmp0"(#loc21)) +#loc178 = loc("tmp6"(#loc22)) +#loc179 = loc("tmp6"(#loc23)) +#loc180 = loc("tmp6"(#loc24)) +#loc181 = loc("tmp6"(#loc25)) +#loc182 = loc("tmp6"(#loc26)) +#loc183 = loc("tmp2"(#loc27)) +#loc184 = loc("tmp5"(#loc28)) +#loc185 = loc("_tmp4"(#loc29)) +#loc186 = loc("tmp8"(#loc30)) +#loc187 = loc("tmp11"(#loc31)) +#loc188 = loc("_tmp10"(#loc32)) +#loc190 = loc("tmp4"(#loc37)) +#loc192 = loc("tmp10"(#loc39)) +#loc193 = loc("r0_index"(#loc41)) +#loc194 = loc("r0_mask"(#loc42)) +#loc195 = loc("r0_3"(#loc43)) +#loc196 = loc("r0_4"(#loc44)) +#loc197 = loc("tmp50"(#loc45)) +#loc198 = loc("tmp50"(#loc46)) +#loc199 = loc("tmp50"(#loc47)) +#loc200 = loc("tmp50"(#loc48)) +#loc201 = loc("tmp50"(#loc49)) +#loc202 = loc("tmp50"(#loc50)) +#loc203 = loc("tmp50"(#loc51)) +#loc204 = loc("tmp58"(#loc52)) +#loc205 = loc("tmp58"(#loc53)) +#loc206 = loc("tmp58"(#loc54)) +#loc207 = loc("tmp63"(#loc55)) +#loc208 = loc("tmp63"(#loc56)) +#loc209 = loc("tmp63"(#loc57)) +#loc210 = loc("tmp63"(#loc58)) +#loc211 = loc("tmp66"(#loc59)) +#loc212 = loc("tmp66"(#loc60)) +#loc213 = loc("tmp96"(#loc61)) +#loc214 = loc("tmp96"(#loc62)) +#loc215 = loc("tmp96"(#loc63)) +#loc216 = loc("tmp96"(#loc64)) +#loc217 = loc("tmp96"(#loc65)) +#loc218 = loc("tmp96"(#loc66)) +#loc219 = loc("tmp102"(#loc67)) +#loc220 = loc("tmp102"(#loc68)) +#loc221 = loc("tmp102"(#loc69)) +#loc222 = loc("tmp16"(#loc70)) +#loc223 = loc("tmp17"(#loc71)) +#loc224 = loc("tmp17"(#loc72)) +#loc225 = loc("tmp17"(#loc73)) +#loc226 = loc("tmp17"(#loc74)) +#loc227 = loc("tmp17"(#loc75)) +#loc228 = loc("tmp17"(#loc76)) +#loc229 = loc("tmp17"(#loc77)) +#loc230 = loc("tmp17"(#loc78)) +#loc231 = loc("tmp20"(#loc79)) +#loc232 = loc("tmp22"(#loc80)) +#loc233 = loc("tmp23"(#loc81)) +#loc234 = loc("tmp24"(#loc82)) +#loc235 = loc("tmp25"(#loc83)) +#loc236 = loc("tmp25"(#loc84)) +#loc237 = loc("tmp25"(#loc85)) +#loc238 = loc("tmp27"(#loc86)) +#loc239 = loc("tmp29"(#loc87)) +#loc240 = loc("tmp31"(#loc88)) +#loc241 = loc("tmp32"(#loc89)) +#loc242 = loc("tmp35"(#loc90)) +#loc243 = loc("tmp35"(#loc91)) +#loc244 = loc("tmp35"(#loc92)) +#loc245 = loc("tmp35"(#loc93)) +#loc246 = loc("tmp35"(#loc94)) +#loc247 = loc("tmp35"(#loc95)) +#loc248 = loc("tmp42"(#loc96)) +#loc249 = loc("tmp43"(#loc97)) +#loc250 = loc("tmp43"(#loc98)) +#loc251 = loc("tmp43"(#loc99)) +#loc252 = loc("tmp45"(#loc100)) +#loc253 = loc("tmp48"(#loc101)) +#loc254 = loc("tmp49"(#loc102)) +#loc255 = loc("tmp57"(#loc103)) +#loc256 = loc("tmp60"(#loc104)) +#loc257 = loc("tmp64"(#loc105)) +#loc258 = loc("tmp67"(#loc106)) +#loc259 = loc("tmp68"(#loc107)) +#loc260 = loc("tmp70"(#loc108)) +#loc261 = loc("tmp70"(#loc109)) +#loc262 = loc("tmp70"(#loc110)) +#loc263 = loc("tmp70"(#loc111)) +#loc264 = loc("tmp70"(#loc112)) +#loc265 = loc("tmp70"(#loc113)) +#loc266 = loc("tmp72"(#loc114)) +#loc267 = loc("tmp73"(#loc115)) +#loc268 = loc("tmp74"(#loc116)) +#loc269 = loc("tmp75"(#loc117)) +#loc270 = loc("tmp76"(#loc118)) +#loc271 = loc("tmp76"(#loc119)) +#loc272 = loc("tmp76"(#loc120)) +#loc273 = loc("tmp78"(#loc121)) +#loc274 = loc("tmp80"(#loc122)) +#loc275 = loc("tmp82"(#loc123)) +#loc276 = loc("tmp83"(#loc124)) +#loc277 = loc("tmp83"(#loc125)) +#loc278 = loc("tmp83"(#loc126)) +#loc279 = loc("tmp83"(#loc127)) +#loc280 = loc("tmp83"(#loc128)) +#loc281 = loc("tmp83"(#loc129)) +#loc282 = loc("tmp88"(#loc130)) +#loc283 = loc("tmp89"(#loc131)) +#loc284 = loc("tmp89"(#loc132)) +#loc285 = loc("tmp89"(#loc133)) +#loc286 = loc("tmp91"(#loc134)) +#loc287 = loc("tmp94"(#loc135)) +#loc288 = loc("tmp95"(#loc136)) +#loc289 = loc("tmp101"(#loc137)) +#loc290 = loc("tmp104"(#loc138)) +#loc291 = loc("tmp107"(#loc139)) +#loc292 = loc("tmp109"(#loc140)) +#loc293 = loc("tmp110"(#loc141)) +#loc294 = loc("_tmp10"(#loc167)) +#loc295 = loc(callsite(#loc34 at #loc189)) +#loc297 = loc(callsite(#loc34 at #loc191)) +#loc299 = loc(callsite(#loc36 at #loc295)) +#loc300 = loc(callsite(#loc36 at #loc297)) diff --git a/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f79f09dd17622a1967646779fa828a249f89f662 --- /dev/null +++ b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json"}} \ No newline at end of file diff --git a/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..3a9918e0230ef35f33f532cb18b983cff0d23ee8 Binary files /dev/null and b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin differ diff --git a/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7025dac46dadd80b0c45f6c7c29d139ce2c87572 --- /dev/null +++ b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json @@ -0,0 +1 @@ +{"hash": "e08019044dda6d34ba23cdc41eb493aa27e07b83eaccdd40f499225377049aa6", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 512, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0"} \ No newline at end of file diff --git a/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..9264a49f978f7b0ec4d99159f44c7c6c55fae03b --- /dev/null +++ b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir @@ -0,0 +1,688 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %13 = shl nuw i32 %12, 1, !dbg !9 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %15 = and i32 %14, 32, !dbg !10 + %.lobit = lshr exact i32 %15, 5, !dbg !10 + %16 = or disjoint i32 %.lobit, %13, !dbg !11 + %17 = and i32 %14, 31, !dbg !12 + %18 = shl nuw nsw i32 %17, 1, !dbg !12 + %19 = sdiv i32 %16, 32, !dbg !13 + %20 = shl i32 %16, 7 + %21 = shl i32 %19, 15 + %22 = add i32 %21, %20 + %23 = add i32 %22, 4096 + %24 = zext nneg i32 %18 to i64, !dbg !14 + %25 = or disjoint i32 %23, %18, !dbg !15 + %26 = sext i32 %25 to i64, !dbg !16 + %27 = getelementptr bfloat, ptr addrspace(1) %2, i64 %26, !dbg !16 + %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17 + %29 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %27, i64 %28, i1 true) #6, !dbg !17 + %30 = bitcast i32 %29 to <2 x bfloat>, !dbg !17 + %31 = extractelement <2 x bfloat> %30, i64 0, !dbg !17 + %32 = extractelement <2 x bfloat> %30, i64 1, !dbg !17 + %33 = fpext bfloat %31 to float, !dbg !18 + %34 = fpext bfloat %32 to float, !dbg !18 + %35 = or disjoint i32 %22, %18, !dbg !19 + %36 = sext i32 %35 to i64, !dbg !20 + %37 = getelementptr bfloat, ptr addrspace(1) %2, i64 %36, !dbg !20 + %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !21 + %39 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %37, i64 %38, i1 true) #6, !dbg !21 + %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !21 + %41 = extractelement <2 x bfloat> %40, i64 0, !dbg !21 + %42 = extractelement <2 x bfloat> %40, i64 1, !dbg !21 + %43 = fpext bfloat %41 to float, !dbg !22 + %44 = fpext bfloat %42 to float, !dbg !22 + %45 = fmul float %33, %33, !dbg !23 + %46 = fmul float %34, %34, !dbg !23 + %47 = fmul float %43, %43, !dbg !24 + %48 = fmul float %44, %44, !dbg !24 + %49 = or disjoint i32 %18, 64, !dbg !25 + %50 = or disjoint i32 %23, %49, !dbg !15 + %51 = sext i32 %50 to i64, !dbg !16 + %52 = getelementptr bfloat, ptr addrspace(1) %2, i64 %51, !dbg !16 + %53 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17 + %54 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %52, i64 %53, i1 true) #6, !dbg !17 + %55 = bitcast i32 %54 to <2 x bfloat>, !dbg !17 + %56 = extractelement <2 x bfloat> %55, i64 0, !dbg !17 + %57 = extractelement <2 x bfloat> %55, i64 1, !dbg !17 + %58 = fpext bfloat %56 to float, !dbg !18 + %59 = fpext bfloat %57 to float, !dbg !18 + %60 = or disjoint i32 %22, %49, !dbg !19 + %61 = sext i32 %60 to i64, !dbg !20 + %62 = getelementptr bfloat, ptr addrspace(1) %2, i64 %61, !dbg !20 + %63 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !21 + %64 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %62, i64 %63, i1 true) #6, !dbg !21 + %65 = bitcast i32 %64 to <2 x bfloat>, !dbg !21 + %66 = extractelement <2 x bfloat> %65, i64 0, !dbg !21 + %67 = extractelement <2 x bfloat> %65, i64 1, !dbg !21 + %68 = fpext bfloat %66 to float, !dbg !22 + %69 = fpext bfloat %67 to float, !dbg !22 + %70 = fmul float %58, %58, !dbg !23 + %71 = fmul float %59, %59, !dbg !23 + %72 = fadd float %45, %70, !dbg !26 + %73 = fadd float %46, %71, !dbg !26 + %74 = fmul float %68, %68, !dbg !24 + %75 = fmul float %69, %69, !dbg !24 + %76 = fadd float %47, %74, !dbg !27 + %77 = fadd float %48, %75, !dbg !27 + %78 = and i32 %14, 63, !dbg !10 + %.not = icmp eq i32 %15, 0, !dbg !10 + %79 = and i32 %14, 1, !dbg !10 + %.not2 = icmp eq i32 %79, 0, !dbg !10 + %80 = or disjoint i32 %13, %79, !dbg !11 + %81 = and i32 %14, 62, !dbg !12 + %82 = lshr exact i32 %81, 1, !dbg !12 + %83 = sdiv i32 %80, 32, !dbg !13 + %84 = fadd float %72, %73, !dbg !28 + %85 = bitcast float %84 to i32, !dbg !31 + %86 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %85, i32 16, i32 31), !dbg !31 + %87 = bitcast i32 %86 to float, !dbg !31 + %88 = fadd float %84, %87, !dbg !28 + %89 = bitcast float %88 to i32, !dbg !31 + %90 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %89, i32 8, i32 31), !dbg !31 + %91 = bitcast i32 %90 to float, !dbg !31 + %92 = fadd float %88, %91, !dbg !28 + %93 = bitcast float %92 to i32, !dbg !31 + %94 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %93, i32 4, i32 31), !dbg !31 + %95 = bitcast i32 %94 to float, !dbg !31 + %96 = fadd float %92, %95, !dbg !28 + %97 = bitcast float %96 to i32, !dbg !31 + %98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %97, i32 2, i32 31), !dbg !31 + %99 = bitcast i32 %98 to float, !dbg !31 + %100 = fadd float %96, %99, !dbg !28 + %101 = bitcast float %100 to i32, !dbg !31 + %102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 1, i32 31), !dbg !31 + %103 = bitcast i32 %102 to float, !dbg !31 + %104 = fadd float %100, %103, !dbg !28 + %105 = fadd float %76, %77, !dbg !34 + %106 = bitcast float %105 to i32, !dbg !35 + %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 16, i32 31), !dbg !35 + %108 = bitcast i32 %107 to float, !dbg !35 + %109 = fadd float %105, %108, !dbg !34 + %110 = bitcast float %109 to i32, !dbg !35 + %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 8, i32 31), !dbg !35 + %112 = bitcast i32 %111 to float, !dbg !35 + %113 = fadd float %109, %112, !dbg !34 + %114 = bitcast float %113 to i32, !dbg !35 + %115 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %114, i32 4, i32 31), !dbg !35 + %116 = bitcast i32 %115 to float, !dbg !35 + %117 = fadd float %113, %116, !dbg !34 + %118 = bitcast float %117 to i32, !dbg !35 + %119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %118, i32 2, i32 31), !dbg !35 + %120 = bitcast i32 %119 to float, !dbg !35 + %121 = fadd float %117, %120, !dbg !34 + %122 = bitcast float %121 to i32, !dbg !35 + %123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 1, i32 31), !dbg !35 + %124 = bitcast i32 %123 to float, !dbg !35 + %125 = fadd float %121, %124, !dbg !34 + %126 = shl i32 %19, 7, !dbg !37 + %127 = tail call float @llvm.nvvm.div.full(float %125, float 1.280000e+02), !dbg !38 + %128 = fadd float %127, 0x3EB0C6F7A0000000, !dbg !39 + %129 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40 + %.not.i = icmp eq i32 %129, 0, !dbg !40 + br i1 %.not.i, label %132, label %130, !dbg !40 + +130: ; preds = %11 + %131 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %128), !dbg !40 + br label %__nv_rsqrtf.exit, !dbg !40 + +132: ; preds = %11 + %133 = tail call float @llvm.nvvm.rsqrt.approx.f(float %128), !dbg !40 + br label %__nv_rsqrtf.exit, !dbg !40 + +__nv_rsqrtf.exit: ; preds = %130, %132 + %.0.i = phi float [ %131, %130 ], [ %133, %132 ], !dbg !40 + %134 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40 + %.not.i4 = icmp eq i32 %134, 0, !dbg !40 + br i1 %.not.i4, label %137, label %135, !dbg !40 + +135: ; preds = %__nv_rsqrtf.exit + %136 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %128), !dbg !40 + br label %__nv_rsqrtf.exit6, !dbg !40 + +137: ; preds = %__nv_rsqrtf.exit + %138 = tail call float @llvm.nvvm.rsqrt.approx.f(float %128), !dbg !40 + br label %__nv_rsqrtf.exit6, !dbg !40 + +__nv_rsqrtf.exit6: ; preds = %135, %137 + %.0.i5 = phi float [ %136, %135 ], [ %138, %137 ], !dbg !40 + %139 = lshr exact i32 %15, 3, !dbg !41 + %140 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %139, !dbg !41 + store float %.0.i, ptr addrspace(3) %140, align 4, !dbg !41 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !41 + %141 = shl nuw nsw i32 %79, 2, !dbg !41 + %142 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %141, !dbg !41 + %143 = load float, ptr addrspace(3) %142, align 4, !dbg !41 + %144 = tail call float @llvm.nvvm.div.full(float %104, float 1.280000e+02), !dbg !42 + %145 = fadd float %144, 0x3EB0C6F7A0000000, !dbg !43 + %146 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i7 = icmp eq i32 %146, 0, !dbg !44 + br i1 %.not.i7, label %149, label %147, !dbg !44 + +147: ; preds = %__nv_rsqrtf.exit6 + %148 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %145), !dbg !44 + br label %__nv_rsqrtf.exit9, !dbg !44 + +149: ; preds = %__nv_rsqrtf.exit6 + %150 = tail call float @llvm.nvvm.rsqrt.approx.f(float %145), !dbg !44 + br label %__nv_rsqrtf.exit9, !dbg !44 + +__nv_rsqrtf.exit9: ; preds = %147, %149 + %.0.i8 = phi float [ %148, %147 ], [ %150, %149 ], !dbg !44 + %151 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i10 = icmp eq i32 %151, 0, !dbg !44 + br i1 %.not.i10, label %154, label %152, !dbg !44 + +152: ; preds = %__nv_rsqrtf.exit9 + %153 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %145), !dbg !44 + br label %__nv_rsqrtf.exit12, !dbg !44 + +154: ; preds = %__nv_rsqrtf.exit9 + %155 = tail call float @llvm.nvvm.rsqrt.approx.f(float %145), !dbg !44 + br label %__nv_rsqrtf.exit12, !dbg !44 + +__nv_rsqrtf.exit12: ; preds = %152, %154 + %.0.i11 = phi float [ %153, %152 ], [ %155, %154 ], !dbg !44 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45 + store float %.0.i8, ptr addrspace(3) %140, align 4, !dbg !45 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45 + %156 = load float, ptr addrspace(3) %142, align 4, !dbg !45 + %157 = shl i32 %16, 7, !dbg !46 + %158 = and i32 %82, 1 + %.masked = and i32 %82, 30 + %159 = shl nuw nsw i32 %14, 3 + %160 = and i32 %159, 120 + %161 = and i32 %14, 16 + %162 = lshr exact i32 %161, 2 + %163 = select i1 %.not, i32 0, i32 192 + %164 = xor i32 %163, %160 + %165 = or disjoint i32 %164, %162 + %166 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %165 + %167 = xor i32 %165, 260 + %168 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %167 + %169 = shl nuw nsw i32 %14, 1 + %170 = and i32 %169, 120 + %171 = select i1 %.not2, i32 0, i32 192 + %172 = and i32 %14, 2 + %173 = icmp eq i32 %172, 0 + %174 = select i1 %173, i32 0, i32 260 + %175 = xor i32 %171, %170 + %176 = or disjoint i32 %175, %174 + %177 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %176 + %178 = xor i32 %176, 4 + %179 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %178 + %180 = icmp eq i32 %158, 0 + %181 = shl i32 %80, 7 + %182 = shl i32 %83, 15 + %183 = add i32 %182, %181 + %184 = icmp ne i32 %158, 0 + %185 = shl nuw nsw i32 %81, 1 + %186 = xor i32 %171, %185 + %187 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %186 + %188 = shl nuw nsw i32 %17, 2 + %189 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %188 + %190 = xor i32 %188, 192 + %191 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %190 + %192 = add i32 %183, 4097 + %193 = add i32 %183, 4096 + %194 = shl nuw nsw i32 %78, 2 + %195 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %194 + %196 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %185 + %197 = getelementptr inbounds nuw i8, ptr addrspace(3) %196, i32 128 + %198 = and i32 %169, 60 + %199 = lshr exact i32 %15, 4 + %200 = or disjoint i32 %198, %199 + %201 = or disjoint i32 %200, %171 + %202 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %201 + %203 = xor i32 %201, 64 + %204 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %203 + %205 = and i32 %159, 56 + %206 = lshr i32 %14, 2 + %207 = and i32 %206, 2 + %208 = shl nuw nsw i32 %161, 2 + %209 = or disjoint i32 %205, %208 + %210 = xor i32 %209, %163 + %211 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %207 + %212 = getelementptr inbounds nuw i8, ptr addrspace(3) %211, i32 %210 + %213 = getelementptr inbounds nuw i8, ptr addrspace(3) %212, i32 4 + %214 = zext nneg i32 %.masked to i64, !dbg !47 + %215 = zext nneg i32 %78 to i64, !dbg !47 + %216 = sext i32 %126 to i64, !dbg !47 + %217 = sext i32 %157 to i64, !dbg !47 + br label %218, !dbg !47 + +218: ; preds = %__nv_rsqrtf.exit12, %218 + %219 = phi i1 [ true, %__nv_rsqrtf.exit12 ], [ false, %218 ] + %indvars.iv = phi i64 [ 0, %__nv_rsqrtf.exit12 ], [ 64, %218 ] + %220 = or disjoint i64 %indvars.iv, %24, !dbg !48 + %221 = or disjoint i64 %indvars.iv, %215, !dbg !48 + %222 = or disjoint i64 %indvars.iv, %214, !dbg !49 + %223 = or disjoint i64 %222, 32, !dbg !49 + %224 = trunc nuw nsw i64 %220 to i32, !dbg !50 + %225 = or disjoint i32 %22, %224, !dbg !50 + %226 = sext i32 %225 to i64, !dbg !51 + %227 = getelementptr bfloat, ptr addrspace(1) %2, i64 %226, !dbg !51 + %228 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !52 + %229 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %227, i64 %228, i1 true) #6, !dbg !52 + %230 = bitcast i32 %229 to <2 x bfloat>, !dbg !52 + %231 = extractelement <2 x bfloat> %230, i64 0, !dbg !52 + %232 = extractelement <2 x bfloat> %230, i64 1, !dbg !52 + %233 = fpext bfloat %231 to float, !dbg !53 + %234 = fpext bfloat %232 to float, !dbg !53 + %235 = getelementptr bfloat, ptr addrspace(1) %3, i64 %221, !dbg !54 + %236 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !55 + %237 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %235, i64 %236, i1 true) #6, !dbg !55 + %238 = bitcast i16 %237 to bfloat, !dbg !55 + %239 = fpext bfloat %238 to float, !dbg !56 + %240 = or disjoint i64 %220, %216, !dbg !57 + %241 = getelementptr float, ptr addrspace(1) %4, i64 %240, !dbg !58 + %242 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !59 + %243 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %241, i64 %242, i1 true) #6, !dbg !59 + %244 = extractvalue { i32, i32 } %243, 0, !dbg !59 + %245 = extractvalue { i32, i32 } %243, 1, !dbg !59 + %246 = bitcast i32 %244 to float, !dbg !59 + %247 = bitcast i32 %245 to float, !dbg !59 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !59 + %248 = insertelement <1 x i32> poison, i32 %244, i64 0, !dbg !59 + store <1 x i32> %248, ptr addrspace(3) %166, align 4, !dbg !59 + %249 = insertelement <1 x i32> poison, i32 %245, i64 0, !dbg !59 + store <1 x i32> %249, ptr addrspace(3) %168, align 4, !dbg !59 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !59 + %250 = load float, ptr addrspace(3) %177, align 4, !dbg !59 + %251 = load float, ptr addrspace(3) %179, align 4, !dbg !59 + %252 = getelementptr float, ptr addrspace(1) %5, i64 %240, !dbg !60 + %253 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !61 + %254 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %252, i64 %253, i1 true) #6, !dbg !61 + %255 = extractvalue { i32, i32 } %254, 0, !dbg !61 + %256 = extractvalue { i32, i32 } %254, 1, !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %257 = insertelement <1 x i32> poison, i32 %255, i64 0, !dbg !61 + store <1 x i32> %257, ptr addrspace(3) %166, align 4, !dbg !61 + %258 = insertelement <1 x i32> poison, i32 %256, i64 0, !dbg !61 + store <1 x i32> %258, ptr addrspace(3) %168, align 4, !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %259 = load float, ptr addrspace(3) %177, align 4, !dbg !61 + %260 = load float, ptr addrspace(3) %179, align 4, !dbg !61 + %261 = or disjoint i32 %23, %224, !dbg !62 + %262 = sext i32 %261 to i64, !dbg !63 + %263 = getelementptr bfloat, ptr addrspace(1) %2, i64 %262, !dbg !63 + %264 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !64 + %265 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %263, i64 %264, i1 true) #6, !dbg !64 + %266 = bitcast i32 %265 to <2 x bfloat>, !dbg !64 + %267 = extractelement <2 x bfloat> %266, i64 0, !dbg !64 + %268 = extractelement <2 x bfloat> %266, i64 1, !dbg !64 + %269 = fpext bfloat %267 to float, !dbg !65 + %270 = fpext bfloat %268 to float, !dbg !65 + %271 = getelementptr bfloat, ptr addrspace(1) %6, i64 %221, !dbg !66 + %272 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !67 + %273 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %271, i64 %272, i1 true) #6, !dbg !67 + %274 = bitcast i16 %273 to bfloat, !dbg !67 + %275 = fpext bfloat %274 to float, !dbg !68 + %276 = or disjoint i64 %222, 1, !dbg !69 + %277 = or disjoint i64 %222, 33, !dbg !69 + %278 = trunc nuw nsw i64 %276 to i32, !dbg !70 + %279 = or disjoint i32 %183, %278, !dbg !70 + %280 = trunc nuw nsw i64 %277 to i32, !dbg !70 + %281 = or disjoint i32 %183, %280, !dbg !70 + %282 = sext i32 %279 to i64, !dbg !71 + %283 = getelementptr bfloat, ptr addrspace(1) %2, i64 %282, !dbg !71 + %284 = sext i32 %281 to i64, !dbg !71 + %285 = getelementptr bfloat, ptr addrspace(1) %2, i64 %284, !dbg !71 + %286 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72 + %287 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %283, i64 %286, i1 %180) #6, !dbg !72 + %288 = bitcast i16 %287 to bfloat, !dbg !72 + %289 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72 + %290 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %285, i64 %289, i1 %180) #6, !dbg !72 + %291 = bitcast i16 %290 to bfloat, !dbg !72 + %292 = fpext bfloat %288 to float, !dbg !73 + %293 = fpext bfloat %291 to float, !dbg !73 + %294 = fmul float %143, %292, !dbg !41 + %295 = fmul float %143, %293, !dbg !41 + %296 = getelementptr bfloat, ptr addrspace(1) %3, i64 %276, !dbg !74 + %297 = getelementptr bfloat, ptr addrspace(1) %3, i64 %277, !dbg !74 + %298 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75 + %299 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %296, i64 %298, i1 %180) #6, !dbg !75 + %300 = bitcast i16 %299 to bfloat, !dbg !75 + %301 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75 + %302 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %297, i64 %301, i1 %180) #6, !dbg !75 + %303 = bitcast i16 %302 to bfloat, !dbg !75 + %304 = fpext bfloat %300 to float, !dbg !76 + %305 = fpext bfloat %303 to float, !dbg !76 + %306 = fmul float %294, %304, !dbg !77 + %307 = fmul float %295, %305, !dbg !77 + %308 = fsub float 0.000000e+00, %306, !dbg !78 + %309 = fsub float 0.000000e+00, %307, !dbg !78 + %310 = trunc nuw nsw i64 %222 to i32, !dbg !79 + %311 = or disjoint i32 %183, %310, !dbg !79 + %312 = trunc nuw nsw i64 %223 to i32, !dbg !79 + %313 = or disjoint i32 %183, %312, !dbg !79 + %314 = sext i32 %311 to i64, !dbg !80 + %315 = getelementptr bfloat, ptr addrspace(1) %2, i64 %314, !dbg !80 + %316 = sext i32 %313 to i64, !dbg !80 + %317 = getelementptr bfloat, ptr addrspace(1) %2, i64 %316, !dbg !80 + %318 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81 + %319 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %315, i64 %318, i1 %184) #6, !dbg !81 + %320 = bitcast i16 %319 to bfloat, !dbg !81 + %321 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81 + %322 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %317, i64 %321, i1 %184) #6, !dbg !81 + %323 = bitcast i16 %322 to bfloat, !dbg !81 + %324 = fpext bfloat %320 to float, !dbg !82 + %325 = fpext bfloat %323 to float, !dbg !82 + %326 = fmul float %143, %324, !dbg !83 + %327 = fmul float %143, %325, !dbg !83 + %328 = getelementptr bfloat, ptr addrspace(1) %3, i64 %222, !dbg !84 + %329 = getelementptr bfloat, ptr addrspace(1) %3, i64 %223, !dbg !84 + %330 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85 + %331 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %328, i64 %330, i1 %184) #6, !dbg !85 + %332 = bitcast i16 %331 to bfloat, !dbg !85 + %333 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85 + %334 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %329, i64 %333, i1 %184) #6, !dbg !85 + %335 = bitcast i16 %334 to bfloat, !dbg !85 + %336 = fpext bfloat %332 to float, !dbg !86 + %337 = fpext bfloat %335 to float, !dbg !86 + %338 = fmul float %326, %336, !dbg !87 + %339 = fmul float %327, %337, !dbg !87 + %340 = select i1 %180, float %308, float %338, !dbg !88 + %341 = select i1 %180, float %309, float %339, !dbg !88 + %342 = fmul float %.0.i5, %233, !dbg !89 + %343 = fmul float %.0.i5, %234, !dbg !89 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !90 + store float %239, ptr addrspace(3) %187, align 4, !dbg !90 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !90 + %344 = load float, ptr addrspace(3) %189, align 4, !dbg !90 + %345 = load float, ptr addrspace(3) %191, align 4, !dbg !90 + %346 = fmul float %342, %344, !dbg !90 + %347 = fmul float %343, %345, !dbg !90 + %348 = fmul float %346, %246, !dbg !91 + %349 = fmul float %347, %247, !dbg !91 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !91 + store float %348, ptr addrspace(3) %166, align 4, !dbg !91 + store float %349, ptr addrspace(3) %168, align 4, !dbg !91 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !91 + %350 = load float, ptr addrspace(3) %177, align 4, !dbg !91 + %351 = load float, ptr addrspace(3) %179, align 4, !dbg !91 + %352 = fmul float %259, %340, !dbg !92 + %353 = fmul float %260, %341, !dbg !92 + %354 = fadd float %352, %350, !dbg !93 + %355 = fadd float %353, %351, !dbg !93 + %356 = or disjoint i32 %192, %310, !dbg !94 + %357 = or disjoint i32 %192, %312, !dbg !94 + %358 = sext i32 %356 to i64, !dbg !95 + %359 = getelementptr bfloat, ptr addrspace(1) %2, i64 %358, !dbg !95 + %360 = sext i32 %357 to i64, !dbg !95 + %361 = getelementptr bfloat, ptr addrspace(1) %2, i64 %360, !dbg !95 + %362 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96 + %363 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %359, i64 %362, i1 %180) #6, !dbg !96 + %364 = bitcast i16 %363 to bfloat, !dbg !96 + %365 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96 + %366 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %361, i64 %365, i1 %180) #6, !dbg !96 + %367 = bitcast i16 %366 to bfloat, !dbg !96 + %368 = fpext bfloat %364 to float, !dbg !97 + %369 = fpext bfloat %367 to float, !dbg !97 + %370 = fmul float %156, %368, !dbg !45 + %371 = fmul float %156, %369, !dbg !45 + %372 = getelementptr bfloat, ptr addrspace(1) %6, i64 %276, !dbg !98 + %373 = getelementptr bfloat, ptr addrspace(1) %6, i64 %277, !dbg !98 + %374 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99 + %375 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %372, i64 %374, i1 %180) #6, !dbg !99 + %376 = bitcast i16 %375 to bfloat, !dbg !99 + %377 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99 + %378 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %373, i64 %377, i1 %180) #6, !dbg !99 + %379 = bitcast i16 %378 to bfloat, !dbg !99 + %380 = fpext bfloat %376 to float, !dbg !100 + %381 = fpext bfloat %379 to float, !dbg !100 + %382 = fmul float %370, %380, !dbg !101 + %383 = fmul float %371, %381, !dbg !101 + %384 = fsub float 0.000000e+00, %382, !dbg !102 + %385 = fsub float 0.000000e+00, %383, !dbg !102 + %386 = or disjoint i32 %193, %310, !dbg !103 + %387 = or disjoint i32 %193, %312, !dbg !103 + %388 = sext i32 %386 to i64, !dbg !104 + %389 = getelementptr bfloat, ptr addrspace(1) %2, i64 %388, !dbg !104 + %390 = sext i32 %387 to i64, !dbg !104 + %391 = getelementptr bfloat, ptr addrspace(1) %2, i64 %390, !dbg !104 + %392 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105 + %393 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %389, i64 %392, i1 %184) #6, !dbg !105 + %394 = bitcast i16 %393 to bfloat, !dbg !105 + %395 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105 + %396 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %391, i64 %395, i1 %184) #6, !dbg !105 + %397 = bitcast i16 %396 to bfloat, !dbg !105 + %398 = fpext bfloat %394 to float, !dbg !106 + %399 = fpext bfloat %397 to float, !dbg !106 + %400 = fmul float %156, %398, !dbg !107 + %401 = fmul float %156, %399, !dbg !107 + %402 = getelementptr bfloat, ptr addrspace(1) %6, i64 %222, !dbg !108 + %403 = getelementptr bfloat, ptr addrspace(1) %6, i64 %223, !dbg !108 + %404 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109 + %405 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %402, i64 %404, i1 %184) #6, !dbg !109 + %406 = bitcast i16 %405 to bfloat, !dbg !109 + %407 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109 + %408 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %403, i64 %407, i1 %184) #6, !dbg !109 + %409 = bitcast i16 %408 to bfloat, !dbg !109 + %410 = fpext bfloat %406 to float, !dbg !110 + %411 = fpext bfloat %409 to float, !dbg !110 + %412 = fmul float %400, %410, !dbg !111 + %413 = fmul float %401, %411, !dbg !111 + %414 = select i1 %180, float %384, float %412, !dbg !88 + %415 = select i1 %180, float %385, float %413, !dbg !88 + %416 = fmul float %.0.i11, %269, !dbg !112 + %417 = fmul float %.0.i11, %270, !dbg !112 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !112 + store float %416, ptr addrspace(3) %166, align 4, !dbg !112 + store float %417, ptr addrspace(3) %168, align 4, !dbg !112 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !112 + %418 = load float, ptr addrspace(3) %177, align 4, !dbg !112 + %419 = load float, ptr addrspace(3) %179, align 4, !dbg !112 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !113 + store float %275, ptr addrspace(3) %195, align 4, !dbg !113 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !113 + %420 = load float, ptr addrspace(3) %196, align 4, !dbg !113 + %421 = load float, ptr addrspace(3) %197, align 4, !dbg !113 + %422 = fmul float %418, %420, !dbg !114 + %423 = fmul float %419, %421, !dbg !114 + %424 = fmul float %250, %422, !dbg !113 + %425 = fmul float %251, %423, !dbg !113 + %426 = fmul float %259, %414, !dbg !115 + %427 = fmul float %260, %415, !dbg !115 + %428 = fadd float %426, %424, !dbg !116 + %429 = fadd float %427, %425, !dbg !116 + %430 = or disjoint i64 %220, %217, !dbg !117 + %431 = getelementptr bfloat, ptr addrspace(1) %0, i64 %430, !dbg !118 + %432 = fptrunc float %354 to bfloat, !dbg !119 + %433 = fptrunc float %355 to bfloat, !dbg !119 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119 + store bfloat %432, ptr addrspace(3) %202, align 2, !dbg !119 + store bfloat %433, ptr addrspace(3) %204, align 2, !dbg !119 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119 + %434 = load bfloat, ptr addrspace(3) %212, align 2, !dbg !119 + %435 = load bfloat, ptr addrspace(3) %213, align 2, !dbg !119 + %436 = insertelement <2 x bfloat> poison, bfloat %434, i64 0, !dbg !119 + %437 = insertelement <2 x bfloat> %436, bfloat %435, i64 1, !dbg !119 + %438 = bitcast <2 x bfloat> %437 to i32, !dbg !119 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %438, ptr addrspace(1) %431, i1 true) #6, !dbg !119 + %439 = getelementptr bfloat, ptr addrspace(1) %1, i64 %430, !dbg !120 + %440 = fptrunc float %428 to bfloat, !dbg !121 + %441 = fptrunc float %429 to bfloat, !dbg !121 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !121 + store bfloat %440, ptr addrspace(3) %202, align 2, !dbg !121 + store bfloat %441, ptr addrspace(3) %204, align 2, !dbg !121 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !121 + %442 = load bfloat, ptr addrspace(3) %212, align 2, !dbg !121 + %443 = load bfloat, ptr addrspace(3) %213, align 2, !dbg !121 + %444 = insertelement <2 x bfloat> poison, bfloat %442, i64 0, !dbg !121 + %445 = insertelement <2 x bfloat> %444, bfloat %443, i64 1, !dbg !121 + %446 = bitcast <2 x bfloat> %445 to i32, !dbg !121 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %446, ptr addrspace(1) %439, i1 true) #6, !dbg !121 + br i1 %219, label %218, label %447, !dbg !47 + +447: ; preds = %218 + ret void, !dbg !122 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="64" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", linkageName: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 23, column: 33, scope: !5) +!10 = !DILocation(line: 24, column: 44, scope: !5) +!11 = !DILocation(line: 24, column: 23, scope: !5) +!12 = !DILocation(line: 26, column: 37, scope: !5) +!13 = !DILocation(line: 29, column: 19, scope: !5) +!14 = !DILocation(line: 33, column: 43, scope: !5) +!15 = !DILocation(line: 39, column: 57, scope: !5) +!16 = !DILocation(line: 39, column: 34, scope: !5) +!17 = !DILocation(line: 39, column: 68, scope: !5) +!18 = !DILocation(line: 39, column: 121, scope: !5) +!19 = !DILocation(line: 40, column: 50, scope: !5) +!20 = !DILocation(line: 40, column: 34, scope: !5) +!21 = !DILocation(line: 40, column: 61, scope: !5) +!22 = !DILocation(line: 40, column: 114, scope: !5) +!23 = !DILocation(line: 42, column: 22, scope: !5) +!24 = !DILocation(line: 47, column: 22, scope: !5) +!25 = !DILocation(line: 34, column: 31, scope: !5) +!26 = !DILocation(line: 44, column: 23, scope: !5) +!27 = !DILocation(line: 49, column: 25, scope: !5) +!28 = !DILocation(line: 263, column: 15, scope: !29, inlinedAt: !31) +!29 = distinct !DILexicalBlockFile(scope: !5, file: !30, discriminator: 0) +!30 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!31 = !DILocation(line: 293, column: 36, scope: !29, inlinedAt: !32) +!32 = !DILocation(line: 51, column: 25, scope: !33) +!33 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!34 = !DILocation(line: 263, column: 15, scope: !29, inlinedAt: !35) +!35 = !DILocation(line: 293, column: 36, scope: !29, inlinedAt: !36) +!36 = !DILocation(line: 52, column: 27, scope: !33) +!37 = !DILocation(line: 63, column: 46, scope: !5) +!38 = !DILocation(line: 75, column: 25, scope: !5) +!39 = !DILocation(line: 77, column: 24, scope: !5) +!40 = !DILocation(line: 78, column: 32, scope: !5) +!41 = !DILocation(line: 79, column: 24, scope: !5) +!42 = !DILocation(line: 123, column: 24, scope: !5) +!43 = !DILocation(line: 124, column: 24, scope: !5) +!44 = !DILocation(line: 125, column: 32, scope: !5) +!45 = !DILocation(line: 126, column: 24, scope: !5) +!46 = !DILocation(line: 161, column: 43, scope: !5) +!47 = !DILocation(line: 53, column: 43, scope: !5) +!48 = !DILocation(line: 54, column: 31, scope: !5) +!49 = !DILocation(line: 72, column: 41, scope: !5) +!50 = !DILocation(line: 61, column: 51, scope: !5) +!51 = !DILocation(line: 61, column: 35, scope: !5) +!52 = !DILocation(line: 61, column: 62, scope: !5) +!53 = !DILocation(line: 61, column: 115, scope: !5) +!54 = !DILocation(line: 62, column: 35, scope: !5) +!55 = !DILocation(line: 62, column: 42, scope: !5) +!56 = !DILocation(line: 62, column: 95, scope: !5) +!57 = !DILocation(line: 63, column: 42, scope: !5) +!58 = !DILocation(line: 63, column: 35, scope: !5) +!59 = !DILocation(line: 63, column: 51, scope: !5) +!60 = !DILocation(line: 64, column: 35, scope: !5) +!61 = !DILocation(line: 64, column: 51, scope: !5) +!62 = !DILocation(line: 65, column: 58, scope: !5) +!63 = !DILocation(line: 65, column: 35, scope: !5) +!64 = !DILocation(line: 65, column: 69, scope: !5) +!65 = !DILocation(line: 65, column: 123, scope: !5) +!66 = !DILocation(line: 66, column: 36, scope: !5) +!67 = !DILocation(line: 66, column: 43, scope: !5) +!68 = !DILocation(line: 66, column: 96, scope: !5) +!69 = !DILocation(line: 72, column: 39, scope: !5) +!70 = !DILocation(line: 72, column: 57, scope: !5) +!71 = !DILocation(line: 72, column: 35, scope: !5) +!72 = !DILocation(line: 72, column: 68, scope: !5) +!73 = !DILocation(line: 72, column: 129, scope: !5) +!74 = !DILocation(line: 80, column: 35, scope: !5) +!75 = !DILocation(line: 80, column: 85, scope: !5) +!76 = !DILocation(line: 80, column: 146, scope: !5) +!77 = !DILocation(line: 82, column: 24, scope: !5) +!78 = !DILocation(line: 84, column: 17, scope: !5) +!79 = !DILocation(line: 90, column: 53, scope: !5) +!80 = !DILocation(line: 90, column: 35, scope: !5) +!81 = !DILocation(line: 90, column: 64, scope: !5) +!82 = !DILocation(line: 90, column: 125, scope: !5) +!83 = !DILocation(line: 97, column: 24, scope: !5) +!84 = !DILocation(line: 98, column: 35, scope: !5) +!85 = !DILocation(line: 98, column: 81, scope: !5) +!86 = !DILocation(line: 98, column: 142, scope: !5) +!87 = !DILocation(line: 100, column: 24, scope: !5) +!88 = !DILocation(line: 0, scope: !5) +!89 = !DILocation(line: 111, column: 24, scope: !5) +!90 = !DILocation(line: 113, column: 24, scope: !5) +!91 = !DILocation(line: 116, column: 24, scope: !5) +!92 = !DILocation(line: 118, column: 24, scope: !5) +!93 = !DILocation(line: 119, column: 24, scope: !5) +!94 = !DILocation(line: 121, column: 60, scope: !5) +!95 = !DILocation(line: 121, column: 35, scope: !5) +!96 = !DILocation(line: 121, column: 71, scope: !5) +!97 = !DILocation(line: 121, column: 132, scope: !5) +!98 = !DILocation(line: 127, column: 35, scope: !5) +!99 = !DILocation(line: 127, column: 85, scope: !5) +!100 = !DILocation(line: 127, column: 146, scope: !5) +!101 = !DILocation(line: 129, column: 24, scope: !5) +!102 = !DILocation(line: 131, column: 17, scope: !5) +!103 = !DILocation(line: 134, column: 60, scope: !5) +!104 = !DILocation(line: 134, column: 35, scope: !5) +!105 = !DILocation(line: 134, column: 71, scope: !5) +!106 = !DILocation(line: 134, column: 132, scope: !5) +!107 = !DILocation(line: 139, column: 24, scope: !5) +!108 = !DILocation(line: 140, column: 35, scope: !5) +!109 = !DILocation(line: 140, column: 81, scope: !5) +!110 = !DILocation(line: 140, column: 142, scope: !5) +!111 = !DILocation(line: 142, column: 24, scope: !5) +!112 = !DILocation(line: 151, column: 25, scope: !5) +!113 = !DILocation(line: 156, column: 26, scope: !5) +!114 = !DILocation(line: 153, column: 26, scope: !5) +!115 = !DILocation(line: 158, column: 26, scope: !5) +!116 = !DILocation(line: 159, column: 26, scope: !5) +!117 = !DILocation(line: 161, column: 39, scope: !5) +!118 = !DILocation(line: 161, column: 32, scope: !5) +!119 = !DILocation(line: 161, column: 55, scope: !5) +!120 = !DILocation(line: 162, column: 32, scope: !5) +!121 = !DILocation(line: 162, column: 56, scope: !5) +!122 = !DILocation(line: 53, column: 4, scope: !5) diff --git a/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..0d1083c2cdb52c38c0c6488593c7c8ea82c98b59 --- /dev/null +++ b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx @@ -0,0 +1,1211 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 // -- Begin function triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 +.visible .entry triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0( + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6, + .param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_7, + .param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_8, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_9, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_10 +) +.reqntid 64 +{ + .reg .pred %p<6>; + .reg .b16 %rs<40>; + .reg .b32 %r<227>; + .reg .b64 %rd<98>; + .loc 1 18 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0 + +// %bb.0: // %__nv_rsqrtf.exit + ld.param.b64 %rd12, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6]; + ld.param.b64 %rd11, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5]; + ld.param.b64 %rd10, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4]; + ld.param.b64 %rd9, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3]; + ld.param.b64 %rd8, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2]; + ld.param.b64 %rd7, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1]; + ld.param.b64 %rd6, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0]; +$L__tmp0: + .loc 1 23 28 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:28 + mov.u32 %r28, %ctaid.x; + .loc 1 23 33 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:33 + shl.b32 %r29, %r28, 1; + .loc 1 24 44 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44 + mov.u32 %r30, %tid.x; + bfe.s32 %r31, %r30, 5, 1; + and.b32 %r32, %r30, 32; + bfe.u32 %r33, %r30, 5, 1; + .loc 1 24 23 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23 + or.b32 %r34, %r33, %r29; + .loc 1 26 37 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37 + and.b32 %r35, %r30, 31; + shl.b32 %r36, %r35, 1; + .loc 1 29 19 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19 + bfe.s32 %r37, %r28, 30, 1; + shr.u32 %r38, %r37, 27; + add.s32 %r39, %r34, %r38; + shr.s32 %r40, %r39, 5; + shl.b32 %r41, %r34, 7; + shl.b32 %r42, %r40, 15; + add.s32 %r1, %r42, %r41; + add.s32 %r2, %r1, 4096; + .loc 1 33 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:33:43 + cvt.u64.u32 %rd1, %r36; + .loc 1 39 57 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:57 + or.b32 %r43, %r2, %r36; + .loc 1 39 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34 + mad.wide.s32 %rd13, %r43, 2, %rd8; + .loc 1 39 68 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68 + // begin inline asm + mov.u64 %rd14, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd14, 1.0; + // end inline asm + mov.b32 %r24, 0; + mov.pred %p2, -1; + // begin inline asm + mov.u32 %r23, %r24; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r23 }, [ %rd13 + 0 ], %rd14; + // end inline asm + mov.b32 {%rs1, %rs2}, %r23; + .loc 1 39 121 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:121 + cvt.f32.bf16 %r44, %rs1; + cvt.f32.bf16 %r45, %rs2; + .loc 1 40 50 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:50 + or.b32 %r46, %r1, %r36; + .loc 1 40 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34 + mad.wide.s32 %rd15, %r46, 2, %rd8; + .loc 1 40 61 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61 + // begin inline asm + mov.u64 %rd16, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd16, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r25, %r24; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r25 }, [ %rd15 + 0 ], %rd16; + // end inline asm + mov.b32 {%rs3, %rs4}, %r25; + .loc 1 40 114 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114 + cvt.f32.bf16 %r47, %rs3; + cvt.f32.bf16 %r48, %rs4; + .loc 1 39 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34 + cvt.s64.s32 %rd21, %r2; + or.b64 %rd22, %rd21, %rd1; + shl.b64 %rd23, %rd22, 1; + add.s64 %rd24, %rd8, %rd23; + add.s64 %rd17, %rd24, 128; + .loc 1 39 68 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68 + // begin inline asm + mov.u64 %rd18, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd18, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r26, %r24; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r26 }, [ %rd17 + 0 ], %rd18; + // end inline asm + mov.b32 {%rs5, %rs6}, %r26; + .loc 1 39 121 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:121 + cvt.f32.bf16 %r49, %rs5; + cvt.f32.bf16 %r50, %rs6; + .loc 1 40 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34 + cvt.s64.s32 %rd25, %r1; + or.b64 %rd26, %rd25, %rd1; + shl.b64 %rd27, %rd26, 1; + add.s64 %rd28, %rd8, %rd27; + add.s64 %rd19, %rd28, 128; + .loc 1 40 61 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61 + // begin inline asm + mov.u64 %rd20, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd20, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r27, %r24; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r27 }, [ %rd19 + 0 ], %rd20; + // end inline asm + mov.b32 {%rs7, %rs8}, %r27; + .loc 1 40 114 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114 + cvt.f32.bf16 %r51, %rs7; + cvt.f32.bf16 %r52, %rs8; + .loc 1 42 22 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:42:22 + mul.f32 %r53, %r49, %r49; + mul.f32 %r54, %r50, %r50; + .loc 1 44 23 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:44:23 + fma.rn.f32 %r55, %r44, %r44, %r53; + fma.rn.f32 %r56, %r45, %r45, %r54; + .loc 1 47 22 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:47:22 + mul.f32 %r57, %r51, %r51; + mul.f32 %r58, %r52, %r52; + .loc 1 49 25 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:49:25 + fma.rn.f32 %r59, %r47, %r47, %r57; + fma.rn.f32 %r60, %r48, %r48, %r58; + .loc 1 24 44 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44 + and.b32 %r61, %r30, 63; + and.b32 %r62, %r30, 1; + neg.s32 %r63, %r62; + .loc 1 24 23 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23 + or.b32 %r64, %r29, %r62; + .loc 1 26 37 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37 + and.b32 %r65, %r30, 62; + bfe.u32 %r66, %r30, 1, 5; + .loc 1 29 19 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19 + add.s32 %r67, %r64, %r38; +$L__tmp1: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r68, %r55, %r56; +$L__tmp2: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r69, %r68, 16, 31, -1; +$L__tmp3: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r70, %r68, %r69; +$L__tmp4: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r71, %r70, 8, 31, -1; +$L__tmp5: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r72, %r70, %r71; +$L__tmp6: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r73, %r72, 4, 31, -1; +$L__tmp7: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r74, %r72, %r73; +$L__tmp8: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r75, %r74, 2, 31, -1; +$L__tmp9: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r76, %r74, %r75; +$L__tmp10: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r77, %r76, 1, 31, -1; +$L__tmp11: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r78, %r76, %r77; +$L__tmp12: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r79, %r59, %r60; +$L__tmp13: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r80, %r79, 16, 31, -1; +$L__tmp14: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r81, %r79, %r80; +$L__tmp15: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r82, %r81, 8, 31, -1; +$L__tmp16: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r83, %r81, %r82; +$L__tmp17: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r84, %r83, 4, 31, -1; +$L__tmp18: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r85, %r83, %r84; +$L__tmp19: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r86, %r85, 2, 31, -1; +$L__tmp20: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r87, %r85, %r86; +$L__tmp21: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r88, %r87, 1, 31, -1; +$L__tmp22: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r89, %r87, %r88; +$L__tmp23: + .loc 1 63 46 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:46 + shl.b32 %r90, %r40, 7; + mov.b32 %r91, 0f43000000; + .loc 1 75 25 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:75:25 + div.full.f32 %r92, %r89, %r91; + .loc 1 77 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:77:24 + add.f32 %r93, %r92, 0f358637BD; + .loc 1 78 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:78:32 + rsqrt.approx.ftz.f32 %r3, %r93; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + shr.u32 %r94, %r32, 3; + mov.b32 %r95, global_smem; + add.s32 %r96, %r95, %r94; + st.shared.b32 [%r96], %r3; + bar.sync 0; + shl.b32 %r97, %r62, 2; + add.s32 %r98, %r95, %r97; + ld.shared.b32 %r4, [%r98]; + .loc 1 123 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:123:24 + div.full.f32 %r99, %r78, %r91; + .loc 1 124 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:124:24 + add.f32 %r100, %r99, 0f358637BD; + .loc 1 125 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:125:32 + rsqrt.approx.ftz.f32 %r5, %r100; + .loc 1 126 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24 + bar.sync 0; + st.shared.b32 [%r96], %r5; + bar.sync 0; + ld.shared.b32 %r6, [%r98]; + bfe.u32 %r7, %r65, 1, 1; + and.b32 %r101, %r66, 30; + shl.b32 %r102, %r30, 3; + and.b32 %r103, %r102, 120; + and.b32 %r104, %r30, 16; + shr.u32 %r105, %r104, 2; + and.b32 %r106, %r31, 192; + xor.b32 %r107, %r106, %r103; + or.b32 %r108, %r107, %r105; + add.s32 %r8, %r95, %r108; + xor.b32 %r109, %r108, 4; + add.s32 %r9, %r95, %r109; + shl.b32 %r110, %r30, 1; + and.b32 %r111, %r110, 120; + and.b32 %r112, %r63, 192; + bfe.s32 %r113, %r30, 1, 1; + and.b32 %r114, %r113, 260; + xor.b32 %r115, %r112, %r111; + or.b32 %r116, %r115, %r114; + add.s32 %r10, %r95, %r116; + xor.b32 %r117, %r116, 4; + add.s32 %r11, %r95, %r117; + shl.b32 %r118, %r64, 7; + shl.b32 %r119, %r67, 10; + and.b32 %r120, %r119, -32768; + add.s32 %r12, %r120, %r118; + shl.b32 %r121, %r65, 1; + xor.b32 %r122, %r112, %r121; + add.s32 %r13, %r95, %r122; + shl.b32 %r123, %r35, 2; + add.s32 %r14, %r95, %r123; + xor.b32 %r124, %r123, 64; + add.s32 %r15, %r95, %r124; + add.s32 %r16, %r12, 4097; + add.s32 %r17, %r12, 4096; + shl.b32 %r125, %r61, 2; + add.s32 %r18, %r95, %r125; + add.s32 %r19, %r95, %r121; + and.b32 %r126, %r110, 60; + shr.u32 %r127, %r32, 4; + or.b32 %r128, %r126, %r127; + or.b32 %r129, %r128, %r112; + add.s32 %r20, %r95, %r129; + xor.b32 %r130, %r129, 64; + add.s32 %r21, %r95, %r130; + and.b32 %r131, %r102, 56; + shr.u32 %r132, %r30, 2; + and.b32 %r133, %r132, 2; + shl.b32 %r134, %r104, 2; + or.b32 %r135, %r131, %r134; + xor.b32 %r136, %r135, %r106; + add.s32 %r137, %r95, %r133; + add.s32 %r22, %r137, %r136; + .loc 1 53 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43 + cvt.u64.u32 %rd2, %r101; + cvt.u64.u32 %rd3, %r61; + cvt.s64.s32 %rd4, %r90; + cvt.s64.s32 %rd5, %r41; + mov.b64 %rd97, 0; + mov.pred %p5, %p2; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 0 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0:43 + mov.pred %p1, %p5; + setp.ne.b32 %p4, %r7, 0; + setp.eq.b32 %p3, %r7, 0; + .loc 1 54 31 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:54:31 + or.b64 %rd75, %rd97, %rd1; + or.b64 %rd76, %rd97, %rd3; + .loc 1 72 41 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:41 + or.b64 %rd77, %rd97, %rd2; + .loc 1 61 51 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:51 + cvt.u32.u64 %r146, %rd75; + or.b32 %r147, %r1, %r146; + .loc 1 61 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:35 + mad.wide.s32 %rd30, %r147, 2, %rd8; + .loc 1 61 62 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:62 + // begin inline asm + mov.u64 %rd29, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd29, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r138, %r24; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r138 }, [ %rd30 + 0 ], %rd29; + // end inline asm + mov.b32 {%rs28, %rs29}, %r138; + .loc 1 61 115 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:115 + cvt.f32.bf16 %r148, %rs28; + cvt.f32.bf16 %r149, %rs29; + .loc 1 62 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:35 + shl.b64 %rd78, %rd76, 1; + add.s64 %rd32, %rd9, %rd78; + .loc 1 62 42 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:42 + // begin inline asm + mov.u64 %rd31, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd31, 1.0; + // end inline asm + mov.b16 %rs10, 0; + // begin inline asm + mov.u16 %rs9, %rs10; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd32 + 0 ], %rd31; + // end inline asm + .loc 1 62 95 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:95 + cvt.f32.bf16 %r150, %rs9; + .loc 1 63 42 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:42 + or.b64 %rd79, %rd75, %rd4; + .loc 1 63 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:35 + shl.b64 %rd80, %rd79, 2; + add.s64 %rd34, %rd10, %rd80; + .loc 1 63 51 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:51 + // begin inline asm + mov.u64 %rd33, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd33, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r139, %r24; + mov.u32 %r140, %r24; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r139, %r140 }, [ %rd34 + 0 ], %rd33; + // end inline asm + bar.sync 0; + st.shared.b32 [%r8], %r139; + st.shared.b32 [%r9+256], %r140; + bar.sync 0; + ld.shared.b32 %r151, [%r10]; + ld.shared.b32 %r152, [%r11]; + .loc 1 64 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:35 + add.s64 %rd36, %rd11, %rd80; + .loc 1 64 51 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:51 + // begin inline asm + mov.u64 %rd35, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd35, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r141, %r24; + mov.u32 %r142, %r24; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r141, %r142 }, [ %rd36 + 0 ], %rd35; + // end inline asm + bar.sync 0; + st.shared.b32 [%r8], %r141; + st.shared.b32 [%r9+256], %r142; + bar.sync 0; + ld.shared.b32 %r153, [%r10]; + ld.shared.b32 %r154, [%r11]; + .loc 1 65 58 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:58 + or.b32 %r155, %r2, %r146; + .loc 1 65 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:35 + mad.wide.s32 %rd38, %r155, 2, %rd8; + .loc 1 65 69 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69 + // begin inline asm + mov.u64 %rd37, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd37, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r143, %r24; + @%p2 ld.global.L1::evict_first.L2::cache_hint.b32 { %r143 }, [ %rd38 + 0 ], %rd37; + // end inline asm + mov.b32 {%rs30, %rs31}, %r143; + .loc 1 65 123 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:123 + cvt.f32.bf16 %r156, %rs30; + cvt.f32.bf16 %r157, %rs31; + .loc 1 66 36 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:36 + add.s64 %rd40, %rd12, %rd78; + .loc 1 66 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:43 + // begin inline asm + mov.u64 %rd39, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd39, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs11, %rs10; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd40 + 0 ], %rd39; + // end inline asm + .loc 1 66 96 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:96 + cvt.f32.bf16 %r158, %rs11; + .loc 1 72 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35 + cvt.s64.s32 %rd81, %r12; + .loc 1 72 57 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:57 + cvt.u32.u64 %r159, %rd77; + .loc 1 72 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35 + cvt.s64.s32 %rd82, %rd77; + add.s64 %rd83, %rd81, %rd82; + shl.b64 %rd84, %rd83, 1; + add.s64 %rd85, %rd8, %rd84; + add.s64 %rd42, %rd85, 2; + add.s64 %rd44, %rd85, 66; + .loc 1 72 68 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:68 + // begin inline asm + mov.u64 %rd41, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd41, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs12, %rs10; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd42 + 0 ], %rd41; + // end inline asm + // begin inline asm + mov.u64 %rd43, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd43, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs13, %rs10; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd44 + 0 ], %rd43; + // end inline asm + .loc 1 72 129 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129 + cvt.f32.bf16 %r160, %rs12; + cvt.f32.bf16 %r161, %rs13; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + mul.f32 %r162, %r4, %r160; + mul.f32 %r163, %r4, %r161; + .loc 1 80 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:35 + shl.b64 %rd86, %rd77, 1; + add.s64 %rd54, %rd9, %rd86; + add.s64 %rd46, %rd54, 2; + add.s64 %rd48, %rd54, 66; + .loc 1 80 85 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:85 + // begin inline asm + mov.u64 %rd45, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd45, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs14, %rs10; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs14 }, [ %rd46 + 0 ], %rd45; + // end inline asm + // begin inline asm + mov.u64 %rd47, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd47, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs15, %rs10; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd48 + 0 ], %rd47; + // end inline asm + .loc 1 80 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146 + cvt.f32.bf16 %r164, %rs14; + cvt.f32.bf16 %r165, %rs15; + .loc 1 84 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17 + neg.f32 %r166, %r162; + fma.rn.f32 %r167, %r166, %r164, 0f00000000; + neg.f32 %r168, %r163; + fma.rn.f32 %r169, %r168, %r165, 0f00000000; + .loc 1 90 53 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:53 + or.b32 %r170, %r12, %r159; + .loc 1 90 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:35 + mad.wide.s32 %rd50, %r170, 2, %rd8; + add.s64 %rd52, %rd85, 64; + .loc 1 90 64 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:64 + // begin inline asm + mov.u64 %rd49, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd49, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs16, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs16 }, [ %rd50 + 0 ], %rd49; + // end inline asm + // begin inline asm + mov.u64 %rd51, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd51, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs17, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd52 + 0 ], %rd51; + // end inline asm + .loc 1 90 125 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125 + cvt.f32.bf16 %r171, %rs16; + cvt.f32.bf16 %r172, %rs17; + .loc 1 97 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24 + mul.f32 %r173, %r4, %r171; + mul.f32 %r174, %r4, %r172; + .loc 1 98 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:35 + add.s64 %rd56, %rd54, 64; + .loc 1 98 81 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:81 + // begin inline asm + mov.u64 %rd53, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd53, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs18, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs18 }, [ %rd54 + 0 ], %rd53; + // end inline asm + // begin inline asm + mov.u64 %rd55, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd55, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs19, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd56 + 0 ], %rd55; + // end inline asm + .loc 1 98 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142 + cvt.f32.bf16 %r175, %rs18; + cvt.f32.bf16 %r176, %rs19; + .loc 1 100 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24 + mul.f32 %r177, %r173, %r175; + mul.f32 %r178, %r174, %r176; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r179, %r167, %r177, %p3; + selp.f32 %r180, %r169, %r178, %p3; + .loc 1 111 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:111:24 + mul.f32 %r181, %r3, %r148; + mul.f32 %r182, %r3, %r149; + .loc 1 113 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:113:24 + bar.sync 0; + st.shared.b32 [%r13], %r150; + bar.sync 0; + ld.shared.b32 %r183, [%r14]; + ld.shared.b32 %r184, [%r15+128]; + mul.f32 %r185, %r181, %r183; + mul.f32 %r186, %r182, %r184; + .loc 1 116 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:116:24 + mul.f32 %r187, %r185, %r139; + mul.f32 %r188, %r186, %r140; + bar.sync 0; + st.shared.b32 [%r8], %r187; + st.shared.b32 [%r9+256], %r188; + bar.sync 0; + ld.shared.b32 %r189, [%r10]; + ld.shared.b32 %r190, [%r11]; + .loc 1 119 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24 + fma.rn.f32 %r191, %r153, %r179, %r189; + fma.rn.f32 %r192, %r154, %r180, %r190; + .loc 1 121 60 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:60 + or.b32 %r193, %r16, %r159; + .loc 1 121 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:35 + mad.wide.s32 %rd58, %r193, 2, %rd8; + cvt.s64.s32 %rd87, %r16; + add.s64 %rd88, %rd87, %rd82; + shl.b64 %rd89, %rd88, 1; + add.s64 %rd90, %rd8, %rd89; + add.s64 %rd60, %rd90, 64; + .loc 1 121 71 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:71 + // begin inline asm + mov.u64 %rd57, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd57, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs20, %rs10; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs20 }, [ %rd58 + 0 ], %rd57; + // end inline asm + // begin inline asm + mov.u64 %rd59, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd59, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs21, %rs10; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs21 }, [ %rd60 + 0 ], %rd59; + // end inline asm + .loc 1 121 132 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:132 + cvt.f32.bf16 %r194, %rs20; + cvt.f32.bf16 %r195, %rs21; + .loc 1 126 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24 + mul.f32 %r196, %r6, %r194; + mul.f32 %r197, %r6, %r195; + .loc 1 127 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:35 + add.s64 %rd70, %rd12, %rd86; + add.s64 %rd62, %rd70, 2; + add.s64 %rd64, %rd70, 66; + .loc 1 127 85 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:85 + // begin inline asm + mov.u64 %rd61, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd61, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs22, %rs10; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs22 }, [ %rd62 + 0 ], %rd61; + // end inline asm + // begin inline asm + mov.u64 %rd63, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd63, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs23, %rs10; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs23 }, [ %rd64 + 0 ], %rd63; + // end inline asm + .loc 1 127 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:146 + cvt.f32.bf16 %r198, %rs22; + cvt.f32.bf16 %r199, %rs23; + .loc 1 131 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:131:17 + neg.f32 %r200, %r196; + fma.rn.f32 %r201, %r200, %r198, 0f00000000; + neg.f32 %r202, %r197; + fma.rn.f32 %r203, %r202, %r199, 0f00000000; + .loc 1 134 60 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:60 + or.b32 %r204, %r17, %r159; + .loc 1 134 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:35 + mad.wide.s32 %rd66, %r204, 2, %rd8; + cvt.s64.s32 %rd91, %r17; + add.s64 %rd92, %rd91, %rd82; + shl.b64 %rd93, %rd92, 1; + add.s64 %rd94, %rd8, %rd93; + add.s64 %rd68, %rd94, 64; + .loc 1 134 71 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:71 + // begin inline asm + mov.u64 %rd65, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd65, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs24, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs24 }, [ %rd66 + 0 ], %rd65; + // end inline asm + // begin inline asm + mov.u64 %rd67, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd67, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs25, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs25 }, [ %rd68 + 0 ], %rd67; + // end inline asm + .loc 1 134 132 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:132 + cvt.f32.bf16 %r205, %rs24; + cvt.f32.bf16 %r206, %rs25; + .loc 1 139 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:139:24 + mul.f32 %r207, %r6, %r205; + mul.f32 %r208, %r6, %r206; + .loc 1 140 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:35 + add.s64 %rd72, %rd70, 64; + .loc 1 140 81 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:81 + // begin inline asm + mov.u64 %rd69, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd69, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs26, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs26 }, [ %rd70 + 0 ], %rd69; + // end inline asm + // begin inline asm + mov.u64 %rd71, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd71, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs27, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs27 }, [ %rd72 + 0 ], %rd71; + // end inline asm + .loc 1 140 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:142 + cvt.f32.bf16 %r209, %rs26; + cvt.f32.bf16 %r210, %rs27; + .loc 1 142 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:142:24 + mul.f32 %r211, %r207, %r209; + mul.f32 %r212, %r208, %r210; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r213, %r201, %r211, %p3; + selp.f32 %r214, %r203, %r212, %p3; + .loc 1 151 25 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:151:25 + mul.f32 %r215, %r5, %r156; + mul.f32 %r216, %r5, %r157; + bar.sync 0; + st.shared.b32 [%r8], %r215; + st.shared.b32 [%r9+256], %r216; + bar.sync 0; + ld.shared.b32 %r217, [%r10]; + ld.shared.b32 %r218, [%r11]; + .loc 1 156 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26 + bar.sync 0; + st.shared.b32 [%r18], %r158; + bar.sync 0; + ld.shared.b32 %r219, [%r19]; + ld.shared.b32 %r220, [%r19+128]; + .loc 1 153 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:153:26 + mul.f32 %r221, %r217, %r219; + mul.f32 %r222, %r218, %r220; + .loc 1 156 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26 + mul.f32 %r223, %r151, %r221; + mul.f32 %r224, %r152, %r222; + .loc 1 159 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:159:26 + fma.rn.f32 %r225, %r153, %r213, %r223; + fma.rn.f32 %r226, %r154, %r214, %r224; + .loc 1 161 39 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:39 + or.b64 %rd95, %rd75, %rd5; + .loc 1 161 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:32 + shl.b64 %rd96, %rd95, 1; + add.s64 %rd73, %rd6, %rd96; + .loc 1 161 55 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:55 + cvt.rn.bf16.f32 %rs32, %r191; + cvt.rn.bf16.f32 %rs33, %r192; + bar.sync 0; + st.shared.b16 [%r20], %rs32; + st.shared.b16 [%r21], %rs33; + bar.sync 0; + ld.shared.b16 %rs34, [%r22]; + ld.shared.b16 %rs35, [%r22+4]; + mov.b32 %r144, {%rs34, %rs35}; + // begin inline asm + @%p2 st.global.b32 [ %rd73 + 0 ], { %r144 }; + // end inline asm + .loc 1 162 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:32 + add.s64 %rd74, %rd7, %rd96; + .loc 1 162 56 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:56 + cvt.rn.bf16.f32 %rs36, %r225; + cvt.rn.bf16.f32 %rs37, %r226; + bar.sync 0; + st.shared.b16 [%r20], %rs36; + st.shared.b16 [%r21], %rs37; + bar.sync 0; + ld.shared.b16 %rs38, [%r22]; + ld.shared.b16 %rs39, [%r22+4]; + mov.b32 %r145, {%rs38, %rs39}; + // begin inline asm + @%p2 st.global.b32 [ %rd74 + 0 ], { %r145 }; + // end inline asm + mov.b64 %rd97, 64; + mov.pred %p5, 0; + .loc 1 53 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43 + @%p1 bra $L__BB0_1; +// %bb.2: + .loc 1 53 4 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:4 + ret; +$L__tmp24: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 456 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x1c1 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 98 +.b8 118 +.b8 113 +.b8 104 +.b8 106 +.b8 116 +.b8 121 +.b8 103 +.b8 55 +.b8 102 +.b8 118 +.b8 120 +.b8 122 +.b8 119 +.b8 116 +.b8 98 +.b8 116 +.b8 116 +.b8 52 +.b8 118 +.b8 114 +.b8 100 +.b8 107 +.b8 98 +.b8 110 +.b8 98 +.b8 54 +.b8 110 +.b8 51 +.b8 50 +.b8 102 +.b8 110 +.b8 114 +.b8 105 +.b8 106 +.b8 106 +.b8 112 +.b8 108 +.b8 51 +.b8 118 +.b8 118 +.b8 52 +.b8 99 +.b8 102 +.b8 113 +.b8 100 +.b8 52 +.b8 109 +.b8 122 +.b8 110 +.b8 114 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 98 +.b8 118 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x6d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 114 +.b8 109 +.b8 115 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 100 +.b8 100 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 110 +.b8 101 +.b8 103 +.b8 95 +.b8 115 +.b8 112 +.b8 108 +.b8 105 +.b8 116 +.b8 95 +.b8 115 +.b8 112 +.b8 108 +.b8 105 +.b8 116 +.b8 95 +.b8 119 +.b8 105 +.b8 116 +.b8 104 +.b8 95 +.b8 115 +.b8 105 +.b8 122 +.b8 101 +.b8 115 +.b8 95 +.b8 115 +.b8 116 +.b8 97 +.b8 99 +.b8 107 +.b8 95 +.b8 117 +.b8 110 +.b8 98 +.b8 105 +.b8 110 +.b8 100 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x151:0x7a DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x166:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 51 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x17e:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 4 // Abbrev [4] 0x198:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp12 // DW_AT_low_pc +.b64 $L__tmp23 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 52 // DW_AT_call_line +.b8 27 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x1b0:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp12 // DW_AT_low_pc +.b64 $L__tmp23 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source new file mode 100644 index 0000000000000000000000000000000000000000..2b9c8c254d415643c15bf01722276d94b8160239 --- /dev/null +++ b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source @@ -0,0 +1,972 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc213 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0) +#loc215 = loc(unknown) +#loc218 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0) +#loc222 = loc("in_out_ptr0"(#loc)) +#loc223 = loc("in_out_ptr1"(#loc)) +#loc224 = loc("in_ptr0"(#loc)) +#loc225 = loc("in_ptr1"(#loc)) +#loc226 = loc("in_ptr2"(#loc)) +#loc227 = loc("in_ptr3"(#loc)) +#loc228 = loc("in_ptr4"(#loc)) +#loc229 = loc("xnumel"(#loc)) +#loc230 = loc("r0_numel"(#loc)) +#loc432 = loc("input"(#loc213)) +#loc433 = loc("a"(#loc218)) +#loc434 = loc("b"(#loc218)) +module { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 73728 : i32 loc(#loc231) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc232) + %xoffset = tt.get_program_id x : i32 loc(#loc233) + %xoffset_2 = arith.constant 2 : i32 loc(#loc234) + %xoffset_3 = arith.constant 2 : i32 loc(#loc234) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc234) + %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc235) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32> -> tensor<2x1xi32> loc(#loc236) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<2x1xi32> loc(#loc237) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<2x1xi32> loc(#loc237) + %xmask = arith.constant true loc(#loc238) + %xmask_8 = arith.constant dense : tensor<2x64xi1> loc(#loc238) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc239) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc240) + %x0 = arith.constant 32 : i32 loc(#loc241) + %x0_10 = arith.constant 32 : i32 loc(#loc241) + %x0_11 = arith.constant dense<32> : tensor<2x1xi32> loc(#loc241) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<2x1xi32> loc(#loc241) + %x1 = arith.constant 32 : i32 loc(#loc242) + %x1_13 = arith.constant 32 : i32 loc(#loc242) + %x1_14 = arith.constant dense<32> : tensor<2x1xi32> loc(#loc242) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<2x1xi32> loc(#loc242) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc243) + %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc243) + %_tmp10 = arith.constant 0.000000e+00 : f32 loc(#loc244) + %_tmp10_17 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc244) + %c0_i32 = arith.constant 0 : i32 loc(#loc15) + %c64_i32 = arith.constant 64 : i32 loc(#loc15) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15) + %2 = arith.bitcast %c64_i32 : i32 to i32 loc(#loc15) + %3 = ub.poison : i32 loc(#loc15) + %_tmp10_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_16, %_tmp10_24 = %_tmp10_17) -> (tensor<2x64xf32>, tensor<2x64xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc246) + %r0_index_25 = arith.addi %r0_index, %r0_base_9 : tensor<1x64xi32> loc(#loc246) + %r0_mask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc247) + %r0_mask_26 = arith.cmpi slt, %r0_index_25, %r0_mask : tensor<1x64xi32> loc(#loc247) + %tmp0 = arith.constant 4096 : i32 loc(#loc248) + %tmp0_27 = arith.constant 4096 : i32 loc(#loc248) + %tmp0_28 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc248) + %tmp0_29 = arith.addi %tmp0_28, %r0_index_25 : tensor<1x64xi32> loc(#loc248) + %tmp0_30 = arith.constant 128 : i32 loc(#loc249) + %tmp0_31 = arith.constant 128 : i32 loc(#loc249) + %tmp0_32 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc249) + %tmp0_33 = arith.muli %tmp0_32, %x0_12 : tensor<2x1xi32> loc(#loc249) + %tmp0_34 = tt.broadcast %tmp0_29 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc250) + %tmp0_35 = tt.broadcast %tmp0_33 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc250) + %tmp0_36 = arith.addi %tmp0_34, %tmp0_35 : tensor<2x64xi32> loc(#loc250) + %tmp0_37 = arith.constant 36864 : i32 loc(#loc251) + %tmp0_38 = arith.constant 36864 : i32 loc(#loc251) + %tmp0_39 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc251) + %tmp0_40 = arith.muli %tmp0_39, %x1_15 : tensor<2x1xi32> loc(#loc251) + %tmp0_41 = tt.broadcast %tmp0_40 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc252) + %tmp0_42 = arith.addi %tmp0_36, %tmp0_41 : tensor<2x64xi32> loc(#loc252) + %tmp0_43 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x64x!tt.ptr> loc(#loc253) + %tmp0_44 = tt.addptr %tmp0_43, %tmp0_42 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc253) + %tmp0_45 = arith.constant 0.000000e+00 : f32 loc(#loc254) + %tmp0_46 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc254) + %tmp0_47 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc254) + %tmp0_48 = arith.truncf %tmp0_47 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc254) + %tmp0_49 = tt.load %tmp0_44, %tmp0_46, %tmp0_48 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc254) + %tmp0_50 = arith.extf %tmp0_49 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc255) + %tmp6 = arith.constant 128 : i32 loc(#loc256) + %tmp6_51 = arith.constant 128 : i32 loc(#loc256) + %tmp6_52 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc256) + %tmp6_53 = arith.muli %tmp6_52, %x0_12 : tensor<2x1xi32> loc(#loc256) + %tmp6_54 = tt.broadcast %r0_index_25 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc257) + %tmp6_55 = tt.broadcast %tmp6_53 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc257) + %tmp6_56 = arith.addi %tmp6_54, %tmp6_55 : tensor<2x64xi32> loc(#loc257) + %tmp6_57 = arith.constant 36864 : i32 loc(#loc258) + %tmp6_58 = arith.constant 36864 : i32 loc(#loc258) + %tmp6_59 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc258) + %tmp6_60 = arith.muli %tmp6_59, %x1_15 : tensor<2x1xi32> loc(#loc258) + %tmp6_61 = tt.broadcast %tmp6_60 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc259) + %tmp6_62 = arith.addi %tmp6_56, %tmp6_61 : tensor<2x64xi32> loc(#loc259) + %tmp6_63 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x64x!tt.ptr> loc(#loc260) + %tmp6_64 = tt.addptr %tmp6_63, %tmp6_62 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc260) + %tmp6_65 = arith.constant 0.000000e+00 : f32 loc(#loc261) + %tmp6_66 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc261) + %tmp6_67 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc261) + %tmp6_68 = arith.truncf %tmp6_67 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc261) + %tmp6_69 = tt.load %tmp6_64, %tmp6_66, %tmp6_68 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc261) + %tmp6_70 = arith.extf %tmp6_69 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc262) + %tmp2 = arith.mulf %tmp0_50, %tmp0_50 : tensor<2x64xf32> loc(#loc263) + %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<2x64xf32> loc(#loc264) + %_tmp4_71 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc265) + %_tmp4_72 = arith.select %_tmp4_71, %tmp5, %_tmp4_23 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc265) + %tmp8 = arith.mulf %tmp6_70, %tmp6_70 : tensor<2x64xf32> loc(#loc266) + %tmp11 = arith.addf %_tmp10_24, %tmp8 : tensor<2x64xf32> loc(#loc267) + %_tmp10_73 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc268) + %_tmp10_74 = arith.select %_tmp10_73, %tmp11, %_tmp10_24 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc268) + scf.yield %_tmp4_72, %_tmp10_74 : tensor<2x64xf32>, tensor<2x64xf32> loc(#loc39) + } loc(#loc435) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S2_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#0) : (tensor<2x64xf32>) -> tensor<2xf32> loc(#loc269) + %tmp4_19 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<2xf32> -> tensor<2x1xf32> loc(#loc270) + %tmp10 = tt.call @"triton.language.standard.sum__fp32S2_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#1) : (tensor<2x64xf32>) -> tensor<2xf32> loc(#loc271) + %tmp10_20 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<2xf32> -> tensor<2x1xf32> loc(#loc272) + %c0_i32_21 = arith.constant 0 : i32 loc(#loc44) + %c64_i32_22 = arith.constant 64 : i32 loc(#loc44) + %4 = arith.bitcast %c0_i32_21 : i32 to i32 loc(#loc44) + %5 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc44) + %6 = arith.bitcast %c64_i32_22 : i32 to i32 loc(#loc44) + %7 = ub.poison : i32 loc(#loc44) + scf.for %r0_offset = %4 to %5 step %6 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc273) + %r0_index_23 = arith.addi %r0_index, %r0_base_9 : tensor<1x64xi32> loc(#loc273) + %r0_mask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc274) + %r0_mask_24 = arith.cmpi slt, %r0_index_23, %r0_mask : tensor<1x64xi32> loc(#loc274) + %r0_3 = arith.constant 2 : i32 loc(#loc275) + %r0_3_25 = arith.constant 2 : i32 loc(#loc275) + %r0_3_26 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc275) + %r0_3_27 = arith.remsi %r0_index_23, %r0_3_26 : tensor<1x64xi32> loc(#loc275) + %r0_4 = arith.constant 2 : i32 loc(#loc276) + %r0_4_28 = arith.constant 2 : i32 loc(#loc276) + %r0_4_29 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc276) + %r0_4_30 = arith.divsi %r0_index_23, %r0_4_29 : tensor<1x64xi32> loc(#loc276) + %tmp50 = arith.constant 128 : i32 loc(#loc277) + %tmp50_31 = arith.constant 128 : i32 loc(#loc277) + %tmp50_32 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc277) + %tmp50_33 = arith.muli %tmp50_32, %x0_12 : tensor<2x1xi32> loc(#loc277) + %tmp50_34 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc278) + %tmp50_35 = tt.broadcast %tmp50_33 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc278) + %tmp50_36 = arith.addi %tmp50_34, %tmp50_35 : tensor<2x64xi32> loc(#loc278) + %tmp50_37 = arith.constant 36864 : i32 loc(#loc279) + %tmp50_38 = arith.constant 36864 : i32 loc(#loc279) + %tmp50_39 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc279) + %tmp50_40 = arith.muli %tmp50_39, %x1_15 : tensor<2x1xi32> loc(#loc279) + %tmp50_41 = tt.broadcast %tmp50_40 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc280) + %tmp50_42 = arith.addi %tmp50_36, %tmp50_41 : tensor<2x64xi32> loc(#loc280) + %tmp50_43 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x64x!tt.ptr> loc(#loc281) + %tmp50_44 = tt.addptr %tmp50_43, %tmp50_42 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc281) + %tmp50_45 = arith.constant 0.000000e+00 : f32 loc(#loc282) + %tmp50_46 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc282) + %tmp50_47 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc282) + %tmp50_48 = arith.truncf %tmp50_47 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc282) + %tmp50_49 = tt.load %tmp50_44, %tmp50_46, %tmp50_48 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc282) + %tmp50_50 = arith.extf %tmp50_49 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc283) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc284) + %tmp58_51 = tt.addptr %tmp58, %r0_index_23 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc284) + %tmp58_52 = arith.constant 0.000000e+00 : f32 loc(#loc285) + %tmp58_53 = arith.constant dense<0.000000e+00> : tensor<1x64xf32> loc(#loc285) + %tmp58_54 = arith.truncf %tmp58_53 : tensor<1x64xf32> to tensor<1x64xbf16> loc(#loc285) + %tmp58_55 = tt.load %tmp58_51, %r0_mask_24, %tmp58_54 evictionPolicy = evict_last : tensor<1x64x!tt.ptr> loc(#loc285) + %tmp58_56 = arith.extf %tmp58_55 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc286) + %tmp63 = arith.constant 128 : i32 loc(#loc287) + %tmp63_57 = arith.constant 128 : i32 loc(#loc287) + %tmp63_58 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc287) + %tmp63_59 = arith.muli %tmp63_58, %x1_15 : tensor<2x1xi32> loc(#loc287) + %tmp63_60 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc288) + %tmp63_61 = tt.broadcast %tmp63_59 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc288) + %tmp63_62 = arith.addi %tmp63_60, %tmp63_61 : tensor<2x64xi32> loc(#loc288) + %tmp63_63 = tt.splat %in_ptr2 : !tt.ptr -> tensor<2x64x!tt.ptr> loc(#loc289) + %tmp63_64 = tt.addptr %tmp63_63, %tmp63_62 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc289) + %tmp63_65 = arith.constant 0.000000e+00 : f32 loc(#loc290) + %tmp63_66 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc290) + %tmp63_67 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc290) + %tmp63_68 = tt.load %tmp63_64, %tmp63_66, %tmp63_67 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc290) + %tmp66 = arith.constant 128 : i32 loc(#loc291) + %tmp66_69 = arith.constant 128 : i32 loc(#loc291) + %tmp66_70 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc291) + %tmp66_71 = arith.muli %tmp66_70, %x1_15 : tensor<2x1xi32> loc(#loc291) + %tmp66_72 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc292) + %tmp66_73 = tt.broadcast %tmp66_71 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc292) + %tmp66_74 = arith.addi %tmp66_72, %tmp66_73 : tensor<2x64xi32> loc(#loc292) + %tmp66_75 = tt.splat %in_ptr3 : !tt.ptr -> tensor<2x64x!tt.ptr> loc(#loc293) + %tmp66_76 = tt.addptr %tmp66_75, %tmp66_74 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc293) + %tmp66_77 = arith.constant 0.000000e+00 : f32 loc(#loc294) + %tmp66_78 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc294) + %tmp66_79 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc294) + %tmp66_80 = tt.load %tmp66_76, %tmp66_78, %tmp66_79 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc294) + %tmp96 = arith.constant 4096 : i32 loc(#loc295) + %tmp96_81 = arith.constant 4096 : i32 loc(#loc295) + %tmp96_82 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc295) + %tmp96_83 = arith.addi %tmp96_82, %r0_index_23 : tensor<1x64xi32> loc(#loc295) + %tmp96_84 = arith.constant 128 : i32 loc(#loc296) + %tmp96_85 = arith.constant 128 : i32 loc(#loc296) + %tmp96_86 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc296) + %tmp96_87 = arith.muli %tmp96_86, %x0_12 : tensor<2x1xi32> loc(#loc296) + %tmp96_88 = tt.broadcast %tmp96_83 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc297) + %tmp96_89 = tt.broadcast %tmp96_87 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc297) + %tmp96_90 = arith.addi %tmp96_88, %tmp96_89 : tensor<2x64xi32> loc(#loc297) + %tmp96_91 = arith.constant 36864 : i32 loc(#loc298) + %tmp96_92 = arith.constant 36864 : i32 loc(#loc298) + %tmp96_93 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc298) + %tmp96_94 = arith.muli %tmp96_93, %x1_15 : tensor<2x1xi32> loc(#loc298) + %tmp96_95 = tt.broadcast %tmp96_94 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc299) + %tmp96_96 = arith.addi %tmp96_90, %tmp96_95 : tensor<2x64xi32> loc(#loc299) + %tmp96_97 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x64x!tt.ptr> loc(#loc300) + %tmp96_98 = tt.addptr %tmp96_97, %tmp96_96 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc300) + %tmp96_99 = arith.constant 0.000000e+00 : f32 loc(#loc301) + %tmp96_100 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc301) + %tmp96_101 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc301) + %tmp96_102 = arith.truncf %tmp96_101 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc301) + %tmp96_103 = tt.load %tmp96_98, %tmp96_100, %tmp96_102 evictionPolicy = evict_first : tensor<2x64x!tt.ptr> loc(#loc301) + %tmp96_104 = arith.extf %tmp96_103 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc302) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc303) + %tmp102_105 = tt.addptr %tmp102, %r0_index_23 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc303) + %tmp102_106 = arith.constant 0.000000e+00 : f32 loc(#loc304) + %tmp102_107 = arith.constant dense<0.000000e+00> : tensor<1x64xf32> loc(#loc304) + %tmp102_108 = arith.truncf %tmp102_107 : tensor<1x64xf32> to tensor<1x64xbf16> loc(#loc304) + %tmp102_109 = tt.load %tmp102_105, %r0_mask_24, %tmp102_108 evictionPolicy = evict_last : tensor<1x64x!tt.ptr> loc(#loc304) + %tmp102_110 = arith.extf %tmp102_109 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc305) + %tmp13 = arith.constant 0 : i64 loc(#loc306) + %tmp13_111 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc306) + %tmp14 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc307) + %tmp14_112 = arith.constant dense<0> : tensor<1x64xi64> loc(#loc307) + %tmp14_113 = arith.cmpi sge, %tmp14, %tmp14_112 : tensor<1x64xi64> loc(#loc307) + %tmp15 = arith.constant 1 : i64 loc(#loc308) + %tmp15_114 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc308) + %tmp16 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc309) + %tmp16_115 = arith.constant dense<1> : tensor<1x64xi64> loc(#loc309) + %tmp16_116 = arith.cmpi slt, %tmp16, %tmp16_115 : tensor<1x64xi64> loc(#loc309) + %tmp17 = arith.constant 2 : i32 loc(#loc310) + %tmp17_117 = arith.constant 2 : i32 loc(#loc310) + %tmp17_118 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc310) + %tmp17_119 = arith.muli %tmp17_118, %r0_4_30 : tensor<1x64xi32> loc(#loc310) + %tmp17_120 = arith.constant 1 : i32 loc(#loc311) + %tmp17_121 = arith.constant 1 : i32 loc(#loc311) + %tmp17_122 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc311) + %tmp17_123 = arith.addi %tmp17_122, %tmp17_119 : tensor<1x64xi32> loc(#loc311) + %tmp17_124 = arith.constant 128 : i32 loc(#loc312) + %tmp17_125 = arith.constant 128 : i32 loc(#loc312) + %tmp17_126 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc312) + %tmp17_127 = arith.muli %tmp17_126, %x0_12 : tensor<2x1xi32> loc(#loc312) + %tmp17_128 = tt.broadcast %tmp17_123 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc313) + %tmp17_129 = tt.broadcast %tmp17_127 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc313) + %tmp17_130 = arith.addi %tmp17_128, %tmp17_129 : tensor<2x64xi32> loc(#loc313) + %tmp17_131 = arith.constant 36864 : i32 loc(#loc314) + %tmp17_132 = arith.constant 36864 : i32 loc(#loc314) + %tmp17_133 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc314) + %tmp17_134 = arith.muli %tmp17_133, %x1_15 : tensor<2x1xi32> loc(#loc314) + %tmp17_135 = tt.broadcast %tmp17_134 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc315) + %tmp17_136 = arith.addi %tmp17_130, %tmp17_135 : tensor<2x64xi32> loc(#loc315) + %tmp17_137 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x64x!tt.ptr> loc(#loc316) + %tmp17_138 = tt.addptr %tmp17_137, %tmp17_136 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc316) + %tmp17_139 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc317) + %tmp17_140 = arith.constant 0.000000e+00 : f32 loc(#loc318) + %tmp17_141 = tt.broadcast %tmp17_139 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc318) + %tmp17_142 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc318) + %tmp17_143 = arith.truncf %tmp17_142 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc318) + %tmp17_144 = tt.load %tmp17_138, %tmp17_141, %tmp17_143 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc318) + %tmp17_145 = arith.extf %tmp17_144 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc319) + %tmp19 = arith.constant 1.280000e+02 : f32 loc(#loc320) + %tmp20 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc321) + %tmp20_146 = arith.divf %tmp10_20, %tmp20 : tensor<2x1xf32> loc(#loc321) + %tmp21 = arith.constant 9.99999997E-7 : f32 loc(#loc322) + %tmp22 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc323) + %tmp22_147 = arith.addf %tmp20_146, %tmp22 : tensor<2x1xf32> loc(#loc323) + %tmp23 = tt.extern_elementwise %tmp22_147 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc324) + %tmp24 = tt.broadcast %tmp23 : tensor<2x1xf32> -> tensor<2x64xf32> loc(#loc325) + %tmp24_148 = arith.mulf %tmp17_145, %tmp24 : tensor<2x64xf32> loc(#loc325) + %tmp25 = arith.constant 2 : i32 loc(#loc326) + %tmp25_149 = arith.constant 2 : i32 loc(#loc326) + %tmp25_150 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc326) + %tmp25_151 = arith.muli %tmp25_150, %r0_4_30 : tensor<1x64xi32> loc(#loc326) + %tmp25_152 = arith.constant 1 : i32 loc(#loc327) + %tmp25_153 = arith.constant 1 : i32 loc(#loc327) + %tmp25_154 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc327) + %tmp25_155 = arith.addi %tmp25_154, %tmp25_151 : tensor<1x64xi32> loc(#loc327) + %tmp25_156 = tt.broadcast %tmp25_155 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc328) + %tmp25_157 = tt.splat %in_ptr1 : !tt.ptr -> tensor<2x64x!tt.ptr> loc(#loc329) + %tmp25_158 = tt.addptr %tmp25_157, %tmp25_156 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc329) + %tmp25_159 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc330) + %tmp25_160 = arith.constant 0.000000e+00 : f32 loc(#loc331) + %tmp25_161 = tt.broadcast %tmp25_159 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc331) + %tmp25_162 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc331) + %tmp25_163 = arith.truncf %tmp25_162 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc331) + %tmp25_164 = tt.load %tmp25_158, %tmp25_161, %tmp25_163 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc331) + %tmp25_165 = arith.extf %tmp25_164 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc332) + %tmp27 = arith.mulf %tmp24_148, %tmp25_165 : tensor<2x64xf32> loc(#loc333) + %tmp29 = arith.constant 0.000000e+00 : f32 loc(#loc334) + %tmp29_166 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc334) + %tmp29_167 = arith.subf %tmp29_166, %tmp27 : tensor<2x64xf32> loc(#loc334) + %tmp30 = arith.constant 0.000000e+00 : f32 loc(#loc335) + %tmp30_168 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc335) + %tmp31 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc336) + %tmp31_169 = arith.select %tmp31, %tmp29_167, %tmp30_168 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc336) + %tmp32 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc337) + %tmp32_170 = arith.constant dense<1> : tensor<1x64xi64> loc(#loc337) + %tmp32_171 = arith.cmpi sge, %tmp32, %tmp32_170 : tensor<1x64xi64> loc(#loc337) + %tmp33 = arith.constant 2 : i64 loc(#loc338) + %tmp33_172 = arith.constant dense<2> : tensor<1x1xi64> loc(#loc338) + %tmp34 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc339) + %tmp34_173 = arith.constant dense<2> : tensor<1x64xi64> loc(#loc339) + %tmp34_174 = arith.cmpi slt, %tmp34, %tmp34_173 : tensor<1x64xi64> loc(#loc339) + %tmp35 = arith.constant 2 : i32 loc(#loc340) + %tmp35_175 = arith.constant 2 : i32 loc(#loc340) + %tmp35_176 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc340) + %tmp35_177 = arith.muli %tmp35_176, %r0_4_30 : tensor<1x64xi32> loc(#loc340) + %tmp35_178 = arith.constant 128 : i32 loc(#loc341) + %tmp35_179 = arith.constant 128 : i32 loc(#loc341) + %tmp35_180 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc341) + %tmp35_181 = arith.muli %tmp35_180, %x0_12 : tensor<2x1xi32> loc(#loc341) + %tmp35_182 = tt.broadcast %tmp35_177 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc342) + %tmp35_183 = tt.broadcast %tmp35_181 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc342) + %tmp35_184 = arith.addi %tmp35_182, %tmp35_183 : tensor<2x64xi32> loc(#loc342) + %tmp35_185 = arith.constant 36864 : i32 loc(#loc343) + %tmp35_186 = arith.constant 36864 : i32 loc(#loc343) + %tmp35_187 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc343) + %tmp35_188 = arith.muli %tmp35_187, %x1_15 : tensor<2x1xi32> loc(#loc343) + %tmp35_189 = tt.broadcast %tmp35_188 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc344) + %tmp35_190 = arith.addi %tmp35_184, %tmp35_189 : tensor<2x64xi32> loc(#loc344) + %tmp35_191 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x64x!tt.ptr> loc(#loc345) + %tmp35_192 = tt.addptr %tmp35_191, %tmp35_190 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc345) + %tmp35_193 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc346) + %tmp35_194 = arith.constant 0.000000e+00 : f32 loc(#loc347) + %tmp35_195 = tt.broadcast %tmp35_193 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc347) + %tmp35_196 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc347) + %tmp35_197 = arith.truncf %tmp35_196 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc347) + %tmp35_198 = tt.load %tmp35_192, %tmp35_195, %tmp35_197 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc347) + %tmp35_199 = arith.extf %tmp35_198 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc348) + %tmp37 = arith.constant 1.280000e+02 : f32 loc(#loc349) + %tmp38 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc350) + %tmp38_200 = arith.divf %tmp10_20, %tmp38 : tensor<2x1xf32> loc(#loc350) + %tmp39 = arith.constant 9.99999997E-7 : f32 loc(#loc351) + %tmp40 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc352) + %tmp40_201 = arith.addf %tmp38_200, %tmp40 : tensor<2x1xf32> loc(#loc352) + %tmp41 = tt.extern_elementwise %tmp40_201 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc353) + %tmp42 = tt.broadcast %tmp41 : tensor<2x1xf32> -> tensor<2x64xf32> loc(#loc354) + %tmp42_202 = arith.mulf %tmp35_199, %tmp42 : tensor<2x64xf32> loc(#loc354) + %tmp43 = arith.constant 2 : i32 loc(#loc355) + %tmp43_203 = arith.constant 2 : i32 loc(#loc355) + %tmp43_204 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc355) + %tmp43_205 = arith.muli %tmp43_204, %r0_4_30 : tensor<1x64xi32> loc(#loc355) + %tmp43_206 = tt.broadcast %tmp43_205 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc356) + %tmp43_207 = tt.splat %in_ptr1 : !tt.ptr -> tensor<2x64x!tt.ptr> loc(#loc357) + %tmp43_208 = tt.addptr %tmp43_207, %tmp43_206 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc357) + %tmp43_209 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc358) + %tmp43_210 = arith.constant 0.000000e+00 : f32 loc(#loc359) + %tmp43_211 = tt.broadcast %tmp43_209 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc359) + %tmp43_212 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc359) + %tmp43_213 = arith.truncf %tmp43_212 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc359) + %tmp43_214 = tt.load %tmp43_208, %tmp43_211, %tmp43_213 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc359) + %tmp43_215 = arith.extf %tmp43_214 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc360) + %tmp45 = arith.mulf %tmp42_202, %tmp43_215 : tensor<2x64xf32> loc(#loc361) + %tmp47 = arith.constant 0.000000e+00 : f32 loc(#loc362) + %tmp47_216 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc362) + %tmp48 = tt.broadcast %tmp32_171 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc363) + %tmp48_217 = arith.select %tmp48, %tmp45, %tmp47_216 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc363) + %tmp49 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc364) + %tmp49_218 = arith.select %tmp49, %tmp31_169, %tmp48_217 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc364) + %tmp52 = arith.constant 1.280000e+02 : f32 loc(#loc365) + %tmp53 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc366) + %tmp53_219 = arith.divf %tmp10_20, %tmp53 : tensor<2x1xf32> loc(#loc366) + %tmp54 = arith.constant 9.99999997E-7 : f32 loc(#loc367) + %tmp55 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc368) + %tmp55_220 = arith.addf %tmp53_219, %tmp55 : tensor<2x1xf32> loc(#loc368) + %tmp56 = tt.extern_elementwise %tmp55_220 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc369) + %tmp57 = tt.broadcast %tmp56 : tensor<2x1xf32> -> tensor<2x64xf32> loc(#loc370) + %tmp57_221 = arith.mulf %tmp50_50, %tmp57 : tensor<2x64xf32> loc(#loc370) + %tmp60 = tt.broadcast %tmp58_56 : tensor<1x64xf32> -> tensor<2x64xf32> loc(#loc371) + %tmp60_222 = arith.mulf %tmp57_221, %tmp60 : tensor<2x64xf32> loc(#loc371) + %tmp64 = arith.mulf %tmp60_222, %tmp63_68 : tensor<2x64xf32> loc(#loc372) + %tmp67 = arith.mulf %tmp49_218, %tmp66_80 : tensor<2x64xf32> loc(#loc373) + %tmp68 = arith.addf %tmp64, %tmp67 : tensor<2x64xf32> loc(#loc374) + %tmp70 = arith.constant 2 : i32 loc(#loc375) + %tmp70_223 = arith.constant 2 : i32 loc(#loc375) + %tmp70_224 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc375) + %tmp70_225 = arith.muli %tmp70_224, %r0_4_30 : tensor<1x64xi32> loc(#loc375) + %tmp70_226 = arith.constant 4097 : i32 loc(#loc376) + %tmp70_227 = arith.constant 4097 : i32 loc(#loc376) + %tmp70_228 = arith.constant dense<4097> : tensor<1x64xi32> loc(#loc376) + %tmp70_229 = arith.addi %tmp70_228, %tmp70_225 : tensor<1x64xi32> loc(#loc376) + %tmp70_230 = arith.constant 128 : i32 loc(#loc377) + %tmp70_231 = arith.constant 128 : i32 loc(#loc377) + %tmp70_232 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc377) + %tmp70_233 = arith.muli %tmp70_232, %x0_12 : tensor<2x1xi32> loc(#loc377) + %tmp70_234 = tt.broadcast %tmp70_229 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc378) + %tmp70_235 = tt.broadcast %tmp70_233 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc378) + %tmp70_236 = arith.addi %tmp70_234, %tmp70_235 : tensor<2x64xi32> loc(#loc378) + %tmp70_237 = arith.constant 36864 : i32 loc(#loc379) + %tmp70_238 = arith.constant 36864 : i32 loc(#loc379) + %tmp70_239 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc379) + %tmp70_240 = arith.muli %tmp70_239, %x1_15 : tensor<2x1xi32> loc(#loc379) + %tmp70_241 = tt.broadcast %tmp70_240 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc380) + %tmp70_242 = arith.addi %tmp70_236, %tmp70_241 : tensor<2x64xi32> loc(#loc380) + %tmp70_243 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x64x!tt.ptr> loc(#loc381) + %tmp70_244 = tt.addptr %tmp70_243, %tmp70_242 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc381) + %tmp70_245 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc382) + %tmp70_246 = arith.constant 0.000000e+00 : f32 loc(#loc383) + %tmp70_247 = tt.broadcast %tmp70_245 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc383) + %tmp70_248 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc383) + %tmp70_249 = arith.truncf %tmp70_248 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc383) + %tmp70_250 = tt.load %tmp70_244, %tmp70_247, %tmp70_249 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc383) + %tmp70_251 = arith.extf %tmp70_250 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc384) + %tmp72 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc385) + %tmp72_252 = arith.divf %tmp4_19, %tmp72 : tensor<2x1xf32> loc(#loc385) + %tmp73 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc386) + %tmp73_253 = arith.addf %tmp72_252, %tmp73 : tensor<2x1xf32> loc(#loc386) + %tmp74 = tt.extern_elementwise %tmp73_253 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc387) + %tmp75 = tt.broadcast %tmp74 : tensor<2x1xf32> -> tensor<2x64xf32> loc(#loc388) + %tmp75_254 = arith.mulf %tmp70_251, %tmp75 : tensor<2x64xf32> loc(#loc388) + %tmp76 = arith.constant 2 : i32 loc(#loc389) + %tmp76_255 = arith.constant 2 : i32 loc(#loc389) + %tmp76_256 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc389) + %tmp76_257 = arith.muli %tmp76_256, %r0_4_30 : tensor<1x64xi32> loc(#loc389) + %tmp76_258 = arith.constant 1 : i32 loc(#loc390) + %tmp76_259 = arith.constant 1 : i32 loc(#loc390) + %tmp76_260 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc390) + %tmp76_261 = arith.addi %tmp76_260, %tmp76_257 : tensor<1x64xi32> loc(#loc390) + %tmp76_262 = tt.broadcast %tmp76_261 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc391) + %tmp76_263 = tt.splat %in_ptr4 : !tt.ptr -> tensor<2x64x!tt.ptr> loc(#loc392) + %tmp76_264 = tt.addptr %tmp76_263, %tmp76_262 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc392) + %tmp76_265 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc393) + %tmp76_266 = arith.constant 0.000000e+00 : f32 loc(#loc394) + %tmp76_267 = tt.broadcast %tmp76_265 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc394) + %tmp76_268 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc394) + %tmp76_269 = arith.truncf %tmp76_268 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc394) + %tmp76_270 = tt.load %tmp76_264, %tmp76_267, %tmp76_269 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc394) + %tmp76_271 = arith.extf %tmp76_270 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc395) + %tmp78 = arith.mulf %tmp75_254, %tmp76_271 : tensor<2x64xf32> loc(#loc396) + %tmp80 = arith.constant 0.000000e+00 : f32 loc(#loc397) + %tmp80_272 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc397) + %tmp80_273 = arith.subf %tmp80_272, %tmp78 : tensor<2x64xf32> loc(#loc397) + %tmp81 = arith.constant 0.000000e+00 : f32 loc(#loc398) + %tmp81_274 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc398) + %tmp82 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc399) + %tmp82_275 = arith.select %tmp82, %tmp80_273, %tmp81_274 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc399) + %tmp83 = arith.constant 2 : i32 loc(#loc400) + %tmp83_276 = arith.constant 2 : i32 loc(#loc400) + %tmp83_277 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc400) + %tmp83_278 = arith.muli %tmp83_277, %r0_4_30 : tensor<1x64xi32> loc(#loc400) + %tmp83_279 = arith.constant 4096 : i32 loc(#loc401) + %tmp83_280 = arith.constant 4096 : i32 loc(#loc401) + %tmp83_281 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc401) + %tmp83_282 = arith.addi %tmp83_281, %tmp83_278 : tensor<1x64xi32> loc(#loc401) + %tmp83_283 = arith.constant 128 : i32 loc(#loc402) + %tmp83_284 = arith.constant 128 : i32 loc(#loc402) + %tmp83_285 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc402) + %tmp83_286 = arith.muli %tmp83_285, %x0_12 : tensor<2x1xi32> loc(#loc402) + %tmp83_287 = tt.broadcast %tmp83_282 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc403) + %tmp83_288 = tt.broadcast %tmp83_286 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc403) + %tmp83_289 = arith.addi %tmp83_287, %tmp83_288 : tensor<2x64xi32> loc(#loc403) + %tmp83_290 = arith.constant 36864 : i32 loc(#loc404) + %tmp83_291 = arith.constant 36864 : i32 loc(#loc404) + %tmp83_292 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc404) + %tmp83_293 = arith.muli %tmp83_292, %x1_15 : tensor<2x1xi32> loc(#loc404) + %tmp83_294 = tt.broadcast %tmp83_293 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc405) + %tmp83_295 = arith.addi %tmp83_289, %tmp83_294 : tensor<2x64xi32> loc(#loc405) + %tmp83_296 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x64x!tt.ptr> loc(#loc406) + %tmp83_297 = tt.addptr %tmp83_296, %tmp83_295 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc406) + %tmp83_298 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc407) + %tmp83_299 = arith.constant 0.000000e+00 : f32 loc(#loc408) + %tmp83_300 = tt.broadcast %tmp83_298 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc408) + %tmp83_301 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc408) + %tmp83_302 = arith.truncf %tmp83_301 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc408) + %tmp83_303 = tt.load %tmp83_297, %tmp83_300, %tmp83_302 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc408) + %tmp83_304 = arith.extf %tmp83_303 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc409) + %tmp85 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc410) + %tmp85_305 = arith.divf %tmp4_19, %tmp85 : tensor<2x1xf32> loc(#loc410) + %tmp86 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc411) + %tmp86_306 = arith.addf %tmp85_305, %tmp86 : tensor<2x1xf32> loc(#loc411) + %tmp87 = tt.extern_elementwise %tmp86_306 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc412) + %tmp88 = tt.broadcast %tmp87 : tensor<2x1xf32> -> tensor<2x64xf32> loc(#loc413) + %tmp88_307 = arith.mulf %tmp83_304, %tmp88 : tensor<2x64xf32> loc(#loc413) + %tmp89 = arith.constant 2 : i32 loc(#loc414) + %tmp89_308 = arith.constant 2 : i32 loc(#loc414) + %tmp89_309 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc414) + %tmp89_310 = arith.muli %tmp89_309, %r0_4_30 : tensor<1x64xi32> loc(#loc414) + %tmp89_311 = tt.broadcast %tmp89_310 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc415) + %tmp89_312 = tt.splat %in_ptr4 : !tt.ptr -> tensor<2x64x!tt.ptr> loc(#loc416) + %tmp89_313 = tt.addptr %tmp89_312, %tmp89_311 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc416) + %tmp89_314 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc417) + %tmp89_315 = arith.constant 0.000000e+00 : f32 loc(#loc418) + %tmp89_316 = tt.broadcast %tmp89_314 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc418) + %tmp89_317 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc418) + %tmp89_318 = arith.truncf %tmp89_317 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc418) + %tmp89_319 = tt.load %tmp89_313, %tmp89_316, %tmp89_318 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc418) + %tmp89_320 = arith.extf %tmp89_319 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc419) + %tmp91 = arith.mulf %tmp88_307, %tmp89_320 : tensor<2x64xf32> loc(#loc420) + %tmp93 = arith.constant 0.000000e+00 : f32 loc(#loc421) + %tmp93_321 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc421) + %tmp94 = tt.broadcast %tmp32_171 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc422) + %tmp94_322 = arith.select %tmp94, %tmp91, %tmp93_321 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc422) + %tmp95 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc423) + %tmp95_323 = arith.select %tmp95, %tmp82_275, %tmp94_322 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc423) + %tmp98 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc424) + %tmp98_324 = arith.divf %tmp4_19, %tmp98 : tensor<2x1xf32> loc(#loc424) + %tmp99 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc425) + %tmp99_325 = arith.addf %tmp98_324, %tmp99 : tensor<2x1xf32> loc(#loc425) + %tmp100 = tt.extern_elementwise %tmp99_325 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc426) + %tmp101 = tt.broadcast %tmp100 : tensor<2x1xf32> -> tensor<2x64xf32> loc(#loc427) + %tmp101_326 = arith.mulf %tmp96_104, %tmp101 : tensor<2x64xf32> loc(#loc427) + %tmp104 = tt.broadcast %tmp102_110 : tensor<1x64xf32> -> tensor<2x64xf32> loc(#loc428) + %tmp104_327 = arith.mulf %tmp101_326, %tmp104 : tensor<2x64xf32> loc(#loc428) + %tmp107 = arith.mulf %tmp104_327, %tmp63_68 : tensor<2x64xf32> loc(#loc429) + %tmp109 = arith.mulf %tmp95_323, %tmp66_80 : tensor<2x64xf32> loc(#loc430) + %tmp110 = arith.addf %tmp107, %tmp109 : tensor<2x64xf32> loc(#loc431) + %c128_i32 = arith.constant 128 : i32 loc(#loc204) + %c128_i32_328 = arith.constant 128 : i32 loc(#loc204) + %cst = arith.constant dense<128> : tensor<2x1xi32> loc(#loc204) + %8 = arith.muli %cst, %xindex_7 : tensor<2x1xi32> loc(#loc204) + %9 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc205) + %10 = tt.broadcast %8 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc205) + %11 = arith.addi %9, %10 : tensor<2x64xi32> loc(#loc205) + %12 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<2x64x!tt.ptr> loc(#loc206) + %13 = tt.addptr %12, %11 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc206) + %14 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc207) + %15 = arith.truncf %tmp68 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc207) + tt.store %13, %15, %14 : tensor<2x64x!tt.ptr> loc(#loc207) + %c128_i32_329 = arith.constant 128 : i32 loc(#loc208) + %c128_i32_330 = arith.constant 128 : i32 loc(#loc208) + %cst_331 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc208) + %16 = arith.muli %cst_331, %xindex_7 : tensor<2x1xi32> loc(#loc208) + %17 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc209) + %18 = tt.broadcast %16 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc209) + %19 = arith.addi %17, %18 : tensor<2x64xi32> loc(#loc209) + %20 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<2x64x!tt.ptr> loc(#loc210) + %21 = tt.addptr %20, %19 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc210) + %22 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc211) + %23 = arith.truncf %tmp110 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc211) + tt.store %21, %23, %22 : tensor<2x64x!tt.ptr> loc(#loc211) + } loc(#loc44) + tt.return loc(#loc212) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S2_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<2x64xf32> loc("input"(#loc213))) -> tensor<2xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc214) + tt.reduce.return %2 : f32 loc(#loc214) + }) : (tensor<2x64xf32>) -> tensor<2xf32> loc(#loc214) + tt.return %0 : tensor<2xf32> loc(#loc216) + ^bb1: // no predecessors + %1 = ub.poison : tensor<2xf32> loc(#loc217) + tt.return %1 : tensor<2xf32> loc(#loc217) + } loc(#loc213) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc218)), %b: f32 loc("b"(#loc218))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc219) + tt.return %0 : f32 loc(#loc220) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc221) + tt.return %1 : f32 loc(#loc221) + } loc(#loc218) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":25:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":30:43) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":32:44) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:45) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:56) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:46) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:42) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:53) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:64) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":68:35) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":69:25) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":70:35) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:52) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:63) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":74:16) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":76:16) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:57) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:55) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:63) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:95) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":85:42) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":88:35) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":89:24) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:37) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:48) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:59) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":92:16) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":93:25) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":94:16) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":95:24) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":96:32) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:53) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:59) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:91) +#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":102:42) +#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":106:16) +#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":107:25) +#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":108:16) +#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":109:24) +#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":110:32) +#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:44) +#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc149 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:55) +#loc150 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc151 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:66) +#loc152 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc153 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc154 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:81) +#loc155 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc156 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc157 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc158 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc159 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc160 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc161 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:57) +#loc162 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:55) +#loc163 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:63) +#loc164 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc165 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:95) +#loc166 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc167 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc168 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc169 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc170 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":132:42) +#loc171 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc172 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:44) +#loc173 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc174 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:55) +#loc175 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc176 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:66) +#loc177 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc178 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc179 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:81) +#loc180 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc181 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc182 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":136:24) +#loc183 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":137:24) +#loc184 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":138:32) +#loc185 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc186 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:53) +#loc187 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:59) +#loc188 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc189 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:91) +#loc190 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc191 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc192 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc193 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":144:42) +#loc194 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc195 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc196 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":148:24) +#loc197 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":149:24) +#loc198 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":150:33) +#loc199 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc200 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc201 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc202 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc203 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc204 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc205 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc206 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc207 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc208 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:43) +#loc209 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:39) +#loc210 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc211 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc212 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc214 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc216 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11) +#loc217 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4) +#loc219 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc220 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11) +#loc221 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4) +#loc231 = loc("xnumel"(#loc1)) +#loc232 = loc("r0_numel"(#loc2)) +#loc233 = loc("xoffset"(#loc3)) +#loc234 = loc("xoffset"(#loc4)) +#loc235 = loc("xindex"(#loc5)) +#loc236 = loc("xindex"(#loc6)) +#loc237 = loc("xindex"(#loc7)) +#loc238 = loc("xmask"(#loc8)) +#loc239 = loc("r0_base"(#loc9)) +#loc240 = loc("r0_base"(#loc10)) +#loc241 = loc("x0"(#loc11)) +#loc242 = loc("x1"(#loc12)) +#loc243 = loc("_tmp4"(#loc13)) +#loc244 = loc("_tmp10"(#loc14)) +#loc245 = loc("_tmp4"(#loc15)) +#loc246 = loc("r0_index"(#loc16)) +#loc247 = loc("r0_mask"(#loc17)) +#loc248 = loc("tmp0"(#loc18)) +#loc249 = loc("tmp0"(#loc19)) +#loc250 = loc("tmp0"(#loc20)) +#loc251 = loc("tmp0"(#loc21)) +#loc252 = loc("tmp0"(#loc22)) +#loc253 = loc("tmp0"(#loc23)) +#loc254 = loc("tmp0"(#loc24)) +#loc255 = loc("tmp0"(#loc25)) +#loc256 = loc("tmp6"(#loc26)) +#loc257 = loc("tmp6"(#loc27)) +#loc258 = loc("tmp6"(#loc28)) +#loc259 = loc("tmp6"(#loc29)) +#loc260 = loc("tmp6"(#loc30)) +#loc261 = loc("tmp6"(#loc31)) +#loc262 = loc("tmp6"(#loc32)) +#loc263 = loc("tmp2"(#loc33)) +#loc264 = loc("tmp5"(#loc34)) +#loc265 = loc("_tmp4"(#loc35)) +#loc266 = loc("tmp8"(#loc36)) +#loc267 = loc("tmp11"(#loc37)) +#loc268 = loc("_tmp10"(#loc38)) +#loc269 = loc("tmp4"(#loc40)) +#loc270 = loc("tmp4"(#loc41)) +#loc271 = loc("tmp10"(#loc42)) +#loc272 = loc("tmp10"(#loc43)) +#loc273 = loc("r0_index"(#loc45)) +#loc274 = loc("r0_mask"(#loc46)) +#loc275 = loc("r0_3"(#loc47)) +#loc276 = loc("r0_4"(#loc48)) +#loc277 = loc("tmp50"(#loc49)) +#loc278 = loc("tmp50"(#loc50)) +#loc279 = loc("tmp50"(#loc51)) +#loc280 = loc("tmp50"(#loc52)) +#loc281 = loc("tmp50"(#loc53)) +#loc282 = loc("tmp50"(#loc54)) +#loc283 = loc("tmp50"(#loc55)) +#loc284 = loc("tmp58"(#loc56)) +#loc285 = loc("tmp58"(#loc57)) +#loc286 = loc("tmp58"(#loc58)) +#loc287 = loc("tmp63"(#loc59)) +#loc288 = loc("tmp63"(#loc60)) +#loc289 = loc("tmp63"(#loc61)) +#loc290 = loc("tmp63"(#loc62)) +#loc291 = loc("tmp66"(#loc63)) +#loc292 = loc("tmp66"(#loc64)) +#loc293 = loc("tmp66"(#loc65)) +#loc294 = loc("tmp66"(#loc66)) +#loc295 = loc("tmp96"(#loc67)) +#loc296 = loc("tmp96"(#loc68)) +#loc297 = loc("tmp96"(#loc69)) +#loc298 = loc("tmp96"(#loc70)) +#loc299 = loc("tmp96"(#loc71)) +#loc300 = loc("tmp96"(#loc72)) +#loc301 = loc("tmp96"(#loc73)) +#loc302 = loc("tmp96"(#loc74)) +#loc303 = loc("tmp102"(#loc75)) +#loc304 = loc("tmp102"(#loc76)) +#loc305 = loc("tmp102"(#loc77)) +#loc306 = loc("tmp13"(#loc78)) +#loc307 = loc("tmp14"(#loc79)) +#loc308 = loc("tmp15"(#loc80)) +#loc309 = loc("tmp16"(#loc81)) +#loc310 = loc("tmp17"(#loc82)) +#loc311 = loc("tmp17"(#loc83)) +#loc312 = loc("tmp17"(#loc84)) +#loc313 = loc("tmp17"(#loc85)) +#loc314 = loc("tmp17"(#loc86)) +#loc315 = loc("tmp17"(#loc87)) +#loc316 = loc("tmp17"(#loc88)) +#loc317 = loc("tmp17"(#loc89)) +#loc318 = loc("tmp17"(#loc90)) +#loc319 = loc("tmp17"(#loc91)) +#loc320 = loc("tmp19"(#loc92)) +#loc321 = loc("tmp20"(#loc93)) +#loc322 = loc("tmp21"(#loc94)) +#loc323 = loc("tmp22"(#loc95)) +#loc324 = loc("tmp23"(#loc96)) +#loc325 = loc("tmp24"(#loc97)) +#loc326 = loc("tmp25"(#loc98)) +#loc327 = loc("tmp25"(#loc99)) +#loc328 = loc("tmp25"(#loc100)) +#loc329 = loc("tmp25"(#loc101)) +#loc330 = loc("tmp25"(#loc102)) +#loc331 = loc("tmp25"(#loc103)) +#loc332 = loc("tmp25"(#loc104)) +#loc333 = loc("tmp27"(#loc105)) +#loc334 = loc("tmp29"(#loc106)) +#loc335 = loc("tmp30"(#loc107)) +#loc336 = loc("tmp31"(#loc108)) +#loc337 = loc("tmp32"(#loc109)) +#loc338 = loc("tmp33"(#loc110)) +#loc339 = loc("tmp34"(#loc111)) +#loc340 = loc("tmp35"(#loc112)) +#loc341 = loc("tmp35"(#loc113)) +#loc342 = loc("tmp35"(#loc114)) +#loc343 = loc("tmp35"(#loc115)) +#loc344 = loc("tmp35"(#loc116)) +#loc345 = loc("tmp35"(#loc117)) +#loc346 = loc("tmp35"(#loc118)) +#loc347 = loc("tmp35"(#loc119)) +#loc348 = loc("tmp35"(#loc120)) +#loc349 = loc("tmp37"(#loc121)) +#loc350 = loc("tmp38"(#loc122)) +#loc351 = loc("tmp39"(#loc123)) +#loc352 = loc("tmp40"(#loc124)) +#loc353 = loc("tmp41"(#loc125)) +#loc354 = loc("tmp42"(#loc126)) +#loc355 = loc("tmp43"(#loc127)) +#loc356 = loc("tmp43"(#loc128)) +#loc357 = loc("tmp43"(#loc129)) +#loc358 = loc("tmp43"(#loc130)) +#loc359 = loc("tmp43"(#loc131)) +#loc360 = loc("tmp43"(#loc132)) +#loc361 = loc("tmp45"(#loc133)) +#loc362 = loc("tmp47"(#loc134)) +#loc363 = loc("tmp48"(#loc135)) +#loc364 = loc("tmp49"(#loc136)) +#loc365 = loc("tmp52"(#loc137)) +#loc366 = loc("tmp53"(#loc138)) +#loc367 = loc("tmp54"(#loc139)) +#loc368 = loc("tmp55"(#loc140)) +#loc369 = loc("tmp56"(#loc141)) +#loc370 = loc("tmp57"(#loc142)) +#loc371 = loc("tmp60"(#loc143)) +#loc372 = loc("tmp64"(#loc144)) +#loc373 = loc("tmp67"(#loc145)) +#loc374 = loc("tmp68"(#loc146)) +#loc375 = loc("tmp70"(#loc147)) +#loc376 = loc("tmp70"(#loc148)) +#loc377 = loc("tmp70"(#loc149)) +#loc378 = loc("tmp70"(#loc150)) +#loc379 = loc("tmp70"(#loc151)) +#loc380 = loc("tmp70"(#loc152)) +#loc381 = loc("tmp70"(#loc153)) +#loc382 = loc("tmp70"(#loc154)) +#loc383 = loc("tmp70"(#loc155)) +#loc384 = loc("tmp70"(#loc156)) +#loc385 = loc("tmp72"(#loc157)) +#loc386 = loc("tmp73"(#loc158)) +#loc387 = loc("tmp74"(#loc159)) +#loc388 = loc("tmp75"(#loc160)) +#loc389 = loc("tmp76"(#loc161)) +#loc390 = loc("tmp76"(#loc162)) +#loc391 = loc("tmp76"(#loc163)) +#loc392 = loc("tmp76"(#loc164)) +#loc393 = loc("tmp76"(#loc165)) +#loc394 = loc("tmp76"(#loc166)) +#loc395 = loc("tmp76"(#loc167)) +#loc396 = loc("tmp78"(#loc168)) +#loc397 = loc("tmp80"(#loc169)) +#loc398 = loc("tmp81"(#loc170)) +#loc399 = loc("tmp82"(#loc171)) +#loc400 = loc("tmp83"(#loc172)) +#loc401 = loc("tmp83"(#loc173)) +#loc402 = loc("tmp83"(#loc174)) +#loc403 = loc("tmp83"(#loc175)) +#loc404 = loc("tmp83"(#loc176)) +#loc405 = loc("tmp83"(#loc177)) +#loc406 = loc("tmp83"(#loc178)) +#loc407 = loc("tmp83"(#loc179)) +#loc408 = loc("tmp83"(#loc180)) +#loc409 = loc("tmp83"(#loc181)) +#loc410 = loc("tmp85"(#loc182)) +#loc411 = loc("tmp86"(#loc183)) +#loc412 = loc("tmp87"(#loc184)) +#loc413 = loc("tmp88"(#loc185)) +#loc414 = loc("tmp89"(#loc186)) +#loc415 = loc("tmp89"(#loc187)) +#loc416 = loc("tmp89"(#loc188)) +#loc417 = loc("tmp89"(#loc189)) +#loc418 = loc("tmp89"(#loc190)) +#loc419 = loc("tmp89"(#loc191)) +#loc420 = loc("tmp91"(#loc192)) +#loc421 = loc("tmp93"(#loc193)) +#loc422 = loc("tmp94"(#loc194)) +#loc423 = loc("tmp95"(#loc195)) +#loc424 = loc("tmp98"(#loc196)) +#loc425 = loc("tmp99"(#loc197)) +#loc426 = loc("tmp100"(#loc198)) +#loc427 = loc("tmp101"(#loc199)) +#loc428 = loc("tmp104"(#loc200)) +#loc429 = loc("tmp107"(#loc201)) +#loc430 = loc("tmp109"(#loc202)) +#loc431 = loc("tmp110"(#loc203)) +#loc435 = loc("_tmp10"(#loc245)) diff --git a/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..2e4ed4d50da7439900f325d63a2505365f09cf0c --- /dev/null +++ b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir @@ -0,0 +1,557 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [1, 2], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0]}> +#blocked2 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [1, 0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc1 = loc(unknown) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc147 = loc("in_out_ptr0"(#loc)) +#loc148 = loc("in_out_ptr1"(#loc)) +#loc149 = loc("in_ptr0"(#loc)) +#loc150 = loc("in_ptr1"(#loc)) +#loc151 = loc("in_ptr2"(#loc)) +#loc152 = loc("in_ptr3"(#loc)) +#loc153 = loc("in_ptr4"(#loc)) +#loc154 = loc("xnumel"(#loc)) +#loc155 = loc("r0_numel"(#loc)) +#loc185 = loc("tmp4"(#loc33)) +#loc187 = loc("tmp10"(#loc36)) +#loc292 = loc(callsite(#loc1 at #loc185)) +#loc294 = loc(callsite(#loc1 at #loc187)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4097> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<1> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1x64xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<2> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<36864> : tensor<2x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<36864> : tensor<2x1xi32, #blocked1> loc(#loc1) + %cst_5 = arith.constant dense<128> : tensor<2x1xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<128> : tensor<2x1xi32, #blocked1> loc(#loc1) + %cst_7 = arith.constant dense<4096> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_8 = arith.constant dense<4096> : tensor<1x64xi32, #blocked1> loc(#loc1) + %cst_9 = arith.constant dense<128> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_10 = arith.constant dense<128> : tensor<1x64xi32, #blocked1> loc(#loc1) + %cst_11 = arith.constant dense<32> : tensor<2x1xi32, #blocked> loc(#loc1) + %cst_12 = arith.constant dense<32> : tensor<2x1xi32, #blocked1> loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %cst_13 = arith.constant dense<0.000000e+00> : tensor<2x64xbf16, #blocked1> loc(#loc1) + %cst_14 = arith.constant dense<0.000000e+00> : tensor<2x64xbf16, #blocked> loc(#loc1) + %cst_15 = arith.constant dense<128> : tensor<1x64xi32, #blocked2> loc(#loc1) + %cst_16 = arith.constant dense<0.000000e+00> : tensor<1x64xbf16, #blocked2> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_17 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32, #blocked1> loc(#loc1) + %cst_18 = arith.constant dense<1.280000e+02> : tensor<2x1xf32, #blocked1> loc(#loc1) + %cst_19 = arith.constant dense<0.000000e+00> : tensor<2x64xf32, #blocked> loc(#loc1) + %cst_20 = arith.constant dense<0.000000e+00> : tensor<2x64xf32, #blocked1> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc156) + %xoffset_21 = arith.muli %xoffset, %c2_i32 : i32 loc(#loc157) + %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc158) + %xindex_22 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc158) + %xindex_23 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<2x1xi32, #blocked1> loc(#loc158) + %xindex_24 = tt.expand_dims %xindex_22 {axis = 1 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<2x1xi32, #blocked> loc(#loc158) + %xindex_25 = tt.splat %xoffset_21 : i32 -> tensor<2x1xi32, #blocked1> loc(#loc159) + %xindex_26 = tt.splat %xoffset_21 : i32 -> tensor<2x1xi32, #blocked> loc(#loc159) + %xindex_27 = arith.addi %xindex_25, %xindex_23 : tensor<2x1xi32, #blocked1> loc(#loc159) + %xindex_28 = arith.addi %xindex_26, %xindex_24 : tensor<2x1xi32, #blocked> loc(#loc159) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc160) + %r0_base_29 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc160) + %r0_base_30 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> loc(#loc160) + %r0_base_31 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc160) + %r0_base_32 = tt.expand_dims %r0_base_29 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> loc(#loc160) + %r0_base_33 = tt.expand_dims %r0_base_30 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x64xi32, #blocked2> loc(#loc160) + %x0 = arith.remsi %xindex_27, %cst_12 : tensor<2x1xi32, #blocked1> loc(#loc161) + %x0_34 = arith.remsi %xindex_28, %cst_11 : tensor<2x1xi32, #blocked> loc(#loc161) + %x1 = arith.divsi %xindex_27, %cst_12 : tensor<2x1xi32, #blocked1> loc(#loc162) + %x1_35 = arith.divsi %xindex_28, %cst_11 : tensor<2x1xi32, #blocked> loc(#loc162) + %tmp0 = arith.muli %x0, %cst_6 : tensor<2x1xi32, #blocked1> loc(#loc163) + %tmp0_36 = tt.broadcast %tmp0 : tensor<2x1xi32, #blocked1> -> tensor<2x64xi32, #blocked1> loc(#loc164) + %tmp0_37 = arith.muli %x1, %cst_4 : tensor<2x1xi32, #blocked1> loc(#loc165) + %tmp0_38 = tt.broadcast %tmp0_37 : tensor<2x1xi32, #blocked1> -> tensor<2x64xi32, #blocked1> loc(#loc166) + %tmp0_39 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x64x!tt.ptr, #blocked1> loc(#loc167) + %_tmp10:2 = scf.for %_tmp10_54 = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%arg10 = %cst_20, %arg11 = %cst_20) -> (tensor<2x64xf32, #blocked1>, tensor<2x64xf32, #blocked1>) : i32 { + %r0_index = tt.splat %_tmp10_54 : i32 -> tensor<1x64xi32, #blocked1> loc(#loc169) + %r0_index_55 = arith.addi %r0_index, %r0_base_31 : tensor<1x64xi32, #blocked1> loc(#loc169) + %r0_mask = arith.cmpi slt, %r0_index_55, %cst_10 : tensor<1x64xi32, #blocked1> loc(#loc170) + %tmp0_56 = arith.addi %r0_index_55, %cst_8 : tensor<1x64xi32, #blocked1> loc(#loc171) + %tmp0_57 = tt.broadcast %tmp0_56 : tensor<1x64xi32, #blocked1> -> tensor<2x64xi32, #blocked1> loc(#loc164) + %tmp0_58 = arith.addi %tmp0_57, %tmp0_36 : tensor<2x64xi32, #blocked1> loc(#loc164) + %tmp0_59 = arith.addi %tmp0_58, %tmp0_38 : tensor<2x64xi32, #blocked1> loc(#loc166) + %tmp0_60 = tt.addptr %tmp0_39, %tmp0_59 : tensor<2x64x!tt.ptr, #blocked1>, tensor<2x64xi32, #blocked1> loc(#loc167) + %tmp0_61 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked1> -> tensor<2x64xi1, #blocked1> loc(#loc172) + %tmp0_62 = tt.load %tmp0_60, %tmp0_61, %cst_13 evictionPolicy = evict_last : tensor<2x64x!tt.ptr, #blocked1> loc(#loc172) + %tmp0_63 = arith.extf %tmp0_62 : tensor<2x64xbf16, #blocked1> to tensor<2x64xf32, #blocked1> loc(#loc173) + %tmp6 = tt.broadcast %r0_index_55 : tensor<1x64xi32, #blocked1> -> tensor<2x64xi32, #blocked1> loc(#loc174) + %tmp6_64 = arith.addi %tmp6, %tmp0_36 : tensor<2x64xi32, #blocked1> loc(#loc174) + %tmp6_65 = arith.addi %tmp6_64, %tmp0_38 : tensor<2x64xi32, #blocked1> loc(#loc175) + %tmp6_66 = tt.addptr %tmp0_39, %tmp6_65 : tensor<2x64x!tt.ptr, #blocked1>, tensor<2x64xi32, #blocked1> loc(#loc176) + %tmp6_67 = tt.load %tmp6_66, %tmp0_61, %cst_13 evictionPolicy = evict_last : tensor<2x64x!tt.ptr, #blocked1> loc(#loc177) + %tmp6_68 = arith.extf %tmp6_67 : tensor<2x64xbf16, #blocked1> to tensor<2x64xf32, #blocked1> loc(#loc178) + %tmp2 = arith.mulf %tmp0_63, %tmp0_63 : tensor<2x64xf32, #blocked1> loc(#loc179) + %tmp5 = arith.addf %arg10, %tmp2 : tensor<2x64xf32, #blocked1> loc(#loc180) + %_tmp4 = arith.select %tmp0_61, %tmp5, %arg10 : tensor<2x64xi1, #blocked1>, tensor<2x64xf32, #blocked1> loc(#loc181) + %tmp8 = arith.mulf %tmp6_68, %tmp6_68 : tensor<2x64xf32, #blocked1> loc(#loc182) + %tmp11 = arith.addf %arg11, %tmp8 : tensor<2x64xf32, #blocked1> loc(#loc183) + %_tmp10_69 = arith.select %tmp0_61, %tmp11, %arg11 : tensor<2x64xi1, #blocked1>, tensor<2x64xf32, #blocked1> loc(#loc184) + scf.yield %_tmp4, %_tmp10_69 : tensor<2x64xf32, #blocked1>, tensor<2x64xf32, #blocked1> loc(#loc31) + } loc(#loc290) + %tmp4 = "tt.reduce"(%_tmp10#0) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_54: f32 loc(callsite(#loc1 at #loc185)), %tmp4_55: f32 loc(callsite(#loc1 at #loc185))): + %tmp4_56 = arith.addf %tmp4_54, %tmp4_55 : f32 loc(#loc297) + tt.reduce.return %tmp4_56 : f32 loc(#loc291) + }) : (tensor<2x64xf32, #blocked1>) -> tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc291) + %tmp4_40 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<2x1xf32, #blocked1> loc(#loc186) + %tmp10 = "tt.reduce"(%_tmp10#1) <{axis = 1 : i32}> ({ + ^bb0(%tmp10_54: f32 loc(callsite(#loc1 at #loc187)), %tmp10_55: f32 loc(callsite(#loc1 at #loc187))): + %tmp10_56 = arith.addf %tmp10_54, %tmp10_55 : f32 loc(#loc298) + tt.reduce.return %tmp10_56 : f32 loc(#loc293) + }) : (tensor<2x64xf32, #blocked1>) -> tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc293) + %tmp10_41 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<2x1xf32, #blocked1> loc(#loc188) + %tmp50 = arith.muli %x0_34, %cst_5 : tensor<2x1xi32, #blocked> loc(#loc189) + %tmp50_42 = tt.broadcast %tmp50 : tensor<2x1xi32, #blocked> -> tensor<2x64xi32, #blocked> loc(#loc190) + %tmp50_43 = arith.muli %x1_35, %cst_3 : tensor<2x1xi32, #blocked> loc(#loc191) + %tmp50_44 = tt.broadcast %tmp50_43 : tensor<2x1xi32, #blocked> -> tensor<2x64xi32, #blocked> loc(#loc192) + %tmp50_45 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x64x!tt.ptr, #blocked> loc(#loc193) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked> loc(#loc194) + %tmp58_46 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked2> loc(#loc194) + %tmp63 = arith.muli %x1, %cst_6 : tensor<2x1xi32, #blocked1> loc(#loc195) + %tmp63_47 = tt.broadcast %tmp63 : tensor<2x1xi32, #blocked1> -> tensor<2x64xi32, #blocked1> loc(#loc196) + %tmp63_48 = tt.splat %in_ptr2 : !tt.ptr -> tensor<2x64x!tt.ptr, #blocked1> loc(#loc197) + %tmp66 = tt.splat %in_ptr3 : !tt.ptr -> tensor<2x64x!tt.ptr, #blocked1> loc(#loc198) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked> loc(#loc199) + %tmp102_49 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked2> loc(#loc199) + %tmp20 = arith.divf %tmp10_41, %cst_18 : tensor<2x1xf32, #blocked1> loc(#loc200) + %tmp22 = arith.addf %tmp20, %cst_17 : tensor<2x1xf32, #blocked1> loc(#loc201) + %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32, #blocked1>) -> tensor<2x1xf32, #blocked1> loc(#loc202) + %tmp24 = ttg.convert_layout %tmp23 : tensor<2x1xf32, #blocked1> -> tensor<2x1xf32, #blocked> loc(#loc203) + %tmp24_50 = tt.broadcast %tmp24 : tensor<2x1xf32, #blocked> -> tensor<2x64xf32, #blocked> loc(#loc203) + %tmp24_51 = tt.broadcast %tmp23 : tensor<2x1xf32, #blocked1> -> tensor<2x64xf32, #blocked1> loc(#loc203) + %tmp72 = arith.divf %tmp4_40, %cst_18 : tensor<2x1xf32, #blocked1> loc(#loc204) + %tmp73 = arith.addf %tmp72, %cst_17 : tensor<2x1xf32, #blocked1> loc(#loc205) + %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32, #blocked1>) -> tensor<2x1xf32, #blocked1> loc(#loc206) + %tmp75 = ttg.convert_layout %tmp74 : tensor<2x1xf32, #blocked1> -> tensor<2x1xf32, #blocked> loc(#loc207) + %tmp75_52 = tt.broadcast %tmp75 : tensor<2x1xf32, #blocked> -> tensor<2x64xf32, #blocked> loc(#loc207) + %tmp75_53 = tt.broadcast %tmp74 : tensor<2x1xf32, #blocked1> -> tensor<2x64xf32, #blocked1> loc(#loc207) + %0 = arith.muli %xindex_27, %cst_6 : tensor<2x1xi32, #blocked1> loc(#loc57) + %1 = tt.broadcast %0 : tensor<2x1xi32, #blocked1> -> tensor<2x64xi32, #blocked1> loc(#loc58) + %2 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<2x64x!tt.ptr, #blocked1> loc(#loc59) + %3 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<2x64x!tt.ptr, #blocked1> loc(#loc60) + scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32, #blocked1> loc(#loc208) + %r0_index_54 = tt.splat %r0_offset : i32 -> tensor<1x64xi32, #blocked> loc(#loc208) + %r0_index_55 = tt.splat %r0_offset : i32 -> tensor<1x64xi32, #blocked2> loc(#loc208) + %r0_index_56 = arith.addi %r0_index, %r0_base_31 : tensor<1x64xi32, #blocked1> loc(#loc208) + %r0_index_57 = arith.addi %r0_index_54, %r0_base_32 : tensor<1x64xi32, #blocked> loc(#loc208) + %r0_index_58 = arith.addi %r0_index_55, %r0_base_33 : tensor<1x64xi32, #blocked2> loc(#loc208) + %r0_mask = arith.cmpi slt, %r0_index_56, %cst_10 : tensor<1x64xi32, #blocked1> loc(#loc209) + %r0_mask_59 = arith.cmpi slt, %r0_index_57, %cst_9 : tensor<1x64xi32, #blocked> loc(#loc209) + %r0_mask_60 = arith.cmpi slt, %r0_index_58, %cst_15 : tensor<1x64xi32, #blocked2> loc(#loc209) + %r0_3 = arith.remsi %r0_index_57, %cst_2 : tensor<1x64xi32, #blocked> loc(#loc210) + %r0_4 = arith.divsi %r0_index_57, %cst_2 : tensor<1x64xi32, #blocked> loc(#loc211) + %tmp50_61 = tt.broadcast %r0_index_56 : tensor<1x64xi32, #blocked1> -> tensor<2x64xi32, #blocked1> loc(#loc190) + %tmp50_62 = arith.addi %tmp50_61, %tmp0_36 : tensor<2x64xi32, #blocked1> loc(#loc190) + %tmp50_63 = arith.addi %tmp50_62, %tmp0_38 : tensor<2x64xi32, #blocked1> loc(#loc192) + %tmp50_64 = tt.addptr %tmp0_39, %tmp50_63 : tensor<2x64x!tt.ptr, #blocked1>, tensor<2x64xi32, #blocked1> loc(#loc193) + %tmp50_65 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked1> -> tensor<2x64xi1, #blocked1> loc(#loc212) + %tmp50_66 = tt.load %tmp50_64, %tmp50_65, %cst_13 evictionPolicy = evict_last : tensor<2x64x!tt.ptr, #blocked1> loc(#loc212) + %tmp50_67 = arith.extf %tmp50_66 : tensor<2x64xbf16, #blocked1> to tensor<2x64xf32, #blocked1> loc(#loc213) + %tmp58_68 = tt.addptr %tmp58_46, %r0_index_58 : tensor<1x64x!tt.ptr, #blocked2>, tensor<1x64xi32, #blocked2> loc(#loc194) + %tmp58_69 = tt.load %tmp58_68, %r0_mask_60, %cst_16 evictionPolicy = evict_last : tensor<1x64x!tt.ptr, #blocked2> loc(#loc214) + %tmp58_70 = arith.extf %tmp58_69 : tensor<1x64xbf16, #blocked2> to tensor<1x64xf32, #blocked2> loc(#loc215) + %tmp63_71 = arith.addi %tmp50_61, %tmp63_47 : tensor<2x64xi32, #blocked1> loc(#loc196) + %tmp63_72 = tt.addptr %tmp63_48, %tmp63_71 : tensor<2x64x!tt.ptr, #blocked1>, tensor<2x64xi32, #blocked1> loc(#loc197) + %tmp63_73 = tt.load %tmp63_72, %tmp50_65, %cst_20 evictionPolicy = evict_last : tensor<2x64x!tt.ptr, #blocked1> loc(#loc216) + %tmp63_74 = ttg.convert_layout %tmp63_73 : tensor<2x64xf32, #blocked1> -> tensor<2x64xf32, #blocked> loc(#loc216) + %tmp66_75 = tt.addptr %tmp66, %tmp63_71 : tensor<2x64x!tt.ptr, #blocked1>, tensor<2x64xi32, #blocked1> loc(#loc198) + %tmp66_76 = tt.load %tmp66_75, %tmp50_65, %cst_20 evictionPolicy = evict_last : tensor<2x64x!tt.ptr, #blocked1> loc(#loc217) + %tmp66_77 = ttg.convert_layout %tmp66_76 : tensor<2x64xf32, #blocked1> -> tensor<2x64xf32, #blocked> loc(#loc217) + %tmp96 = arith.addi %r0_index_56, %cst_8 : tensor<1x64xi32, #blocked1> loc(#loc218) + %tmp96_78 = tt.broadcast %tmp96 : tensor<1x64xi32, #blocked1> -> tensor<2x64xi32, #blocked1> loc(#loc219) + %tmp96_79 = arith.addi %tmp96_78, %tmp0_36 : tensor<2x64xi32, #blocked1> loc(#loc219) + %tmp96_80 = arith.addi %tmp96_79, %tmp0_38 : tensor<2x64xi32, #blocked1> loc(#loc220) + %tmp96_81 = tt.addptr %tmp0_39, %tmp96_80 : tensor<2x64x!tt.ptr, #blocked1>, tensor<2x64xi32, #blocked1> loc(#loc221) + %tmp96_82 = tt.load %tmp96_81, %tmp50_65, %cst_13 evictionPolicy = evict_first : tensor<2x64x!tt.ptr, #blocked1> loc(#loc222) + %tmp96_83 = arith.extf %tmp96_82 : tensor<2x64xbf16, #blocked1> to tensor<2x64xf32, #blocked1> loc(#loc223) + %tmp102_84 = tt.addptr %tmp102_49, %r0_index_58 : tensor<1x64x!tt.ptr, #blocked2>, tensor<1x64xi32, #blocked2> loc(#loc199) + %tmp102_85 = tt.load %tmp102_84, %r0_mask_60, %cst_16 evictionPolicy = evict_last : tensor<1x64x!tt.ptr, #blocked2> loc(#loc224) + %tmp102_86 = arith.extf %tmp102_85 : tensor<1x64xbf16, #blocked2> to tensor<1x64xf32, #blocked2> loc(#loc225) + %tmp16 = arith.extsi %r0_3 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked> loc(#loc226) + %tmp16_87 = arith.cmpi slt, %tmp16, %cst_1 : tensor<1x64xi64, #blocked> loc(#loc226) + %tmp17 = arith.muli %r0_4, %cst_2 : tensor<1x64xi32, #blocked> loc(#loc227) + %tmp17_88 = arith.addi %tmp17, %cst_0 : tensor<1x64xi32, #blocked> loc(#loc228) + %tmp17_89 = tt.broadcast %tmp17_88 : tensor<1x64xi32, #blocked> -> tensor<2x64xi32, #blocked> loc(#loc229) + %tmp17_90 = arith.addi %tmp17_89, %tmp50_42 : tensor<2x64xi32, #blocked> loc(#loc229) + %tmp17_91 = arith.addi %tmp17_90, %tmp50_44 : tensor<2x64xi32, #blocked> loc(#loc230) + %tmp17_92 = tt.addptr %tmp50_45, %tmp17_91 : tensor<2x64x!tt.ptr, #blocked>, tensor<2x64xi32, #blocked> loc(#loc231) + %tmp17_93 = arith.andi %r0_mask_59, %tmp16_87 : tensor<1x64xi1, #blocked> loc(#loc232) + %tmp17_94 = tt.broadcast %tmp17_93 : tensor<1x64xi1, #blocked> -> tensor<2x64xi1, #blocked> loc(#loc233) + %tmp17_95 = tt.load %tmp17_92, %tmp17_94, %cst_14 evictionPolicy = evict_last : tensor<2x64x!tt.ptr, #blocked> loc(#loc233) + %tmp17_96 = arith.extf %tmp17_95 : tensor<2x64xbf16, #blocked> to tensor<2x64xf32, #blocked> loc(#loc234) + %tmp24_97 = arith.mulf %tmp17_96, %tmp24_50 : tensor<2x64xf32, #blocked> loc(#loc203) + %tmp25 = tt.addptr %tmp58, %tmp17_88 : tensor<1x64x!tt.ptr, #blocked>, tensor<1x64xi32, #blocked> loc(#loc235) + %tmp25_98 = tt.broadcast %tmp25 : tensor<1x64x!tt.ptr, #blocked> -> tensor<2x64x!tt.ptr, #blocked> loc(#loc235) + %tmp25_99 = tt.load %tmp25_98, %tmp17_94, %cst_14 evictionPolicy = evict_last : tensor<2x64x!tt.ptr, #blocked> loc(#loc236) + %tmp25_100 = arith.extf %tmp25_99 : tensor<2x64xbf16, #blocked> to tensor<2x64xf32, #blocked> loc(#loc237) + %tmp27 = arith.mulf %tmp24_97, %tmp25_100 : tensor<2x64xf32, #blocked> loc(#loc238) + %tmp29 = arith.subf %cst_19, %tmp27 : tensor<2x64xf32, #blocked> loc(#loc239) + %tmp31 = tt.broadcast %tmp16_87 : tensor<1x64xi1, #blocked> -> tensor<2x64xi1, #blocked> loc(#loc240) + %tmp32 = arith.cmpi sge, %tmp16, %cst_1 : tensor<1x64xi64, #blocked> loc(#loc241) + %tmp35 = tt.broadcast %tmp17 : tensor<1x64xi32, #blocked> -> tensor<2x64xi32, #blocked> loc(#loc242) + %tmp35_101 = arith.addi %tmp35, %tmp50_42 : tensor<2x64xi32, #blocked> loc(#loc242) + %tmp35_102 = arith.addi %tmp35_101, %tmp50_44 : tensor<2x64xi32, #blocked> loc(#loc243) + %tmp35_103 = tt.addptr %tmp50_45, %tmp35_102 : tensor<2x64x!tt.ptr, #blocked>, tensor<2x64xi32, #blocked> loc(#loc244) + %tmp35_104 = arith.andi %r0_mask_59, %tmp32 : tensor<1x64xi1, #blocked> loc(#loc245) + %tmp35_105 = tt.broadcast %tmp35_104 : tensor<1x64xi1, #blocked> -> tensor<2x64xi1, #blocked> loc(#loc246) + %tmp35_106 = tt.load %tmp35_103, %tmp35_105, %cst_14 evictionPolicy = evict_last : tensor<2x64x!tt.ptr, #blocked> loc(#loc246) + %tmp35_107 = arith.extf %tmp35_106 : tensor<2x64xbf16, #blocked> to tensor<2x64xf32, #blocked> loc(#loc247) + %tmp42 = arith.mulf %tmp35_107, %tmp24_50 : tensor<2x64xf32, #blocked> loc(#loc248) + %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x64x!tt.ptr, #blocked>, tensor<1x64xi32, #blocked> loc(#loc249) + %tmp43_108 = tt.broadcast %tmp43 : tensor<1x64x!tt.ptr, #blocked> -> tensor<2x64x!tt.ptr, #blocked> loc(#loc249) + %tmp43_109 = tt.load %tmp43_108, %tmp35_105, %cst_14 evictionPolicy = evict_last : tensor<2x64x!tt.ptr, #blocked> loc(#loc250) + %tmp43_110 = arith.extf %tmp43_109 : tensor<2x64xbf16, #blocked> to tensor<2x64xf32, #blocked> loc(#loc251) + %tmp45 = arith.mulf %tmp42, %tmp43_110 : tensor<2x64xf32, #blocked> loc(#loc252) + %tmp48 = tt.broadcast %tmp32 : tensor<1x64xi1, #blocked> -> tensor<2x64xi1, #blocked> loc(#loc253) + %tmp48_111 = arith.select %tmp48, %tmp45, %cst_19 : tensor<2x64xi1, #blocked>, tensor<2x64xf32, #blocked> loc(#loc253) + %tmp49 = arith.select %tmp31, %tmp29, %tmp48_111 : tensor<2x64xi1, #blocked>, tensor<2x64xf32, #blocked> loc(#loc295) + %tmp57 = arith.mulf %tmp50_67, %tmp24_51 : tensor<2x64xf32, #blocked1> loc(#loc255) + %tmp60 = ttg.convert_layout %tmp58_70 : tensor<1x64xf32, #blocked2> -> tensor<1x64xf32, #blocked1> loc(#loc256) + %tmp60_112 = tt.broadcast %tmp60 : tensor<1x64xf32, #blocked1> -> tensor<2x64xf32, #blocked1> loc(#loc256) + %tmp60_113 = arith.mulf %tmp57, %tmp60_112 : tensor<2x64xf32, #blocked1> loc(#loc256) + %tmp64 = arith.mulf %tmp60_113, %tmp63_73 : tensor<2x64xf32, #blocked1> loc(#loc257) + %tmp64_114 = ttg.convert_layout %tmp64 : tensor<2x64xf32, #blocked1> -> tensor<2x64xf32, #blocked> loc(#loc257) + %tmp67 = arith.mulf %tmp49, %tmp66_77 : tensor<2x64xf32, #blocked> loc(#loc258) + %tmp68 = arith.addf %tmp64_114, %tmp67 : tensor<2x64xf32, #blocked> loc(#loc259) + %tmp70 = arith.addi %tmp17, %cst : tensor<1x64xi32, #blocked> loc(#loc260) + %tmp70_115 = tt.broadcast %tmp70 : tensor<1x64xi32, #blocked> -> tensor<2x64xi32, #blocked> loc(#loc261) + %tmp70_116 = arith.addi %tmp70_115, %tmp50_42 : tensor<2x64xi32, #blocked> loc(#loc261) + %tmp70_117 = arith.addi %tmp70_116, %tmp50_44 : tensor<2x64xi32, #blocked> loc(#loc262) + %tmp70_118 = tt.addptr %tmp50_45, %tmp70_117 : tensor<2x64x!tt.ptr, #blocked>, tensor<2x64xi32, #blocked> loc(#loc263) + %tmp70_119 = tt.load %tmp70_118, %tmp17_94, %cst_14 evictionPolicy = evict_last : tensor<2x64x!tt.ptr, #blocked> loc(#loc264) + %tmp70_120 = arith.extf %tmp70_119 : tensor<2x64xbf16, #blocked> to tensor<2x64xf32, #blocked> loc(#loc265) + %tmp75_121 = arith.mulf %tmp70_120, %tmp75_52 : tensor<2x64xf32, #blocked> loc(#loc207) + %tmp76 = tt.addptr %tmp102, %tmp17_88 : tensor<1x64x!tt.ptr, #blocked>, tensor<1x64xi32, #blocked> loc(#loc266) + %tmp76_122 = tt.broadcast %tmp76 : tensor<1x64x!tt.ptr, #blocked> -> tensor<2x64x!tt.ptr, #blocked> loc(#loc266) + %tmp76_123 = tt.load %tmp76_122, %tmp17_94, %cst_14 evictionPolicy = evict_last : tensor<2x64x!tt.ptr, #blocked> loc(#loc267) + %tmp76_124 = arith.extf %tmp76_123 : tensor<2x64xbf16, #blocked> to tensor<2x64xf32, #blocked> loc(#loc268) + %tmp78 = arith.mulf %tmp75_121, %tmp76_124 : tensor<2x64xf32, #blocked> loc(#loc269) + %tmp80 = arith.subf %cst_19, %tmp78 : tensor<2x64xf32, #blocked> loc(#loc270) + %tmp83 = arith.addi %tmp17, %cst_7 : tensor<1x64xi32, #blocked> loc(#loc271) + %tmp83_125 = tt.broadcast %tmp83 : tensor<1x64xi32, #blocked> -> tensor<2x64xi32, #blocked> loc(#loc272) + %tmp83_126 = arith.addi %tmp83_125, %tmp50_42 : tensor<2x64xi32, #blocked> loc(#loc272) + %tmp83_127 = arith.addi %tmp83_126, %tmp50_44 : tensor<2x64xi32, #blocked> loc(#loc273) + %tmp83_128 = tt.addptr %tmp50_45, %tmp83_127 : tensor<2x64x!tt.ptr, #blocked>, tensor<2x64xi32, #blocked> loc(#loc274) + %tmp83_129 = tt.load %tmp83_128, %tmp35_105, %cst_14 evictionPolicy = evict_last : tensor<2x64x!tt.ptr, #blocked> loc(#loc275) + %tmp83_130 = arith.extf %tmp83_129 : tensor<2x64xbf16, #blocked> to tensor<2x64xf32, #blocked> loc(#loc276) + %tmp88 = arith.mulf %tmp83_130, %tmp75_52 : tensor<2x64xf32, #blocked> loc(#loc277) + %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x64x!tt.ptr, #blocked>, tensor<1x64xi32, #blocked> loc(#loc278) + %tmp89_131 = tt.broadcast %tmp89 : tensor<1x64x!tt.ptr, #blocked> -> tensor<2x64x!tt.ptr, #blocked> loc(#loc278) + %tmp89_132 = tt.load %tmp89_131, %tmp35_105, %cst_14 evictionPolicy = evict_last : tensor<2x64x!tt.ptr, #blocked> loc(#loc279) + %tmp89_133 = arith.extf %tmp89_132 : tensor<2x64xbf16, #blocked> to tensor<2x64xf32, #blocked> loc(#loc280) + %tmp91 = arith.mulf %tmp88, %tmp89_133 : tensor<2x64xf32, #blocked> loc(#loc281) + %tmp94 = arith.select %tmp48, %tmp91, %cst_19 : tensor<2x64xi1, #blocked>, tensor<2x64xf32, #blocked> loc(#loc282) + %tmp95 = arith.select %tmp31, %tmp80, %tmp94 : tensor<2x64xi1, #blocked>, tensor<2x64xf32, #blocked> loc(#loc296) + %tmp101 = arith.mulf %tmp96_83, %tmp75_53 : tensor<2x64xf32, #blocked1> loc(#loc285) + %tmp101_134 = ttg.convert_layout %tmp101 : tensor<2x64xf32, #blocked1> -> tensor<2x64xf32, #blocked> loc(#loc285) + %tmp107 = ttg.convert_layout %tmp102_86 : tensor<1x64xf32, #blocked2> -> tensor<1x64xf32, #blocked> loc(#loc286) + %tmp104 = tt.broadcast %tmp107 : tensor<1x64xf32, #blocked> -> tensor<2x64xf32, #blocked> loc(#loc287) + %tmp104_135 = arith.mulf %tmp101_134, %tmp104 : tensor<2x64xf32, #blocked> loc(#loc287) + %tmp107_136 = arith.mulf %tmp104_135, %tmp63_74 : tensor<2x64xf32, #blocked> loc(#loc286) + %tmp109 = arith.mulf %tmp95, %tmp66_77 : tensor<2x64xf32, #blocked> loc(#loc288) + %tmp110 = arith.addf %tmp107_136, %tmp109 : tensor<2x64xf32, #blocked> loc(#loc289) + %4 = arith.addi %tmp50_61, %1 : tensor<2x64xi32, #blocked1> loc(#loc58) + %5 = tt.addptr %2, %4 : tensor<2x64x!tt.ptr, #blocked1>, tensor<2x64xi32, #blocked1> loc(#loc59) + %6 = arith.truncf %tmp68 : tensor<2x64xf32, #blocked> to tensor<2x64xbf16, #blocked> loc(#loc144) + %7 = ttg.convert_layout %6 : tensor<2x64xbf16, #blocked> -> tensor<2x64xbf16, #blocked1> loc(#loc144) + tt.store %5, %7, %tmp50_65 : tensor<2x64x!tt.ptr, #blocked1> loc(#loc144) + %8 = tt.addptr %3, %4 : tensor<2x64x!tt.ptr, #blocked1>, tensor<2x64xi32, #blocked1> loc(#loc60) + %9 = arith.truncf %tmp110 : tensor<2x64xf32, #blocked> to tensor<2x64xbf16, #blocked> loc(#loc145) + %10 = ttg.convert_layout %9 : tensor<2x64xbf16, #blocked> -> tensor<2x64xbf16, #blocked1> loc(#loc145) + tt.store %8, %10, %tmp50_65 : tensor<2x64x!tt.ptr, #blocked1> loc(#loc145) + } loc(#loc61) + tt.return loc(#loc146) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8) +#loc32 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc156 = loc("xoffset"(#loc2)) +#loc157 = loc("xoffset"(#loc3)) +#loc158 = loc("xindex"(#loc4)) +#loc159 = loc("xindex"(#loc5)) +#loc160 = loc("r0_base"(#loc6)) +#loc161 = loc("x0"(#loc7)) +#loc162 = loc("x1"(#loc8)) +#loc163 = loc("tmp0"(#loc9)) +#loc164 = loc("tmp0"(#loc10)) +#loc165 = loc("tmp0"(#loc11)) +#loc166 = loc("tmp0"(#loc12)) +#loc167 = loc("tmp0"(#loc13)) +#loc168 = loc("_tmp4"(#loc14)) +#loc169 = loc("r0_index"(#loc15)) +#loc170 = loc("r0_mask"(#loc16)) +#loc171 = loc("tmp0"(#loc17)) +#loc172 = loc("tmp0"(#loc18)) +#loc173 = loc("tmp0"(#loc19)) +#loc174 = loc("tmp6"(#loc20)) +#loc175 = loc("tmp6"(#loc21)) +#loc176 = loc("tmp6"(#loc22)) +#loc177 = loc("tmp6"(#loc23)) +#loc178 = loc("tmp6"(#loc24)) +#loc179 = loc("tmp2"(#loc25)) +#loc180 = loc("tmp5"(#loc26)) +#loc181 = loc("_tmp4"(#loc27)) +#loc182 = loc("tmp8"(#loc28)) +#loc183 = loc("tmp11"(#loc29)) +#loc184 = loc("_tmp10"(#loc30)) +#loc186 = loc("tmp4"(#loc35)) +#loc188 = loc("tmp10"(#loc37)) +#loc189 = loc("tmp50"(#loc38)) +#loc190 = loc("tmp50"(#loc39)) +#loc191 = loc("tmp50"(#loc40)) +#loc192 = loc("tmp50"(#loc41)) +#loc193 = loc("tmp50"(#loc42)) +#loc194 = loc("tmp58"(#loc43)) +#loc195 = loc("tmp63"(#loc44)) +#loc196 = loc("tmp63"(#loc45)) +#loc197 = loc("tmp63"(#loc46)) +#loc198 = loc("tmp66"(#loc47)) +#loc199 = loc("tmp102"(#loc48)) +#loc200 = loc("tmp20"(#loc49)) +#loc201 = loc("tmp22"(#loc50)) +#loc202 = loc("tmp23"(#loc51)) +#loc203 = loc("tmp24"(#loc52)) +#loc204 = loc("tmp72"(#loc53)) +#loc205 = loc("tmp73"(#loc54)) +#loc206 = loc("tmp74"(#loc55)) +#loc207 = loc("tmp75"(#loc56)) +#loc208 = loc("r0_index"(#loc62)) +#loc209 = loc("r0_mask"(#loc63)) +#loc210 = loc("r0_3"(#loc64)) +#loc211 = loc("r0_4"(#loc65)) +#loc212 = loc("tmp50"(#loc66)) +#loc213 = loc("tmp50"(#loc67)) +#loc214 = loc("tmp58"(#loc68)) +#loc215 = loc("tmp58"(#loc69)) +#loc216 = loc("tmp63"(#loc70)) +#loc217 = loc("tmp66"(#loc71)) +#loc218 = loc("tmp96"(#loc72)) +#loc219 = loc("tmp96"(#loc73)) +#loc220 = loc("tmp96"(#loc74)) +#loc221 = loc("tmp96"(#loc75)) +#loc222 = loc("tmp96"(#loc76)) +#loc223 = loc("tmp96"(#loc77)) +#loc224 = loc("tmp102"(#loc78)) +#loc225 = loc("tmp102"(#loc79)) +#loc226 = loc("tmp16"(#loc80)) +#loc227 = loc("tmp17"(#loc81)) +#loc228 = loc("tmp17"(#loc82)) +#loc229 = loc("tmp17"(#loc83)) +#loc230 = loc("tmp17"(#loc84)) +#loc231 = loc("tmp17"(#loc85)) +#loc232 = loc("tmp17"(#loc86)) +#loc233 = loc("tmp17"(#loc87)) +#loc234 = loc("tmp17"(#loc88)) +#loc235 = loc("tmp25"(#loc89)) +#loc236 = loc("tmp25"(#loc90)) +#loc237 = loc("tmp25"(#loc91)) +#loc238 = loc("tmp27"(#loc92)) +#loc239 = loc("tmp29"(#loc93)) +#loc240 = loc("tmp31"(#loc94)) +#loc241 = loc("tmp32"(#loc95)) +#loc242 = loc("tmp35"(#loc96)) +#loc243 = loc("tmp35"(#loc97)) +#loc244 = loc("tmp35"(#loc98)) +#loc245 = loc("tmp35"(#loc99)) +#loc246 = loc("tmp35"(#loc100)) +#loc247 = loc("tmp35"(#loc101)) +#loc248 = loc("tmp42"(#loc102)) +#loc249 = loc("tmp43"(#loc103)) +#loc250 = loc("tmp43"(#loc104)) +#loc251 = loc("tmp43"(#loc105)) +#loc252 = loc("tmp45"(#loc106)) +#loc253 = loc("tmp48"(#loc107)) +#loc254 = loc("tmp49"(#loc108)) +#loc255 = loc("tmp57"(#loc109)) +#loc256 = loc("tmp60"(#loc110)) +#loc257 = loc("tmp64"(#loc111)) +#loc258 = loc("tmp67"(#loc112)) +#loc259 = loc("tmp68"(#loc113)) +#loc260 = loc("tmp70"(#loc114)) +#loc261 = loc("tmp70"(#loc115)) +#loc262 = loc("tmp70"(#loc116)) +#loc263 = loc("tmp70"(#loc117)) +#loc264 = loc("tmp70"(#loc118)) +#loc265 = loc("tmp70"(#loc119)) +#loc266 = loc("tmp76"(#loc120)) +#loc267 = loc("tmp76"(#loc121)) +#loc268 = loc("tmp76"(#loc122)) +#loc269 = loc("tmp78"(#loc123)) +#loc270 = loc("tmp80"(#loc124)) +#loc271 = loc("tmp83"(#loc125)) +#loc272 = loc("tmp83"(#loc126)) +#loc273 = loc("tmp83"(#loc127)) +#loc274 = loc("tmp83"(#loc128)) +#loc275 = loc("tmp83"(#loc129)) +#loc276 = loc("tmp83"(#loc130)) +#loc277 = loc("tmp88"(#loc131)) +#loc278 = loc("tmp89"(#loc132)) +#loc279 = loc("tmp89"(#loc133)) +#loc280 = loc("tmp89"(#loc134)) +#loc281 = loc("tmp91"(#loc135)) +#loc282 = loc("tmp94"(#loc136)) +#loc283 = loc("tmp95"(#loc137)) +#loc284 = loc("tmp82"(#loc138)) +#loc285 = loc("tmp101"(#loc139)) +#loc286 = loc("tmp107"(#loc140)) +#loc287 = loc("tmp104"(#loc141)) +#loc288 = loc("tmp109"(#loc142)) +#loc289 = loc("tmp110"(#loc143)) +#loc290 = loc("_tmp10"(#loc168)) +#loc291 = loc(callsite(#loc32 at #loc185)) +#loc293 = loc(callsite(#loc32 at #loc187)) +#loc295 = loc(fused[#loc254, #loc240]) +#loc296 = loc(fused[#loc283, #loc284]) +#loc297 = loc(callsite(#loc34 at #loc291)) +#loc298 = loc(callsite(#loc34 at #loc293)) diff --git a/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..bd8d9c8a1fb1ec460cc41f61a355383d788a3c4c --- /dev/null +++ b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir @@ -0,0 +1,520 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc1 = loc(unknown) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc149 = loc("in_out_ptr0"(#loc)) +#loc150 = loc("in_out_ptr1"(#loc)) +#loc151 = loc("in_ptr0"(#loc)) +#loc152 = loc("in_ptr1"(#loc)) +#loc153 = loc("in_ptr2"(#loc)) +#loc154 = loc("in_ptr3"(#loc)) +#loc155 = loc("in_ptr4"(#loc)) +#loc156 = loc("xnumel"(#loc)) +#loc157 = loc("r0_numel"(#loc)) +#loc189 = loc("tmp4"(#loc35)) +#loc191 = loc("tmp10"(#loc38)) +#loc296 = loc(callsite(#loc1 at #loc189)) +#loc298 = loc(callsite(#loc1 at #loc191)) +module { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<1x64xbf16> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<2x64xbf16> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_1 = arith.constant dense<4097> : tensor<1x64xi32> loc(#loc1) + %cst_2 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc1) + %cst_3 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc1) + %cst_5 = arith.constant dense<1> : tensor<1x64xi64> loc(#loc1) + %cst_6 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc1) + %cst_7 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc1) + %cst_8 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc1) + %cst_9 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc1) + %cst_10 = arith.constant dense<128> : tensor<1x64xi32> loc(#loc1) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc1) + %cst_12 = arith.constant dense<32> : tensor<2x1xi32> loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc158) + %xoffset_13 = arith.muli %xoffset, %c2_i32 : i32 loc(#loc159) + %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc160) + %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32> -> tensor<2x1xi32> loc(#loc161) + %xindex_15 = tt.splat %xoffset_13 : i32 -> tensor<2x1xi32> loc(#loc162) + %xindex_16 = arith.addi %xindex_15, %xindex_14 : tensor<2x1xi32> loc(#loc162) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc163) + %r0_base_17 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc164) + %x0 = arith.remsi %xindex_16, %cst_12 : tensor<2x1xi32> loc(#loc165) + %x1 = arith.divsi %xindex_16, %cst_12 : tensor<2x1xi32> loc(#loc166) + %_tmp10:2 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%_tmp4 = %cst_11, %_tmp10_20 = %cst_11) -> (tensor<2x64xf32>, tensor<2x64xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc168) + %r0_index_21 = arith.addi %r0_index, %r0_base_17 : tensor<1x64xi32> loc(#loc168) + %r0_mask = arith.cmpi slt, %r0_index_21, %cst_10 : tensor<1x64xi32> loc(#loc169) + %tmp0 = arith.addi %r0_index_21, %cst_9 : tensor<1x64xi32> loc(#loc170) + %tmp0_22 = arith.muli %x0, %cst_8 : tensor<2x1xi32> loc(#loc171) + %tmp0_23 = tt.broadcast %tmp0 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc172) + %tmp0_24 = tt.broadcast %tmp0_22 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc172) + %tmp0_25 = arith.addi %tmp0_23, %tmp0_24 : tensor<2x64xi32> loc(#loc172) + %tmp0_26 = arith.muli %x1, %cst_7 : tensor<2x1xi32> loc(#loc173) + %tmp0_27 = tt.broadcast %tmp0_26 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc174) + %tmp0_28 = arith.addi %tmp0_25, %tmp0_27 : tensor<2x64xi32> loc(#loc174) + %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x64x!tt.ptr> loc(#loc175) + %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc175) + %tmp0_31 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc176) + %tmp0_32 = tt.load %tmp0_30, %tmp0_31, %cst_0 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc176) + %tmp0_33 = arith.extf %tmp0_32 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc177) + %tmp6 = tt.broadcast %r0_index_21 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc178) + %tmp6_34 = arith.addi %tmp6, %tmp0_24 : tensor<2x64xi32> loc(#loc178) + %tmp6_35 = arith.addi %tmp6_34, %tmp0_27 : tensor<2x64xi32> loc(#loc179) + %tmp6_36 = tt.addptr %tmp0_29, %tmp6_35 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc180) + %tmp6_37 = tt.load %tmp6_36, %tmp0_31, %cst_0 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc181) + %tmp6_38 = arith.extf %tmp6_37 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc182) + %tmp2 = arith.mulf %tmp0_33, %tmp0_33 : tensor<2x64xf32> loc(#loc183) + %tmp5 = arith.addf %_tmp4, %tmp2 : tensor<2x64xf32> loc(#loc184) + %_tmp4_39 = arith.select %tmp0_31, %tmp5, %_tmp4 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc185) + %tmp8 = arith.mulf %tmp6_38, %tmp6_38 : tensor<2x64xf32> loc(#loc186) + %tmp11 = arith.addf %_tmp10_20, %tmp8 : tensor<2x64xf32> loc(#loc187) + %_tmp10_40 = arith.select %tmp0_31, %tmp11, %_tmp10_20 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc188) + scf.yield %_tmp4_39, %_tmp10_40 : tensor<2x64xf32>, tensor<2x64xf32> loc(#loc33) + } loc(#loc294) + %tmp4 = "tt.reduce"(%_tmp10#0) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_20: f32 loc(callsite(#loc1 at #loc189)), %tmp4_21: f32 loc(callsite(#loc1 at #loc189))): + %tmp4_22 = arith.addf %tmp4_20, %tmp4_21 : f32 loc(#loc299) + tt.reduce.return %tmp4_22 : f32 loc(#loc295) + }) : (tensor<2x64xf32>) -> tensor<2xf32> loc(#loc295) + %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<2xf32> -> tensor<2x1xf32> loc(#loc190) + %tmp10 = "tt.reduce"(%_tmp10#1) <{axis = 1 : i32}> ({ + ^bb0(%tmp10_20: f32 loc(callsite(#loc1 at #loc191)), %tmp10_21: f32 loc(callsite(#loc1 at #loc191))): + %tmp10_22 = arith.addf %tmp10_20, %tmp10_21 : f32 loc(#loc300) + tt.reduce.return %tmp10_22 : f32 loc(#loc297) + }) : (tensor<2x64xf32>) -> tensor<2xf32> loc(#loc297) + %tmp10_19 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<2xf32> -> tensor<2x1xf32> loc(#loc192) + scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc193) + %r0_index_20 = arith.addi %r0_index, %r0_base_17 : tensor<1x64xi32> loc(#loc193) + %r0_mask = arith.cmpi slt, %r0_index_20, %cst_10 : tensor<1x64xi32> loc(#loc194) + %r0_3 = arith.remsi %r0_index_20, %cst_6 : tensor<1x64xi32> loc(#loc195) + %r0_4 = arith.divsi %r0_index_20, %cst_6 : tensor<1x64xi32> loc(#loc196) + %tmp50 = arith.muli %x0, %cst_8 : tensor<2x1xi32> loc(#loc197) + %tmp50_21 = tt.broadcast %r0_index_20 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc198) + %tmp50_22 = tt.broadcast %tmp50 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc198) + %tmp50_23 = arith.addi %tmp50_21, %tmp50_22 : tensor<2x64xi32> loc(#loc198) + %tmp50_24 = arith.muli %x1, %cst_7 : tensor<2x1xi32> loc(#loc199) + %tmp50_25 = tt.broadcast %tmp50_24 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc200) + %tmp50_26 = arith.addi %tmp50_23, %tmp50_25 : tensor<2x64xi32> loc(#loc200) + %tmp50_27 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x64x!tt.ptr> loc(#loc201) + %tmp50_28 = tt.addptr %tmp50_27, %tmp50_26 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc201) + %tmp50_29 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc202) + %tmp50_30 = tt.load %tmp50_28, %tmp50_29, %cst_0 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc202) + %tmp50_31 = arith.extf %tmp50_30 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc203) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc204) + %tmp58_32 = tt.addptr %tmp58, %r0_index_20 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc204) + %tmp58_33 = tt.load %tmp58_32, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x64x!tt.ptr> loc(#loc205) + %tmp58_34 = arith.extf %tmp58_33 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc206) + %tmp63 = arith.muli %x1, %cst_8 : tensor<2x1xi32> loc(#loc207) + %tmp63_35 = tt.broadcast %tmp63 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc208) + %tmp63_36 = arith.addi %tmp50_21, %tmp63_35 : tensor<2x64xi32> loc(#loc208) + %tmp63_37 = tt.splat %in_ptr2 : !tt.ptr -> tensor<2x64x!tt.ptr> loc(#loc209) + %tmp63_38 = tt.addptr %tmp63_37, %tmp63_36 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc209) + %tmp63_39 = tt.load %tmp63_38, %tmp50_29, %cst_11 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc210) + %tmp66 = tt.splat %in_ptr3 : !tt.ptr -> tensor<2x64x!tt.ptr> loc(#loc211) + %tmp66_40 = tt.addptr %tmp66, %tmp63_36 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc211) + %tmp66_41 = tt.load %tmp66_40, %tmp50_29, %cst_11 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc212) + %tmp96 = arith.addi %r0_index_20, %cst_9 : tensor<1x64xi32> loc(#loc213) + %tmp96_42 = tt.broadcast %tmp96 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc214) + %tmp96_43 = arith.addi %tmp96_42, %tmp50_22 : tensor<2x64xi32> loc(#loc214) + %tmp96_44 = arith.addi %tmp96_43, %tmp50_25 : tensor<2x64xi32> loc(#loc215) + %tmp96_45 = tt.addptr %tmp50_27, %tmp96_44 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc216) + %tmp96_46 = tt.load %tmp96_45, %tmp50_29, %cst_0 evictionPolicy = evict_first : tensor<2x64x!tt.ptr> loc(#loc217) + %tmp96_47 = arith.extf %tmp96_46 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc218) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc219) + %tmp102_48 = tt.addptr %tmp102, %r0_index_20 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc219) + %tmp102_49 = tt.load %tmp102_48, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x64x!tt.ptr> loc(#loc220) + %tmp102_50 = arith.extf %tmp102_49 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc221) + %tmp16 = arith.extsi %r0_3 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc222) + %tmp16_51 = arith.cmpi slt, %tmp16, %cst_5 : tensor<1x64xi64> loc(#loc222) + %tmp17 = arith.muli %r0_4, %cst_6 : tensor<1x64xi32> loc(#loc223) + %tmp17_52 = arith.addi %tmp17, %cst_4 : tensor<1x64xi32> loc(#loc224) + %tmp17_53 = tt.broadcast %tmp17_52 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc225) + %tmp17_54 = arith.addi %tmp17_53, %tmp50_22 : tensor<2x64xi32> loc(#loc225) + %tmp17_55 = arith.addi %tmp17_54, %tmp50_25 : tensor<2x64xi32> loc(#loc226) + %tmp17_56 = tt.addptr %tmp50_27, %tmp17_55 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc227) + %tmp17_57 = arith.andi %r0_mask, %tmp16_51 : tensor<1x64xi1> loc(#loc228) + %tmp17_58 = tt.broadcast %tmp17_57 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc229) + %tmp17_59 = tt.load %tmp17_56, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc229) + %tmp17_60 = arith.extf %tmp17_59 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc230) + %tmp20 = arith.divf %tmp10_19, %cst_3 : tensor<2x1xf32> loc(#loc231) + %tmp22 = arith.addf %tmp20, %cst_2 : tensor<2x1xf32> loc(#loc232) + %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc233) + %tmp24 = tt.broadcast %tmp23 : tensor<2x1xf32> -> tensor<2x64xf32> loc(#loc234) + %tmp24_61 = arith.mulf %tmp17_60, %tmp24 : tensor<2x64xf32> loc(#loc234) + %tmp25 = tt.addptr %tmp58, %tmp17_52 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc235) + %tmp25_62 = tt.broadcast %tmp25 : tensor<1x64x!tt.ptr> -> tensor<2x64x!tt.ptr> loc(#loc235) + %tmp25_63 = tt.load %tmp25_62, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc236) + %tmp25_64 = arith.extf %tmp25_63 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc237) + %tmp27 = arith.mulf %tmp24_61, %tmp25_64 : tensor<2x64xf32> loc(#loc238) + %tmp29 = arith.subf %cst_11, %tmp27 : tensor<2x64xf32> loc(#loc239) + %tmp31 = tt.broadcast %tmp16_51 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc240) + %tmp31_65 = arith.select %tmp31, %tmp29, %cst_11 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc240) + %tmp32 = arith.cmpi sge, %tmp16, %cst_5 : tensor<1x64xi64> loc(#loc241) + %tmp35 = tt.broadcast %tmp17 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc242) + %tmp35_66 = arith.addi %tmp35, %tmp50_22 : tensor<2x64xi32> loc(#loc242) + %tmp35_67 = arith.addi %tmp35_66, %tmp50_25 : tensor<2x64xi32> loc(#loc243) + %tmp35_68 = tt.addptr %tmp50_27, %tmp35_67 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc244) + %tmp35_69 = arith.andi %r0_mask, %tmp32 : tensor<1x64xi1> loc(#loc245) + %tmp35_70 = tt.broadcast %tmp35_69 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc246) + %tmp35_71 = tt.load %tmp35_68, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc246) + %tmp35_72 = arith.extf %tmp35_71 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc247) + %tmp42 = arith.mulf %tmp35_72, %tmp24 : tensor<2x64xf32> loc(#loc248) + %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc249) + %tmp43_73 = tt.broadcast %tmp43 : tensor<1x64x!tt.ptr> -> tensor<2x64x!tt.ptr> loc(#loc249) + %tmp43_74 = tt.load %tmp43_73, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc250) + %tmp43_75 = arith.extf %tmp43_74 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc251) + %tmp45 = arith.mulf %tmp42, %tmp43_75 : tensor<2x64xf32> loc(#loc252) + %tmp48 = tt.broadcast %tmp32 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc253) + %tmp48_76 = arith.select %tmp48, %tmp45, %cst_11 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc253) + %tmp49 = arith.select %tmp31, %tmp31_65, %tmp48_76 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc254) + %tmp57 = arith.mulf %tmp50_31, %tmp24 : tensor<2x64xf32> loc(#loc255) + %tmp60 = tt.broadcast %tmp58_34 : tensor<1x64xf32> -> tensor<2x64xf32> loc(#loc256) + %tmp60_77 = arith.mulf %tmp57, %tmp60 : tensor<2x64xf32> loc(#loc256) + %tmp64 = arith.mulf %tmp60_77, %tmp63_39 : tensor<2x64xf32> loc(#loc257) + %tmp67 = arith.mulf %tmp49, %tmp66_41 : tensor<2x64xf32> loc(#loc258) + %tmp68 = arith.addf %tmp64, %tmp67 : tensor<2x64xf32> loc(#loc259) + %tmp70 = arith.addi %tmp17, %cst_1 : tensor<1x64xi32> loc(#loc260) + %tmp70_78 = tt.broadcast %tmp70 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc261) + %tmp70_79 = arith.addi %tmp70_78, %tmp50_22 : tensor<2x64xi32> loc(#loc261) + %tmp70_80 = arith.addi %tmp70_79, %tmp50_25 : tensor<2x64xi32> loc(#loc262) + %tmp70_81 = tt.addptr %tmp50_27, %tmp70_80 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc263) + %tmp70_82 = tt.load %tmp70_81, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc264) + %tmp70_83 = arith.extf %tmp70_82 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc265) + %tmp72 = arith.divf %tmp4_18, %cst_3 : tensor<2x1xf32> loc(#loc266) + %tmp73 = arith.addf %tmp72, %cst_2 : tensor<2x1xf32> loc(#loc267) + %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc268) + %tmp75 = tt.broadcast %tmp74 : tensor<2x1xf32> -> tensor<2x64xf32> loc(#loc269) + %tmp75_84 = arith.mulf %tmp70_83, %tmp75 : tensor<2x64xf32> loc(#loc269) + %tmp76 = tt.addptr %tmp102, %tmp17_52 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc270) + %tmp76_85 = tt.broadcast %tmp76 : tensor<1x64x!tt.ptr> -> tensor<2x64x!tt.ptr> loc(#loc270) + %tmp76_86 = tt.load %tmp76_85, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc271) + %tmp76_87 = arith.extf %tmp76_86 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc272) + %tmp78 = arith.mulf %tmp75_84, %tmp76_87 : tensor<2x64xf32> loc(#loc273) + %tmp80 = arith.subf %cst_11, %tmp78 : tensor<2x64xf32> loc(#loc274) + %tmp82 = arith.select %tmp31, %tmp80, %cst_11 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc275) + %tmp83 = arith.addi %tmp17, %cst_9 : tensor<1x64xi32> loc(#loc276) + %tmp83_88 = tt.broadcast %tmp83 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc277) + %tmp83_89 = arith.addi %tmp83_88, %tmp50_22 : tensor<2x64xi32> loc(#loc277) + %tmp83_90 = arith.addi %tmp83_89, %tmp50_25 : tensor<2x64xi32> loc(#loc278) + %tmp83_91 = tt.addptr %tmp50_27, %tmp83_90 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc279) + %tmp83_92 = tt.load %tmp83_91, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc280) + %tmp83_93 = arith.extf %tmp83_92 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc281) + %tmp88 = arith.mulf %tmp83_93, %tmp75 : tensor<2x64xf32> loc(#loc282) + %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc283) + %tmp89_94 = tt.broadcast %tmp89 : tensor<1x64x!tt.ptr> -> tensor<2x64x!tt.ptr> loc(#loc283) + %tmp89_95 = tt.load %tmp89_94, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<2x64x!tt.ptr> loc(#loc284) + %tmp89_96 = arith.extf %tmp89_95 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc285) + %tmp91 = arith.mulf %tmp88, %tmp89_96 : tensor<2x64xf32> loc(#loc286) + %tmp94 = arith.select %tmp48, %tmp91, %cst_11 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc287) + %tmp95 = arith.select %tmp31, %tmp82, %tmp94 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc288) + %tmp101 = arith.mulf %tmp96_47, %tmp75 : tensor<2x64xf32> loc(#loc289) + %tmp104 = tt.broadcast %tmp102_50 : tensor<1x64xf32> -> tensor<2x64xf32> loc(#loc290) + %tmp104_97 = arith.mulf %tmp101, %tmp104 : tensor<2x64xf32> loc(#loc290) + %tmp107 = arith.mulf %tmp104_97, %tmp63_39 : tensor<2x64xf32> loc(#loc291) + %tmp109 = arith.mulf %tmp95, %tmp66_41 : tensor<2x64xf32> loc(#loc292) + %tmp110 = arith.addf %tmp107, %tmp109 : tensor<2x64xf32> loc(#loc293) + %0 = arith.muli %xindex_16, %cst_8 : tensor<2x1xi32> loc(#loc142) + %1 = tt.broadcast %0 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc143) + %2 = arith.addi %tmp50_21, %1 : tensor<2x64xi32> loc(#loc143) + %3 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<2x64x!tt.ptr> loc(#loc144) + %4 = tt.addptr %3, %2 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc144) + %5 = arith.truncf %tmp68 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc145) + tt.store %4, %5, %tmp50_29 : tensor<2x64x!tt.ptr> loc(#loc145) + %6 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<2x64x!tt.ptr> loc(#loc146) + %7 = tt.addptr %6, %2 : tensor<2x64x!tt.ptr>, tensor<2x64xi32> loc(#loc146) + %8 = arith.truncf %tmp110 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc147) + tt.store %7, %8, %tmp50_29 : tensor<2x64x!tt.ptr> loc(#loc147) + } loc(#loc40) + tt.return loc(#loc148) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc158 = loc("xoffset"(#loc2)) +#loc159 = loc("xoffset"(#loc3)) +#loc160 = loc("xindex"(#loc4)) +#loc161 = loc("xindex"(#loc5)) +#loc162 = loc("xindex"(#loc6)) +#loc163 = loc("r0_base"(#loc7)) +#loc164 = loc("r0_base"(#loc8)) +#loc165 = loc("x0"(#loc9)) +#loc166 = loc("x1"(#loc10)) +#loc167 = loc("_tmp4"(#loc11)) +#loc168 = loc("r0_index"(#loc12)) +#loc169 = loc("r0_mask"(#loc13)) +#loc170 = loc("tmp0"(#loc14)) +#loc171 = loc("tmp0"(#loc15)) +#loc172 = loc("tmp0"(#loc16)) +#loc173 = loc("tmp0"(#loc17)) +#loc174 = loc("tmp0"(#loc18)) +#loc175 = loc("tmp0"(#loc19)) +#loc176 = loc("tmp0"(#loc20)) +#loc177 = loc("tmp0"(#loc21)) +#loc178 = loc("tmp6"(#loc22)) +#loc179 = loc("tmp6"(#loc23)) +#loc180 = loc("tmp6"(#loc24)) +#loc181 = loc("tmp6"(#loc25)) +#loc182 = loc("tmp6"(#loc26)) +#loc183 = loc("tmp2"(#loc27)) +#loc184 = loc("tmp5"(#loc28)) +#loc185 = loc("_tmp4"(#loc29)) +#loc186 = loc("tmp8"(#loc30)) +#loc187 = loc("tmp11"(#loc31)) +#loc188 = loc("_tmp10"(#loc32)) +#loc190 = loc("tmp4"(#loc37)) +#loc192 = loc("tmp10"(#loc39)) +#loc193 = loc("r0_index"(#loc41)) +#loc194 = loc("r0_mask"(#loc42)) +#loc195 = loc("r0_3"(#loc43)) +#loc196 = loc("r0_4"(#loc44)) +#loc197 = loc("tmp50"(#loc45)) +#loc198 = loc("tmp50"(#loc46)) +#loc199 = loc("tmp50"(#loc47)) +#loc200 = loc("tmp50"(#loc48)) +#loc201 = loc("tmp50"(#loc49)) +#loc202 = loc("tmp50"(#loc50)) +#loc203 = loc("tmp50"(#loc51)) +#loc204 = loc("tmp58"(#loc52)) +#loc205 = loc("tmp58"(#loc53)) +#loc206 = loc("tmp58"(#loc54)) +#loc207 = loc("tmp63"(#loc55)) +#loc208 = loc("tmp63"(#loc56)) +#loc209 = loc("tmp63"(#loc57)) +#loc210 = loc("tmp63"(#loc58)) +#loc211 = loc("tmp66"(#loc59)) +#loc212 = loc("tmp66"(#loc60)) +#loc213 = loc("tmp96"(#loc61)) +#loc214 = loc("tmp96"(#loc62)) +#loc215 = loc("tmp96"(#loc63)) +#loc216 = loc("tmp96"(#loc64)) +#loc217 = loc("tmp96"(#loc65)) +#loc218 = loc("tmp96"(#loc66)) +#loc219 = loc("tmp102"(#loc67)) +#loc220 = loc("tmp102"(#loc68)) +#loc221 = loc("tmp102"(#loc69)) +#loc222 = loc("tmp16"(#loc70)) +#loc223 = loc("tmp17"(#loc71)) +#loc224 = loc("tmp17"(#loc72)) +#loc225 = loc("tmp17"(#loc73)) +#loc226 = loc("tmp17"(#loc74)) +#loc227 = loc("tmp17"(#loc75)) +#loc228 = loc("tmp17"(#loc76)) +#loc229 = loc("tmp17"(#loc77)) +#loc230 = loc("tmp17"(#loc78)) +#loc231 = loc("tmp20"(#loc79)) +#loc232 = loc("tmp22"(#loc80)) +#loc233 = loc("tmp23"(#loc81)) +#loc234 = loc("tmp24"(#loc82)) +#loc235 = loc("tmp25"(#loc83)) +#loc236 = loc("tmp25"(#loc84)) +#loc237 = loc("tmp25"(#loc85)) +#loc238 = loc("tmp27"(#loc86)) +#loc239 = loc("tmp29"(#loc87)) +#loc240 = loc("tmp31"(#loc88)) +#loc241 = loc("tmp32"(#loc89)) +#loc242 = loc("tmp35"(#loc90)) +#loc243 = loc("tmp35"(#loc91)) +#loc244 = loc("tmp35"(#loc92)) +#loc245 = loc("tmp35"(#loc93)) +#loc246 = loc("tmp35"(#loc94)) +#loc247 = loc("tmp35"(#loc95)) +#loc248 = loc("tmp42"(#loc96)) +#loc249 = loc("tmp43"(#loc97)) +#loc250 = loc("tmp43"(#loc98)) +#loc251 = loc("tmp43"(#loc99)) +#loc252 = loc("tmp45"(#loc100)) +#loc253 = loc("tmp48"(#loc101)) +#loc254 = loc("tmp49"(#loc102)) +#loc255 = loc("tmp57"(#loc103)) +#loc256 = loc("tmp60"(#loc104)) +#loc257 = loc("tmp64"(#loc105)) +#loc258 = loc("tmp67"(#loc106)) +#loc259 = loc("tmp68"(#loc107)) +#loc260 = loc("tmp70"(#loc108)) +#loc261 = loc("tmp70"(#loc109)) +#loc262 = loc("tmp70"(#loc110)) +#loc263 = loc("tmp70"(#loc111)) +#loc264 = loc("tmp70"(#loc112)) +#loc265 = loc("tmp70"(#loc113)) +#loc266 = loc("tmp72"(#loc114)) +#loc267 = loc("tmp73"(#loc115)) +#loc268 = loc("tmp74"(#loc116)) +#loc269 = loc("tmp75"(#loc117)) +#loc270 = loc("tmp76"(#loc118)) +#loc271 = loc("tmp76"(#loc119)) +#loc272 = loc("tmp76"(#loc120)) +#loc273 = loc("tmp78"(#loc121)) +#loc274 = loc("tmp80"(#loc122)) +#loc275 = loc("tmp82"(#loc123)) +#loc276 = loc("tmp83"(#loc124)) +#loc277 = loc("tmp83"(#loc125)) +#loc278 = loc("tmp83"(#loc126)) +#loc279 = loc("tmp83"(#loc127)) +#loc280 = loc("tmp83"(#loc128)) +#loc281 = loc("tmp83"(#loc129)) +#loc282 = loc("tmp88"(#loc130)) +#loc283 = loc("tmp89"(#loc131)) +#loc284 = loc("tmp89"(#loc132)) +#loc285 = loc("tmp89"(#loc133)) +#loc286 = loc("tmp91"(#loc134)) +#loc287 = loc("tmp94"(#loc135)) +#loc288 = loc("tmp95"(#loc136)) +#loc289 = loc("tmp101"(#loc137)) +#loc290 = loc("tmp104"(#loc138)) +#loc291 = loc("tmp107"(#loc139)) +#loc292 = loc("tmp109"(#loc140)) +#loc293 = loc("tmp110"(#loc141)) +#loc294 = loc("_tmp10"(#loc167)) +#loc295 = loc(callsite(#loc34 at #loc189)) +#loc297 = loc(callsite(#loc34 at #loc191)) +#loc299 = loc(callsite(#loc36 at #loc295)) +#loc300 = loc(callsite(#loc36 at #loc297)) diff --git a/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..92e578f1c5f962bc9b14ba31e6bda19673b3836f --- /dev/null +++ b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json"}} \ No newline at end of file diff --git a/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..50613ea92434bf443300cf92cc99793541e0dc29 Binary files /dev/null and b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin differ diff --git a/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9acf5c43ff4b9a8c8315b0234f85b374fcaaa13f --- /dev/null +++ b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json @@ -0,0 +1 @@ +{"hash": "f73d68b5df2b1383295ef14f9fa5efb7b149f5fdafa2fc82a456a733f43cda44", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 4096, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0"} \ No newline at end of file diff --git a/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..e3c893d58b1434c047a64ce0afb513da30de95ce --- /dev/null +++ b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir @@ -0,0 +1,865 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %13 = shl i32 %12, 3, !dbg !9 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %15 = and i32 %14, 224, !dbg !10 + %16 = lshr exact i32 %15, 5, !dbg !10 + %17 = and i32 %14, 7, !dbg !10 + %18 = or disjoint i32 %16, %13, !dbg !11 + %19 = or disjoint i32 %13, %17, !dbg !11 + %20 = and i32 %14, 31, !dbg !12 + %21 = shl nuw nsw i32 %20, 2, !dbg !12 + %22 = lshr i32 %14, 3, !dbg !12 + %23 = sdiv i32 %18, 32, !dbg !13 + %24 = mul i32 %23, 32, !dbg !14 + %.decomposed = sub i32 %18, %24, !dbg !14 + %25 = sdiv i32 %19, 32, !dbg !13 + %26 = or disjoint i32 %21, 4096, !dbg !15 + %27 = shl nsw i32 %.decomposed, 7, !dbg !16 + %28 = add nsw i32 %26, %27, !dbg !17 + %29 = mul i32 %23, 36864, !dbg !18 + %30 = add i32 %28, %29, !dbg !19 + %31 = sext i32 %30 to i64, !dbg !20 + %32 = getelementptr bfloat, ptr addrspace(1) %2, i64 %31, !dbg !20 + %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !21 + %34 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %32, i64 %33, i1 true) #6, !dbg !21 + %35 = extractvalue { i32, i32 } %34, 0, !dbg !21 + %36 = bitcast i32 %35 to <2 x bfloat>, !dbg !21 + %37 = extractvalue { i32, i32 } %34, 1, !dbg !21 + %38 = bitcast i32 %37 to <2 x bfloat>, !dbg !21 + %39 = extractelement <2 x bfloat> %36, i64 0, !dbg !21 + %40 = extractelement <2 x bfloat> %36, i64 1, !dbg !21 + %41 = extractelement <2 x bfloat> %38, i64 0, !dbg !21 + %42 = extractelement <2 x bfloat> %38, i64 1, !dbg !21 + %43 = fpext bfloat %39 to float, !dbg !22 + %44 = fpext bfloat %40 to float, !dbg !22 + %45 = fpext bfloat %41 to float, !dbg !22 + %46 = fpext bfloat %42 to float, !dbg !22 + %47 = or disjoint i32 %27, %21, !dbg !23 + %48 = add i32 %47, %29, !dbg !24 + %49 = sext i32 %48 to i64, !dbg !25 + %50 = getelementptr bfloat, ptr addrspace(1) %2, i64 %49, !dbg !25 + %51 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !26 + %52 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %50, i64 %51, i1 true) #6, !dbg !26 + %53 = extractvalue { i32, i32 } %52, 0, !dbg !26 + %54 = bitcast i32 %53 to <2 x bfloat>, !dbg !26 + %55 = extractvalue { i32, i32 } %52, 1, !dbg !26 + %56 = bitcast i32 %55 to <2 x bfloat>, !dbg !26 + %57 = extractelement <2 x bfloat> %54, i64 0, !dbg !26 + %58 = extractelement <2 x bfloat> %54, i64 1, !dbg !26 + %59 = extractelement <2 x bfloat> %56, i64 0, !dbg !26 + %60 = extractelement <2 x bfloat> %56, i64 1, !dbg !26 + %61 = fpext bfloat %57 to float, !dbg !27 + %62 = fpext bfloat %58 to float, !dbg !27 + %63 = fpext bfloat %59 to float, !dbg !27 + %64 = fpext bfloat %60 to float, !dbg !27 + %65 = fmul float %43, %43, !dbg !28 + %66 = fmul float %44, %44, !dbg !28 + %67 = fmul float %45, %45, !dbg !28 + %68 = fmul float %46, %46, !dbg !28 + %69 = fmul float %61, %61, !dbg !29 + %70 = fmul float %62, %62, !dbg !29 + %71 = fmul float %63, %63, !dbg !29 + %72 = fmul float %64, %64, !dbg !29 + %73 = fadd float %65, %66, !dbg !30 + %74 = fadd float %67, %73, !dbg !30 + %75 = fadd float %68, %74, !dbg !30 + %76 = bitcast float %75 to i32, !dbg !33 + %77 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %76, i32 16, i32 31), !dbg !33 + %78 = bitcast i32 %77 to float, !dbg !33 + %79 = fadd float %75, %78, !dbg !30 + %80 = bitcast float %79 to i32, !dbg !33 + %81 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %80, i32 8, i32 31), !dbg !33 + %82 = bitcast i32 %81 to float, !dbg !33 + %83 = fadd float %79, %82, !dbg !30 + %84 = bitcast float %83 to i32, !dbg !33 + %85 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %84, i32 4, i32 31), !dbg !33 + %86 = bitcast i32 %85 to float, !dbg !33 + %87 = fadd float %83, %86, !dbg !30 + %88 = bitcast float %87 to i32, !dbg !33 + %89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 2, i32 31), !dbg !33 + %90 = bitcast i32 %89 to float, !dbg !33 + %91 = fadd float %87, %90, !dbg !30 + %92 = bitcast float %91 to i32, !dbg !33 + %93 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %92, i32 1, i32 31), !dbg !33 + %94 = bitcast i32 %93 to float, !dbg !33 + %95 = fadd float %91, %94, !dbg !30 + %96 = fadd float %69, %70, !dbg !36 + %97 = fadd float %71, %96, !dbg !36 + %98 = fadd float %72, %97, !dbg !36 + %99 = bitcast float %98 to i32, !dbg !37 + %100 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %99, i32 16, i32 31), !dbg !37 + %101 = bitcast i32 %100 to float, !dbg !37 + %102 = fadd float %98, %101, !dbg !36 + %103 = bitcast float %102 to i32, !dbg !37 + %104 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %103, i32 8, i32 31), !dbg !37 + %105 = bitcast i32 %104 to float, !dbg !37 + %106 = fadd float %102, %105, !dbg !36 + %107 = bitcast float %106 to i32, !dbg !37 + %108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 4, i32 31), !dbg !37 + %109 = bitcast i32 %108 to float, !dbg !37 + %110 = fadd float %106, %109, !dbg !36 + %111 = bitcast float %110 to i32, !dbg !37 + %112 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %111, i32 2, i32 31), !dbg !37 + %113 = bitcast i32 %112 to float, !dbg !37 + %114 = fadd float %110, %113, !dbg !36 + %115 = bitcast float %114 to i32, !dbg !37 + %116 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %115, i32 1, i32 31), !dbg !37 + %117 = bitcast i32 %116 to float, !dbg !37 + %118 = fadd float %114, %117, !dbg !36 + %119 = and i32 %22, 1, !dbg !39 + %120 = zext nneg i32 %21 to i64, !dbg !40 + %121 = getelementptr bfloat, ptr addrspace(1) %3, i64 %120, !dbg !40 + %122 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !41 + %123 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %121, i64 %122, i1 true) #6, !dbg !41 + %124 = extractvalue { i32, i32 } %123, 0, !dbg !41 + %125 = bitcast i32 %124 to <2 x bfloat>, !dbg !41 + %126 = extractvalue { i32, i32 } %123, 1, !dbg !41 + %127 = bitcast i32 %126 to <2 x bfloat>, !dbg !41 + %128 = extractelement <2 x bfloat> %125, i64 0, !dbg !41 + %129 = extractelement <2 x bfloat> %125, i64 1, !dbg !41 + %130 = extractelement <2 x bfloat> %127, i64 0, !dbg !41 + %131 = extractelement <2 x bfloat> %127, i64 1, !dbg !41 + %132 = fpext bfloat %128 to float, !dbg !42 + %133 = fpext bfloat %129 to float, !dbg !42 + %134 = fpext bfloat %130 to float, !dbg !42 + %135 = fpext bfloat %131 to float, !dbg !42 + %136 = shl i32 %23, 7, !dbg !43 + %137 = or disjoint i32 %136, %21, !dbg !44 + %138 = sext i32 %137 to i64, !dbg !45 + %139 = getelementptr float, ptr addrspace(1) %4, i64 %138, !dbg !45 + %140 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !46 + %141 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %139, i64 %140, i1 true) #6, !dbg !46 + %142 = extractvalue { i32, i32, i32, i32 } %141, 0, !dbg !46 + %143 = extractvalue { i32, i32, i32, i32 } %141, 1, !dbg !46 + %144 = extractvalue { i32, i32, i32, i32 } %141, 2, !dbg !46 + %145 = extractvalue { i32, i32, i32, i32 } %141, 3, !dbg !46 + %146 = bitcast i32 %142 to float, !dbg !46 + %147 = bitcast i32 %143 to float, !dbg !46 + %148 = bitcast i32 %144 to float, !dbg !46 + %149 = bitcast i32 %145 to float, !dbg !46 + %150 = getelementptr float, ptr addrspace(1) %5, i64 %138, !dbg !47 + %151 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !48 + %152 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %150, i64 %151, i1 true) #6, !dbg !48 + %153 = extractvalue { i32, i32, i32, i32 } %152, 0, !dbg !48 + %154 = extractvalue { i32, i32, i32, i32 } %152, 1, !dbg !48 + %155 = extractvalue { i32, i32, i32, i32 } %152, 2, !dbg !48 + %156 = extractvalue { i32, i32, i32, i32 } %152, 3, !dbg !48 + %157 = shl nuw nsw i32 %17, 4, !dbg !48 + %158 = shl nuw nsw i32 %15, 2, !dbg !48 + %159 = lshr i32 %14, 1, !dbg !48 + %160 = and i32 %159, 124, !dbg !48 + %161 = or disjoint i32 %157, %158, !dbg !48 + %162 = xor i32 %161, %160, !dbg !48 + %163 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %162, !dbg !48 + %164 = insertelement <1 x i32> poison, i32 %153, i64 0, !dbg !48 + store <1 x i32> %164, ptr addrspace(3) %163, align 4, !dbg !48 + %165 = xor i32 %162, 1028, !dbg !48 + %166 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %165, !dbg !48 + %167 = insertelement <1 x i32> poison, i32 %154, i64 0, !dbg !48 + store <1 x i32> %167, ptr addrspace(3) %166, align 4, !dbg !48 + %168 = xor i32 %162, 2056, !dbg !48 + %169 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %168, !dbg !48 + %170 = insertelement <1 x i32> poison, i32 %155, i64 0, !dbg !48 + store <1 x i32> %170, ptr addrspace(3) %169, align 4, !dbg !48 + %171 = xor i32 %162, 3084, !dbg !48 + %172 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %171, !dbg !48 + %173 = insertelement <1 x i32> poison, i32 %156, i64 0, !dbg !48 + store <1 x i32> %173, ptr addrspace(3) %172, align 4, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %174 = shl nuw nsw i32 %20, 7, !dbg !48 + %175 = xor i32 %157, %160, !dbg !48 + %176 = or disjoint i32 %175, %174, !dbg !48 + %177 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %176, !dbg !48 + %178 = load float, ptr addrspace(3) %177, align 4, !dbg !48 + %179 = xor i32 %176, 4, !dbg !48 + %180 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %179, !dbg !48 + %181 = load float, ptr addrspace(3) %180, align 4, !dbg !48 + %182 = xor i32 %176, 8, !dbg !48 + %183 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %182, !dbg !48 + %184 = load float, ptr addrspace(3) %183, align 4, !dbg !48 + %185 = xor i32 %176, 12, !dbg !48 + %186 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %185, !dbg !48 + %187 = load float, ptr addrspace(3) %186, align 4, !dbg !48 + %188 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !49 + %189 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %32, i64 %188, i1 true) #6, !dbg !49 + %190 = getelementptr bfloat, ptr addrspace(1) %6, i64 %120, !dbg !50 + %191 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !51 + %192 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %190, i64 %191, i1 true) #6, !dbg !51 + %193 = icmp eq i32 %119, 0, !dbg !52 + %194 = and i32 %22, 30, !dbg !53 + %195 = or disjoint i32 %194, 32, !dbg !53 + %196 = or disjoint i32 %194, 64, !dbg !53 + %197 = or disjoint i32 %194, 96, !dbg !53 + %198 = or disjoint i32 %194, 1, !dbg !54 + %199 = or disjoint i32 %194, 33, !dbg !54 + %200 = or disjoint i32 %194, 65, !dbg !54 + %201 = or i32 %22, 97, !dbg !54 + %202 = shl i32 %19, 7, !dbg !55 + %203 = shl i32 %25, 15, !dbg !55 + %204 = add i32 %203, %202, !dbg !55 + %205 = or disjoint i32 %204, %198, !dbg !56 + %206 = or disjoint i32 %204, %199, !dbg !56 + %207 = or disjoint i32 %204, %200, !dbg !56 + %208 = or disjoint i32 %204, %201, !dbg !56 + %209 = sext i32 %205 to i64, !dbg !57 + %210 = getelementptr bfloat, ptr addrspace(1) %2, i64 %209, !dbg !57 + %211 = sext i32 %206 to i64, !dbg !57 + %212 = getelementptr bfloat, ptr addrspace(1) %2, i64 %211, !dbg !57 + %213 = sext i32 %207 to i64, !dbg !57 + %214 = getelementptr bfloat, ptr addrspace(1) %2, i64 %213, !dbg !57 + %215 = sext i32 %208 to i64, !dbg !57 + %216 = getelementptr bfloat, ptr addrspace(1) %2, i64 %215, !dbg !57 + %217 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58 + %218 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %210, i64 %217, i1 %193) #6, !dbg !58 + %219 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58 + %220 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %212, i64 %219, i1 %193) #6, !dbg !58 + %221 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58 + %222 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %214, i64 %221, i1 %193) #6, !dbg !58 + %223 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58 + %224 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %216, i64 %223, i1 %193) #6, !dbg !58 + %225 = tail call float @llvm.nvvm.div.full(float %118, float 1.280000e+02), !dbg !59 + %226 = fadd float %225, 0x3EB0C6F7A0000000, !dbg !60 + %227 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not.i = icmp eq i32 %227, 0, !dbg !61 + br i1 %.not.i, label %230, label %228, !dbg !61 + +228: ; preds = %11 + %229 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %226), !dbg !61 + br label %__nv_rsqrtf.exit, !dbg !61 + +230: ; preds = %11 + %231 = tail call float @llvm.nvvm.rsqrt.approx.f(float %226), !dbg !61 + br label %__nv_rsqrtf.exit, !dbg !61 + +__nv_rsqrtf.exit: ; preds = %228, %230 + %.0.i = phi float [ %229, %228 ], [ %231, %230 ], !dbg !61 + %232 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %233 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %234 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not.i7 = icmp eq i32 %234, 0, !dbg !61 + br i1 %.not.i7, label %237, label %235, !dbg !61 + +235: ; preds = %__nv_rsqrtf.exit + %236 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %226), !dbg !61 + br label %__nv_rsqrtf.exit9, !dbg !61 + +237: ; preds = %__nv_rsqrtf.exit + %238 = tail call float @llvm.nvvm.rsqrt.approx.f(float %226), !dbg !61 + br label %__nv_rsqrtf.exit9, !dbg !61 + +__nv_rsqrtf.exit9: ; preds = %235, %237 + %.0.i8 = phi float [ %236, %235 ], [ %238, %237 ], !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !62 + %239 = lshr exact i32 %15, 3, !dbg !62 + %240 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %239, !dbg !62 + store float %.0.i, ptr addrspace(3) %240, align 4, !dbg !62 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !62 + %241 = shl nuw nsw i32 %17, 2, !dbg !62 + %242 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %241, !dbg !62 + %243 = load float, ptr addrspace(3) %242, align 4, !dbg !62 + %244 = zext nneg i32 %198 to i64, !dbg !63 + %245 = getelementptr bfloat, ptr addrspace(1) %3, i64 %244, !dbg !63 + %246 = zext nneg i32 %199 to i64, !dbg !63 + %247 = getelementptr bfloat, ptr addrspace(1) %3, i64 %246, !dbg !63 + %248 = zext nneg i32 %200 to i64, !dbg !63 + %249 = getelementptr bfloat, ptr addrspace(1) %3, i64 %248, !dbg !63 + %250 = zext nneg i32 %201 to i64, !dbg !63 + %251 = getelementptr bfloat, ptr addrspace(1) %3, i64 %250, !dbg !63 + %252 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !64 + %253 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %245, i64 %252, i1 %193) #6, !dbg !64 + %254 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !64 + %255 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %247, i64 %254, i1 %193) #6, !dbg !64 + %256 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !64 + %257 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %249, i64 %256, i1 %193) #6, !dbg !64 + %258 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !64 + %259 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %251, i64 %258, i1 %193) #6, !dbg !64 + %260 = icmp ne i32 %119, 0, !dbg !65 + %261 = or disjoint i32 %204, %194, !dbg !66 + %262 = or disjoint i32 %204, %195, !dbg !66 + %263 = or disjoint i32 %204, %196, !dbg !66 + %264 = or disjoint i32 %204, %197, !dbg !66 + %265 = sext i32 %261 to i64, !dbg !67 + %266 = getelementptr bfloat, ptr addrspace(1) %2, i64 %265, !dbg !67 + %267 = sext i32 %262 to i64, !dbg !67 + %268 = getelementptr bfloat, ptr addrspace(1) %2, i64 %267, !dbg !67 + %269 = sext i32 %263 to i64, !dbg !67 + %270 = getelementptr bfloat, ptr addrspace(1) %2, i64 %269, !dbg !67 + %271 = sext i32 %264 to i64, !dbg !67 + %272 = getelementptr bfloat, ptr addrspace(1) %2, i64 %271, !dbg !67 + %273 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !68 + %274 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %266, i64 %273, i1 %260) #6, !dbg !68 + %275 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !68 + %276 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %268, i64 %275, i1 %260) #6, !dbg !68 + %277 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !68 + %278 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %270, i64 %277, i1 %260) #6, !dbg !68 + %279 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !68 + %280 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %272, i64 %279, i1 %260) #6, !dbg !68 + %281 = zext nneg i32 %194 to i64, !dbg !69 + %282 = getelementptr bfloat, ptr addrspace(1) %3, i64 %281, !dbg !69 + %283 = zext nneg i32 %195 to i64, !dbg !69 + %284 = getelementptr bfloat, ptr addrspace(1) %3, i64 %283, !dbg !69 + %285 = zext nneg i32 %196 to i64, !dbg !69 + %286 = getelementptr bfloat, ptr addrspace(1) %3, i64 %285, !dbg !69 + %287 = zext nneg i32 %197 to i64, !dbg !69 + %288 = getelementptr bfloat, ptr addrspace(1) %3, i64 %287, !dbg !69 + %289 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !70 + %290 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %282, i64 %289, i1 %260) #6, !dbg !70 + %291 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !70 + %292 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %284, i64 %291, i1 %260) #6, !dbg !70 + %293 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !70 + %294 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %286, i64 %293, i1 %260) #6, !dbg !70 + %295 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !70 + %296 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %288, i64 %295, i1 %260) #6, !dbg !70 + %297 = fmul float %.0.i8, %61, !dbg !71 + %298 = fmul float %.0.i8, %62, !dbg !71 + %299 = fmul float %.0.i8, %63, !dbg !71 + %300 = fmul float %.0.i8, %64, !dbg !71 + %301 = fmul float %297, %132, !dbg !72 + %302 = fmul float %298, %133, !dbg !72 + %303 = fmul float %299, %134, !dbg !72 + %304 = fmul float %300, %135, !dbg !72 + %305 = fmul float %301, %146, !dbg !73 + %306 = fmul float %302, %147, !dbg !73 + %307 = fmul float %303, %148, !dbg !73 + %308 = fmul float %304, %149, !dbg !73 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !73 + store float %305, ptr addrspace(3) %163, align 4, !dbg !73 + store float %306, ptr addrspace(3) %166, align 4, !dbg !73 + store float %307, ptr addrspace(3) %169, align 4, !dbg !73 + store float %308, ptr addrspace(3) %172, align 4, !dbg !73 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !73 + %309 = load float, ptr addrspace(3) %177, align 4, !dbg !73 + %310 = load float, ptr addrspace(3) %180, align 4, !dbg !73 + %311 = load float, ptr addrspace(3) %183, align 4, !dbg !73 + %312 = load float, ptr addrspace(3) %186, align 4, !dbg !73 + %313 = or i32 %22, 4193, !dbg !74 + %314 = add i32 %204, 4097, !dbg !75 + %315 = or disjoint i32 %314, %194, !dbg !76 + %316 = add i32 %204, 4129, !dbg !75 + %317 = or disjoint i32 %316, %194, !dbg !76 + %318 = add i32 %204, 4161, !dbg !75 + %319 = or disjoint i32 %318, %194, !dbg !76 + %320 = add i32 %204, %313, !dbg !76 + %321 = sext i32 %315 to i64, !dbg !77 + %322 = getelementptr bfloat, ptr addrspace(1) %2, i64 %321, !dbg !77 + %323 = sext i32 %317 to i64, !dbg !77 + %324 = getelementptr bfloat, ptr addrspace(1) %2, i64 %323, !dbg !77 + %325 = sext i32 %319 to i64, !dbg !77 + %326 = getelementptr bfloat, ptr addrspace(1) %2, i64 %325, !dbg !77 + %327 = sext i32 %320 to i64, !dbg !77 + %328 = getelementptr bfloat, ptr addrspace(1) %2, i64 %327, !dbg !77 + %329 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !78 + %330 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %322, i64 %329, i1 %193) #6, !dbg !78 + %331 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !78 + %332 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %324, i64 %331, i1 %193) #6, !dbg !78 + %333 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !78 + %334 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %326, i64 %333, i1 %193) #6, !dbg !78 + %335 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !78 + %336 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %328, i64 %335, i1 %193) #6, !dbg !78 + %337 = tail call float @llvm.nvvm.div.full(float %95, float 1.280000e+02), !dbg !79 + %338 = fadd float %337, 0x3EB0C6F7A0000000, !dbg !80 + %339 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !81 + %.not.i10 = icmp eq i32 %339, 0, !dbg !81 + br i1 %.not.i10, label %342, label %340, !dbg !81 + +340: ; preds = %__nv_rsqrtf.exit9 + %341 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %338), !dbg !81 + br label %__nv_rsqrtf.exit12, !dbg !81 + +342: ; preds = %__nv_rsqrtf.exit9 + %343 = tail call float @llvm.nvvm.rsqrt.approx.f(float %338), !dbg !81 + br label %__nv_rsqrtf.exit12, !dbg !81 + +__nv_rsqrtf.exit12: ; preds = %340, %342 + %.0.i11 = phi float [ %341, %340 ], [ %343, %342 ], !dbg !81 + %344 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !81 + %345 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !81 + %346 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !81 + %.not.i19 = icmp eq i32 %346, 0, !dbg !81 + br i1 %.not.i19, label %349, label %347, !dbg !81 + +347: ; preds = %__nv_rsqrtf.exit12 + %348 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %338), !dbg !81 + br label %__nv_rsqrtf.exit21, !dbg !81 + +349: ; preds = %__nv_rsqrtf.exit12 + %350 = tail call float @llvm.nvvm.rsqrt.approx.f(float %338), !dbg !81 + br label %__nv_rsqrtf.exit21, !dbg !81 + +__nv_rsqrtf.exit21: ; preds = %347, %349 + %.0.i20 = phi float [ %348, %347 ], [ %350, %349 ], !dbg !81 + %351 = bitcast i16 %336 to bfloat, !dbg !78 + %352 = fpext bfloat %351 to float, !dbg !82 + %353 = bitcast i16 %334 to bfloat, !dbg !78 + %354 = fpext bfloat %353 to float, !dbg !82 + %355 = bitcast i16 %332 to bfloat, !dbg !78 + %356 = fpext bfloat %355 to float, !dbg !82 + %357 = bitcast i16 %330 to bfloat, !dbg !78 + %358 = fpext bfloat %357 to float, !dbg !82 + %359 = bitcast i16 %224 to bfloat, !dbg !58 + %360 = fpext bfloat %359 to float, !dbg !83 + %361 = fmul float %243, %360, !dbg !62 + %362 = bitcast i16 %259 to bfloat, !dbg !64 + %363 = fpext bfloat %362 to float, !dbg !84 + %364 = fmul float %361, %363, !dbg !85 + %365 = fsub float 0.000000e+00, %364, !dbg !86 + %366 = bitcast i16 %280 to bfloat, !dbg !68 + %367 = fpext bfloat %366 to float, !dbg !87 + %368 = fmul float %243, %367, !dbg !88 + %369 = bitcast i16 %296 to bfloat, !dbg !70 + %370 = fpext bfloat %369 to float, !dbg !89 + %371 = fmul float %368, %370, !dbg !90 + %372 = select i1 %193, float %365, float %371, !dbg !91 + %373 = fmul float %187, %372, !dbg !92 + %374 = fadd float %373, %312, !dbg !93 + %375 = bitcast i16 %222 to bfloat, !dbg !58 + %376 = fpext bfloat %375 to float, !dbg !83 + %377 = fmul float %243, %376, !dbg !62 + %378 = bitcast i16 %257 to bfloat, !dbg !64 + %379 = fpext bfloat %378 to float, !dbg !84 + %380 = fmul float %377, %379, !dbg !85 + %381 = fsub float 0.000000e+00, %380, !dbg !86 + %382 = bitcast i16 %278 to bfloat, !dbg !68 + %383 = fpext bfloat %382 to float, !dbg !87 + %384 = fmul float %243, %383, !dbg !88 + %385 = bitcast i16 %294 to bfloat, !dbg !70 + %386 = fpext bfloat %385 to float, !dbg !89 + %387 = fmul float %384, %386, !dbg !90 + %388 = select i1 %193, float %381, float %387, !dbg !91 + %389 = fmul float %184, %388, !dbg !92 + %390 = fadd float %389, %311, !dbg !93 + %391 = bitcast i16 %220 to bfloat, !dbg !58 + %392 = fpext bfloat %391 to float, !dbg !83 + %393 = fmul float %243, %392, !dbg !62 + %394 = bitcast i16 %255 to bfloat, !dbg !64 + %395 = fpext bfloat %394 to float, !dbg !84 + %396 = fmul float %393, %395, !dbg !85 + %397 = fsub float 0.000000e+00, %396, !dbg !86 + %398 = bitcast i16 %276 to bfloat, !dbg !68 + %399 = fpext bfloat %398 to float, !dbg !87 + %400 = fmul float %243, %399, !dbg !88 + %401 = bitcast i16 %292 to bfloat, !dbg !70 + %402 = fpext bfloat %401 to float, !dbg !89 + %403 = fmul float %400, %402, !dbg !90 + %404 = select i1 %193, float %397, float %403, !dbg !91 + %405 = fmul float %181, %404, !dbg !92 + %406 = fadd float %405, %310, !dbg !93 + %407 = bitcast i16 %218 to bfloat, !dbg !58 + %408 = fpext bfloat %407 to float, !dbg !83 + %409 = fmul float %243, %408, !dbg !62 + %410 = bitcast i16 %253 to bfloat, !dbg !64 + %411 = fpext bfloat %410 to float, !dbg !84 + %412 = fmul float %409, %411, !dbg !85 + %413 = fsub float 0.000000e+00, %412, !dbg !86 + %414 = bitcast i16 %274 to bfloat, !dbg !68 + %415 = fpext bfloat %414 to float, !dbg !87 + %416 = fmul float %243, %415, !dbg !88 + %417 = bitcast i16 %290 to bfloat, !dbg !70 + %418 = fpext bfloat %417 to float, !dbg !89 + %419 = fmul float %416, %418, !dbg !90 + %420 = select i1 %193, float %413, float %419, !dbg !91 + %421 = fmul float %178, %420, !dbg !92 + %422 = fadd float %421, %309, !dbg !93 + %423 = extractvalue { i32, i32 } %192, 1, !dbg !51 + %424 = bitcast i32 %423 to <2 x bfloat>, !dbg !51 + %425 = extractelement <2 x bfloat> %424, i64 1, !dbg !51 + %426 = fpext bfloat %425 to float, !dbg !94 + %427 = extractelement <2 x bfloat> %424, i64 0, !dbg !51 + %428 = fpext bfloat %427 to float, !dbg !94 + %429 = extractvalue { i32, i32 } %192, 0, !dbg !51 + %430 = bitcast i32 %429 to <2 x bfloat>, !dbg !51 + %431 = extractelement <2 x bfloat> %430, i64 1, !dbg !51 + %432 = fpext bfloat %431 to float, !dbg !94 + %433 = extractelement <2 x bfloat> %430, i64 0, !dbg !51 + %434 = fpext bfloat %433 to float, !dbg !94 + %435 = extractvalue { i32, i32 } %189, 1, !dbg !49 + %436 = bitcast i32 %435 to <2 x bfloat>, !dbg !49 + %437 = extractelement <2 x bfloat> %436, i64 1, !dbg !49 + %438 = fpext bfloat %437 to float, !dbg !95 + %439 = extractelement <2 x bfloat> %436, i64 0, !dbg !49 + %440 = fpext bfloat %439 to float, !dbg !95 + %441 = extractvalue { i32, i32 } %189, 0, !dbg !49 + %442 = bitcast i32 %441 to <2 x bfloat>, !dbg !49 + %443 = extractelement <2 x bfloat> %442, i64 1, !dbg !49 + %444 = fpext bfloat %443 to float, !dbg !95 + %445 = extractelement <2 x bfloat> %442, i64 0, !dbg !49 + %446 = fpext bfloat %445 to float, !dbg !95 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !96 + store float %.0.i11, ptr addrspace(3) %240, align 4, !dbg !96 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !96 + %447 = load float, ptr addrspace(3) %242, align 4, !dbg !96 + %448 = fmul float %447, %358, !dbg !96 + %449 = fmul float %447, %356, !dbg !96 + %450 = fmul float %447, %354, !dbg !96 + %451 = fmul float %447, %352, !dbg !96 + %452 = getelementptr bfloat, ptr addrspace(1) %6, i64 %244, !dbg !97 + %453 = getelementptr bfloat, ptr addrspace(1) %6, i64 %246, !dbg !97 + %454 = getelementptr bfloat, ptr addrspace(1) %6, i64 %248, !dbg !97 + %455 = getelementptr bfloat, ptr addrspace(1) %6, i64 %250, !dbg !97 + %456 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !98 + %457 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %452, i64 %456, i1 %193) #6, !dbg !98 + %458 = bitcast i16 %457 to bfloat, !dbg !98 + %459 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !98 + %460 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %453, i64 %459, i1 %193) #6, !dbg !98 + %461 = bitcast i16 %460 to bfloat, !dbg !98 + %462 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !98 + %463 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %454, i64 %462, i1 %193) #6, !dbg !98 + %464 = bitcast i16 %463 to bfloat, !dbg !98 + %465 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !98 + %466 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %455, i64 %465, i1 %193) #6, !dbg !98 + %467 = bitcast i16 %466 to bfloat, !dbg !98 + %468 = fpext bfloat %458 to float, !dbg !99 + %469 = fpext bfloat %461 to float, !dbg !99 + %470 = fpext bfloat %464 to float, !dbg !99 + %471 = fpext bfloat %467 to float, !dbg !99 + %472 = fmul float %448, %468, !dbg !100 + %473 = fmul float %449, %469, !dbg !100 + %474 = fmul float %450, %470, !dbg !100 + %475 = fmul float %451, %471, !dbg !100 + %476 = fsub float 0.000000e+00, %472, !dbg !101 + %477 = fsub float 0.000000e+00, %473, !dbg !101 + %478 = fsub float 0.000000e+00, %474, !dbg !101 + %479 = fsub float 0.000000e+00, %475, !dbg !101 + %480 = add i32 %204, 4096, !dbg !102 + %481 = or disjoint i32 %480, %194, !dbg !103 + %482 = add i32 %204, 4128, !dbg !102 + %483 = or disjoint i32 %482, %194, !dbg !103 + %484 = add i32 %204, 4160, !dbg !102 + %485 = or disjoint i32 %484, %194, !dbg !103 + %486 = add i32 %204, 4192, !dbg !102 + %487 = or disjoint i32 %486, %194, !dbg !103 + %488 = sext i32 %481 to i64, !dbg !104 + %489 = getelementptr bfloat, ptr addrspace(1) %2, i64 %488, !dbg !104 + %490 = sext i32 %483 to i64, !dbg !104 + %491 = getelementptr bfloat, ptr addrspace(1) %2, i64 %490, !dbg !104 + %492 = sext i32 %485 to i64, !dbg !104 + %493 = getelementptr bfloat, ptr addrspace(1) %2, i64 %492, !dbg !104 + %494 = sext i32 %487 to i64, !dbg !104 + %495 = getelementptr bfloat, ptr addrspace(1) %2, i64 %494, !dbg !104 + %496 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105 + %497 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %489, i64 %496, i1 %260) #6, !dbg !105 + %498 = bitcast i16 %497 to bfloat, !dbg !105 + %499 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105 + %500 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %491, i64 %499, i1 %260) #6, !dbg !105 + %501 = bitcast i16 %500 to bfloat, !dbg !105 + %502 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105 + %503 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %493, i64 %502, i1 %260) #6, !dbg !105 + %504 = bitcast i16 %503 to bfloat, !dbg !105 + %505 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105 + %506 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %495, i64 %505, i1 %260) #6, !dbg !105 + %507 = bitcast i16 %506 to bfloat, !dbg !105 + %508 = fpext bfloat %498 to float, !dbg !106 + %509 = fpext bfloat %501 to float, !dbg !106 + %510 = fpext bfloat %504 to float, !dbg !106 + %511 = fpext bfloat %507 to float, !dbg !106 + %512 = fmul float %447, %508, !dbg !107 + %513 = fmul float %447, %509, !dbg !107 + %514 = fmul float %447, %510, !dbg !107 + %515 = fmul float %447, %511, !dbg !107 + %516 = getelementptr bfloat, ptr addrspace(1) %6, i64 %281, !dbg !108 + %517 = getelementptr bfloat, ptr addrspace(1) %6, i64 %283, !dbg !108 + %518 = getelementptr bfloat, ptr addrspace(1) %6, i64 %285, !dbg !108 + %519 = getelementptr bfloat, ptr addrspace(1) %6, i64 %287, !dbg !108 + %520 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109 + %521 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %516, i64 %520, i1 %260) #6, !dbg !109 + %522 = bitcast i16 %521 to bfloat, !dbg !109 + %523 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109 + %524 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %517, i64 %523, i1 %260) #6, !dbg !109 + %525 = bitcast i16 %524 to bfloat, !dbg !109 + %526 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109 + %527 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %518, i64 %526, i1 %260) #6, !dbg !109 + %528 = bitcast i16 %527 to bfloat, !dbg !109 + %529 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109 + %530 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %519, i64 %529, i1 %260) #6, !dbg !109 + %531 = bitcast i16 %530 to bfloat, !dbg !109 + %532 = fpext bfloat %522 to float, !dbg !110 + %533 = fpext bfloat %525 to float, !dbg !110 + %534 = fpext bfloat %528 to float, !dbg !110 + %535 = fpext bfloat %531 to float, !dbg !110 + %536 = fmul float %512, %532, !dbg !111 + %537 = fmul float %513, %533, !dbg !111 + %538 = fmul float %514, %534, !dbg !111 + %539 = fmul float %515, %535, !dbg !111 + %540 = select i1 %193, float %476, float %536, !dbg !91 + %541 = select i1 %193, float %477, float %537, !dbg !91 + %542 = select i1 %193, float %478, float %538, !dbg !91 + %543 = select i1 %193, float %479, float %539, !dbg !91 + %544 = fmul float %.0.i20, %446, !dbg !112 + %545 = fmul float %.0.i20, %444, !dbg !112 + %546 = fmul float %.0.i20, %440, !dbg !112 + %547 = fmul float %.0.i20, %438, !dbg !112 + %548 = fmul float %544, %434, !dbg !113 + %549 = fmul float %545, %432, !dbg !113 + %550 = fmul float %546, %428, !dbg !113 + %551 = fmul float %547, %426, !dbg !113 + %552 = fmul float %548, %146, !dbg !114 + %553 = fmul float %549, %147, !dbg !114 + %554 = fmul float %550, %148, !dbg !114 + %555 = fmul float %551, %149, !dbg !114 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !114 + store float %552, ptr addrspace(3) %163, align 4, !dbg !114 + store float %553, ptr addrspace(3) %166, align 4, !dbg !114 + store float %554, ptr addrspace(3) %169, align 4, !dbg !114 + store float %555, ptr addrspace(3) %172, align 4, !dbg !114 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !114 + %556 = load float, ptr addrspace(3) %177, align 4, !dbg !114 + %557 = load float, ptr addrspace(3) %180, align 4, !dbg !114 + %558 = load float, ptr addrspace(3) %183, align 4, !dbg !114 + %559 = load float, ptr addrspace(3) %186, align 4, !dbg !114 + %560 = fmul float %178, %540, !dbg !115 + %561 = fmul float %181, %541, !dbg !115 + %562 = fmul float %184, %542, !dbg !115 + %563 = fmul float %187, %543, !dbg !115 + %564 = fadd float %560, %556, !dbg !116 + %565 = fadd float %561, %557, !dbg !116 + %566 = fadd float %562, %558, !dbg !116 + %567 = fadd float %563, %559, !dbg !116 + %568 = shl i32 %18, 7, !dbg !117 + %569 = or disjoint i32 %568, %21, !dbg !118 + %570 = sext i32 %569 to i64, !dbg !119 + %571 = getelementptr bfloat, ptr addrspace(1) %0, i64 %570, !dbg !119 + %572 = fptrunc float %422 to bfloat, !dbg !120 + %573 = fptrunc float %406 to bfloat, !dbg !120 + %574 = fptrunc float %390 to bfloat, !dbg !120 + %575 = fptrunc float %374 to bfloat, !dbg !120 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !120 + %576 = and i32 %14, 15, !dbg !120 + %577 = shl nuw nsw i32 %576, 7, !dbg !120 + %578 = shl nuw nsw i32 %576, 3, !dbg !120 + %579 = and i32 %22, 24, !dbg !120 + %580 = lshr i32 %14, 2, !dbg !120 + %581 = and i32 %580, 4, !dbg !120 + %582 = lshr i32 %14, 4, !dbg !120 + %583 = and i32 %582, 2, !dbg !120 + %584 = or disjoint i32 %577, %581, !dbg !120 + %585 = or disjoint i32 %584, %583, !dbg !120 + %586 = xor i32 %578, %579, !dbg !120 + %587 = or disjoint i32 %585, %586, !dbg !120 + %588 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %587, !dbg !120 + store bfloat %572, ptr addrspace(3) %588, align 2, !dbg !120 + %589 = xor i32 %587, 32, !dbg !120 + %590 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %589, !dbg !120 + store bfloat %573, ptr addrspace(3) %590, align 2, !dbg !120 + %591 = xor i32 %587, 64, !dbg !120 + %592 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %591, !dbg !120 + store bfloat %574, ptr addrspace(3) %592, align 2, !dbg !120 + %593 = xor i32 %587, 96, !dbg !120 + %594 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %593, !dbg !120 + store bfloat %575, ptr addrspace(3) %594, align 2, !dbg !120 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !120 + %595 = shl nuw nsw i32 %14, 2, !dbg !120 + %596 = and i32 %595, 1016, !dbg !120 + %597 = lshr exact i32 %15, 2, !dbg !120 + %598 = shl nuw nsw i32 %14, 1, !dbg !120 + %599 = and i32 %598, 2, !dbg !120 + %600 = xor i32 %596, %597, !dbg !120 + %601 = or disjoint i32 %600, %599, !dbg !120 + %602 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %601, !dbg !120 + %603 = load bfloat, ptr addrspace(3) %602, align 2, !dbg !120 + %604 = getelementptr inbounds nuw i8, ptr addrspace(3) %602, i32 4, !dbg !120 + %605 = load bfloat, ptr addrspace(3) %604, align 2, !dbg !120 + %606 = xor i32 %601, 1088, !dbg !120 + %607 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %606, !dbg !120 + %608 = load bfloat, ptr addrspace(3) %607, align 2, !dbg !120 + %609 = getelementptr inbounds nuw i8, ptr addrspace(3) %607, i32 4, !dbg !120 + %610 = load bfloat, ptr addrspace(3) %609, align 2, !dbg !120 + %611 = insertelement <2 x bfloat> poison, bfloat %603, i64 0, !dbg !120 + %612 = insertelement <2 x bfloat> %611, bfloat %608, i64 1, !dbg !120 + %613 = bitcast <2 x bfloat> %612 to i32, !dbg !120 + %614 = insertelement <2 x bfloat> poison, bfloat %605, i64 0, !dbg !120 + %615 = insertelement <2 x bfloat> %614, bfloat %610, i64 1, !dbg !120 + %616 = bitcast <2 x bfloat> %615 to i32, !dbg !120 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %613, i32 %616, ptr addrspace(1) %571, i1 true) #6, !dbg !120 + %617 = getelementptr bfloat, ptr addrspace(1) %1, i64 %570, !dbg !121 + %618 = fptrunc float %564 to bfloat, !dbg !122 + %619 = fptrunc float %565 to bfloat, !dbg !122 + %620 = fptrunc float %566 to bfloat, !dbg !122 + %621 = fptrunc float %567 to bfloat, !dbg !122 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !122 + store bfloat %618, ptr addrspace(3) %588, align 2, !dbg !122 + store bfloat %619, ptr addrspace(3) %590, align 2, !dbg !122 + store bfloat %620, ptr addrspace(3) %592, align 2, !dbg !122 + store bfloat %621, ptr addrspace(3) %594, align 2, !dbg !122 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !122 + %622 = load bfloat, ptr addrspace(3) %602, align 2, !dbg !122 + %623 = load bfloat, ptr addrspace(3) %604, align 2, !dbg !122 + %624 = load bfloat, ptr addrspace(3) %607, align 2, !dbg !122 + %625 = load bfloat, ptr addrspace(3) %609, align 2, !dbg !122 + %626 = insertelement <2 x bfloat> poison, bfloat %622, i64 0, !dbg !122 + %627 = insertelement <2 x bfloat> %626, bfloat %624, i64 1, !dbg !122 + %628 = bitcast <2 x bfloat> %627 to i32, !dbg !122 + %629 = insertelement <2 x bfloat> poison, bfloat %623, i64 0, !dbg !122 + %630 = insertelement <2 x bfloat> %629, bfloat %625, i64 1, !dbg !122 + %631 = bitcast <2 x bfloat> %630 to i32, !dbg !122 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %628, i32 %631, ptr addrspace(1) %617, i1 true) #6, !dbg !122 + ret void, !dbg !123 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", linkageName: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 23, column: 33, scope: !5) +!10 = !DILocation(line: 24, column: 44, scope: !5) +!11 = !DILocation(line: 24, column: 23, scope: !5) +!12 = !DILocation(line: 26, column: 37, scope: !5) +!13 = !DILocation(line: 29, column: 19, scope: !5) +!14 = !DILocation(line: 28, column: 19, scope: !5) +!15 = !DILocation(line: 39, column: 41, scope: !5) +!16 = !DILocation(line: 39, column: 52, scope: !5) +!17 = !DILocation(line: 39, column: 48, scope: !5) +!18 = !DILocation(line: 39, column: 63, scope: !5) +!19 = !DILocation(line: 39, column: 57, scope: !5) +!20 = !DILocation(line: 39, column: 34, scope: !5) +!21 = !DILocation(line: 39, column: 68, scope: !5) +!22 = !DILocation(line: 39, column: 121, scope: !5) +!23 = !DILocation(line: 40, column: 41, scope: !5) +!24 = !DILocation(line: 40, column: 50, scope: !5) +!25 = !DILocation(line: 40, column: 34, scope: !5) +!26 = !DILocation(line: 40, column: 61, scope: !5) +!27 = !DILocation(line: 40, column: 114, scope: !5) +!28 = !DILocation(line: 42, column: 22, scope: !5) +!29 = !DILocation(line: 47, column: 22, scope: !5) +!30 = !DILocation(line: 263, column: 15, scope: !31, inlinedAt: !33) +!31 = distinct !DILexicalBlockFile(scope: !5, file: !32, discriminator: 0) +!32 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!33 = !DILocation(line: 293, column: 36, scope: !31, inlinedAt: !34) +!34 = !DILocation(line: 51, column: 25, scope: !35) +!35 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!36 = !DILocation(line: 263, column: 15, scope: !31, inlinedAt: !37) +!37 = !DILocation(line: 293, column: 36, scope: !31, inlinedAt: !38) +!38 = !DILocation(line: 52, column: 27, scope: !35) +!39 = !DILocation(line: 58, column: 27, scope: !5) +!40 = !DILocation(line: 62, column: 35, scope: !5) +!41 = !DILocation(line: 62, column: 42, scope: !5) +!42 = !DILocation(line: 62, column: 95, scope: !5) +!43 = !DILocation(line: 63, column: 46, scope: !5) +!44 = !DILocation(line: 63, column: 42, scope: !5) +!45 = !DILocation(line: 63, column: 35, scope: !5) +!46 = !DILocation(line: 63, column: 51, scope: !5) +!47 = !DILocation(line: 64, column: 35, scope: !5) +!48 = !DILocation(line: 64, column: 51, scope: !5) +!49 = !DILocation(line: 65, column: 69, scope: !5) +!50 = !DILocation(line: 66, column: 36, scope: !5) +!51 = !DILocation(line: 66, column: 43, scope: !5) +!52 = !DILocation(line: 71, column: 24, scope: !5) +!53 = !DILocation(line: 72, column: 41, scope: !5) +!54 = !DILocation(line: 72, column: 39, scope: !5) +!55 = !DILocation(line: 72, column: 48, scope: !5) +!56 = !DILocation(line: 72, column: 57, scope: !5) +!57 = !DILocation(line: 72, column: 35, scope: !5) +!58 = !DILocation(line: 72, column: 68, scope: !5) +!59 = !DILocation(line: 75, column: 25, scope: !5) +!60 = !DILocation(line: 77, column: 24, scope: !5) +!61 = !DILocation(line: 78, column: 32, scope: !5) +!62 = !DILocation(line: 79, column: 24, scope: !5) +!63 = !DILocation(line: 80, column: 35, scope: !5) +!64 = !DILocation(line: 80, column: 85, scope: !5) +!65 = !DILocation(line: 87, column: 25, scope: !5) +!66 = !DILocation(line: 90, column: 53, scope: !5) +!67 = !DILocation(line: 90, column: 35, scope: !5) +!68 = !DILocation(line: 90, column: 64, scope: !5) +!69 = !DILocation(line: 98, column: 35, scope: !5) +!70 = !DILocation(line: 98, column: 81, scope: !5) +!71 = !DILocation(line: 111, column: 24, scope: !5) +!72 = !DILocation(line: 113, column: 24, scope: !5) +!73 = !DILocation(line: 116, column: 24, scope: !5) +!74 = !DILocation(line: 121, column: 42, scope: !5) +!75 = !DILocation(line: 121, column: 51, scope: !5) +!76 = !DILocation(line: 121, column: 60, scope: !5) +!77 = !DILocation(line: 121, column: 35, scope: !5) +!78 = !DILocation(line: 121, column: 71, scope: !5) +!79 = !DILocation(line: 123, column: 24, scope: !5) +!80 = !DILocation(line: 124, column: 24, scope: !5) +!81 = !DILocation(line: 125, column: 32, scope: !5) +!82 = !DILocation(line: 121, column: 132, scope: !5) +!83 = !DILocation(line: 72, column: 129, scope: !5) +!84 = !DILocation(line: 80, column: 146, scope: !5) +!85 = !DILocation(line: 82, column: 24, scope: !5) +!86 = !DILocation(line: 84, column: 17, scope: !5) +!87 = !DILocation(line: 90, column: 125, scope: !5) +!88 = !DILocation(line: 97, column: 24, scope: !5) +!89 = !DILocation(line: 98, column: 142, scope: !5) +!90 = !DILocation(line: 100, column: 24, scope: !5) +!91 = !DILocation(line: 0, scope: !5) +!92 = !DILocation(line: 118, column: 24, scope: !5) +!93 = !DILocation(line: 119, column: 24, scope: !5) +!94 = !DILocation(line: 66, column: 96, scope: !5) +!95 = !DILocation(line: 65, column: 123, scope: !5) +!96 = !DILocation(line: 126, column: 24, scope: !5) +!97 = !DILocation(line: 127, column: 35, scope: !5) +!98 = !DILocation(line: 127, column: 85, scope: !5) +!99 = !DILocation(line: 127, column: 146, scope: !5) +!100 = !DILocation(line: 129, column: 24, scope: !5) +!101 = !DILocation(line: 131, column: 17, scope: !5) +!102 = !DILocation(line: 134, column: 51, scope: !5) +!103 = !DILocation(line: 134, column: 60, scope: !5) +!104 = !DILocation(line: 134, column: 35, scope: !5) +!105 = !DILocation(line: 134, column: 71, scope: !5) +!106 = !DILocation(line: 134, column: 132, scope: !5) +!107 = !DILocation(line: 139, column: 24, scope: !5) +!108 = !DILocation(line: 140, column: 35, scope: !5) +!109 = !DILocation(line: 140, column: 81, scope: !5) +!110 = !DILocation(line: 140, column: 142, scope: !5) +!111 = !DILocation(line: 142, column: 24, scope: !5) +!112 = !DILocation(line: 151, column: 25, scope: !5) +!113 = !DILocation(line: 153, column: 26, scope: !5) +!114 = !DILocation(line: 156, column: 26, scope: !5) +!115 = !DILocation(line: 158, column: 26, scope: !5) +!116 = !DILocation(line: 159, column: 26, scope: !5) +!117 = !DILocation(line: 161, column: 43, scope: !5) +!118 = !DILocation(line: 161, column: 39, scope: !5) +!119 = !DILocation(line: 161, column: 32, scope: !5) +!120 = !DILocation(line: 161, column: 55, scope: !5) +!121 = !DILocation(line: 162, column: 32, scope: !5) +!122 = !DILocation(line: 162, column: 56, scope: !5) +!123 = !DILocation(line: 53, column: 4, scope: !5) diff --git a/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..fdd894cfc5a02fcb56e2bd11319824abe46ba5ea --- /dev/null +++ b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx @@ -0,0 +1,1404 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 // -- Begin function triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 +.visible .entry triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0( + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6, + .param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_7, + .param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_8, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_9, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_10 +) +.reqntid 256 +{ + .reg .pred %p<4>; + .reg .b16 %rs<70>; + .reg .b32 %r<306>; + .reg .b64 %rd<97>; + .loc 1 18 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0 + +// %bb.0: // %__nv_rsqrtf.exit + ld.param.b64 %rd80, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0]; + ld.param.b64 %rd81, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:28 + mov.u32 %r24, %ctaid.x; + .loc 1 23 33 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:33 + shl.b32 %r25, %r24, 3; + ld.param.b64 %rd82, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2]; + ld.param.b64 %rd83, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3]; + .loc 1 24 44 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44 + mov.u32 %r26, %tid.x; + and.b32 %r27, %r26, 224; + ld.param.b64 %rd84, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4]; + bfe.u32 %r28, %r26, 5, 3; + ld.param.b64 %rd85, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5]; + and.b32 %r29, %r26, 7; + ld.param.b64 %rd86, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6]; + .loc 1 24 23 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23 + or.b32 %r30, %r28, %r25; + or.b32 %r31, %r25, %r29; + .loc 1 26 37 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37 + and.b32 %r32, %r26, 31; + shl.b32 %r33, %r32, 2; + shr.u32 %r34, %r26, 3; + .loc 1 29 19 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19 + bfe.s32 %r35, %r24, 28, 1; + shr.u32 %r36, %r35, 27; + add.s32 %r37, %r30, %r36; + shr.s32 %r38, %r37, 5; + .loc 1 28 19 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:28:19 + and.b32 %r39, %r37, 33554400; + sub.s32 %r40, %r30, %r39; + .loc 1 29 19 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19 + add.s32 %r41, %r31, %r36; + .loc 1 39 52 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:52 + shl.b32 %r42, %r40, 7; + .loc 1 39 48 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:48 + or.b32 %r43, %r42, %r33; + mad.lo.s32 %r44, %r38, 36864, %r43; + .loc 1 39 57 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:57 + add.s32 %r45, %r44, 4096; + .loc 1 39 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34 + mad.wide.s32 %rd1, %r45, 2, %rd82; + .loc 1 39 68 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r3, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r1, %r3; + mov.u32 %r2, %r3; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2; + // end inline asm + mov.b32 {%rs34, %rs35}, %r1; + mov.b32 {%rs36, %rs37}, %r2; + .loc 1 39 121 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:121 + cvt.f32.bf16 %r46, %rs34; + cvt.f32.bf16 %r47, %rs35; + cvt.f32.bf16 %r48, %rs36; + cvt.f32.bf16 %r49, %rs37; + .loc 1 40 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34 + mad.wide.s32 %rd3, %r44, 2, %rd82; + .loc 1 40 61 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61 + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r4, %r3; + mov.u32 %r5, %r3; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r4, %r5 }, [ %rd3 + 0 ], %rd4; + // end inline asm + mov.b32 {%rs38, %rs39}, %r4; + mov.b32 {%rs40, %rs41}, %r5; + .loc 1 40 114 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114 + cvt.f32.bf16 %r50, %rs38; + cvt.f32.bf16 %r51, %rs39; + cvt.f32.bf16 %r52, %rs40; + cvt.f32.bf16 %r53, %rs41; + .loc 1 42 22 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:42:22 + mul.f32 %r54, %r47, %r47; + .loc 1 47 22 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:47:22 + mul.f32 %r55, %r51, %r51; +$L__tmp1: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + fma.rn.f32 %r56, %r46, %r46, %r54; + fma.rn.f32 %r57, %r48, %r48, %r56; + fma.rn.f32 %r58, %r49, %r49, %r57; +$L__tmp2: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r59, %r58, 16, 31, -1; +$L__tmp3: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r60, %r58, %r59; +$L__tmp4: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r61, %r60, 8, 31, -1; +$L__tmp5: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r62, %r60, %r61; +$L__tmp6: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r63, %r62, 4, 31, -1; +$L__tmp7: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r64, %r62, %r63; +$L__tmp8: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r65, %r64, 2, 31, -1; +$L__tmp9: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r66, %r64, %r65; +$L__tmp10: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r67, %r66, 1, 31, -1; +$L__tmp11: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r68, %r66, %r67; +$L__tmp12: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + fma.rn.f32 %r69, %r50, %r50, %r55; + fma.rn.f32 %r70, %r52, %r52, %r69; + fma.rn.f32 %r71, %r53, %r53, %r70; +$L__tmp13: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r72, %r71, 16, 31, -1; +$L__tmp14: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r73, %r71, %r72; +$L__tmp15: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r74, %r73, 8, 31, -1; +$L__tmp16: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r75, %r73, %r74; +$L__tmp17: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r76, %r75, 4, 31, -1; +$L__tmp18: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r77, %r75, %r76; +$L__tmp19: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r78, %r77, 2, 31, -1; +$L__tmp20: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r79, %r77, %r78; +$L__tmp21: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r80, %r79, 1, 31, -1; +$L__tmp22: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r81, %r79, %r80; +$L__tmp23: + .loc 1 62 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:35 + mul.wide.u32 %rd87, %r33, 2; + add.s64 %rd5, %rd83, %rd87; + .loc 1 62 42 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:42 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r6, %r3; + mov.u32 %r7, %r3; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r6, %r7 }, [ %rd5 + 0 ], %rd6; + // end inline asm + mov.b32 {%rs42, %rs43}, %r6; + mov.b32 {%rs44, %rs45}, %r7; + .loc 1 62 95 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:95 + cvt.f32.bf16 %r82, %rs42; + cvt.f32.bf16 %r83, %rs43; + cvt.f32.bf16 %r84, %rs44; + cvt.f32.bf16 %r85, %rs45; + .loc 1 63 46 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:46 + shl.b32 %r86, %r38, 7; + .loc 1 63 42 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:42 + or.b32 %r87, %r86, %r33; + .loc 1 63 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:35 + mul.wide.s32 %rd88, %r87, 4; + add.s64 %rd7, %rd84, %rd88; + .loc 1 63 51 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:51 + // begin inline asm + mov.u64 %rd8, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd8, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r8, %r3; + mov.u32 %r9, %r3; + mov.u32 %r10, %r3; + mov.u32 %r11, %r3; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r8, %r9, %r10, %r11 }, [ %rd7 + 0 ], %rd8; + // end inline asm + .loc 1 64 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:35 + add.s64 %rd9, %rd85, %rd88; + .loc 1 64 51 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:51 + // begin inline asm + mov.u64 %rd10, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd10, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r12, %r3; + mov.u32 %r13, %r3; + mov.u32 %r14, %r3; + mov.u32 %r15, %r3; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r12, %r13, %r14, %r15 }, [ %rd9 + 0 ], %rd10; + // end inline asm + shl.b32 %r88, %r29, 4; + shl.b32 %r89, %r27, 2; + shr.u32 %r90, %r26, 1; + and.b32 %r91, %r90, 124; + or.b32 %r92, %r88, %r89; + xor.b32 %r93, %r92, %r91; + mov.b32 %r94, global_smem; + add.s32 %r95, %r94, %r93; + st.shared.b32 [%r95], %r12; + xor.b32 %r96, %r93, 4; + add.s32 %r97, %r94, %r96; + st.shared.b32 [%r97+1024], %r13; + xor.b32 %r98, %r93, 8; + add.s32 %r99, %r94, %r98; + st.shared.b32 [%r99+2048], %r14; + xor.b32 %r100, %r93, 12; + add.s32 %r101, %r94, %r100; + st.shared.b32 [%r101+3072], %r15; + bar.sync 0; + shl.b32 %r102, %r32, 7; + xor.b32 %r103, %r88, %r91; + or.b32 %r104, %r103, %r102; + add.s32 %r105, %r94, %r104; + ld.shared.b32 %r106, [%r105]; + xor.b32 %r107, %r104, 4; + add.s32 %r108, %r94, %r107; + ld.shared.b32 %r109, [%r108]; + xor.b32 %r110, %r104, 8; + add.s32 %r111, %r94, %r110; + ld.shared.b32 %r112, [%r111]; + xor.b32 %r113, %r104, 12; + add.s32 %r114, %r94, %r113; + ld.shared.b32 %r115, [%r114]; + .loc 1 65 69 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69 + // begin inline asm + mov.u64 %rd11, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd11, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r16, %r3; + mov.u32 %r17, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r16, %r17 }, [ %rd1 + 0 ], %rd11; + // end inline asm + .loc 1 66 36 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:36 + add.s64 %rd12, %rd86, %rd87; + .loc 1 66 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:43 + // begin inline asm + mov.u64 %rd13, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd13, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r18, %r3; + mov.u32 %r19, %r3; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r18, %r19 }, [ %rd12 + 0 ], %rd13; + // end inline asm + .loc 1 71 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:71:24 + and.b32 %r116, %r34, 1; + setp.ne.b32 %p3, %r116, 0; + not.pred %p2, %p3; + .loc 1 72 41 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:41 + and.b32 %r117, %r34, 30; + .loc 1 72 39 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:39 + or.b32 %r118, %r34, 97; + .loc 1 72 48 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:48 + shl.b32 %r119, %r31, 7; + shl.b32 %r120, %r41, 10; + and.b32 %r121, %r120, -32768; + add.s32 %r122, %r121, %r119; + .loc 1 72 57 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:57 + or.b32 %r123, %r122, %r118; + .loc 1 72 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35 + cvt.s64.s32 %rd89, %r122; + cvt.u64.u32 %rd90, %r117; + or.b64 %rd91, %rd89, %rd90; + shl.b64 %rd92, %rd91, 1; + add.s64 %rd93, %rd82, %rd92; + add.s64 %rd14, %rd93, 2; + add.s64 %rd16, %rd93, 66; + add.s64 %rd18, %rd93, 130; + mad.wide.s32 %rd20, %r123, 2, %rd82; + .loc 1 72 68 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:68 + // begin inline asm + mov.u64 %rd15, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd15, 1.0; + // end inline asm + mov.b16 %rs2, 0; + // begin inline asm + mov.u16 %rs1, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs1 }, [ %rd14 + 0 ], %rd15; + // end inline asm + // begin inline asm + mov.u64 %rd17, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs3, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs3 }, [ %rd16 + 0 ], %rd17; + // end inline asm + // begin inline asm + mov.u64 %rd19, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd19, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs4, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs4 }, [ %rd18 + 0 ], %rd19; + // end inline asm + // begin inline asm + mov.u64 %rd21, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd21, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs5, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs5 }, [ %rd20 + 0 ], %rd21; + // end inline asm + mov.b32 %r124, 0f43000000; + .loc 1 75 25 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:75:25 + div.full.f32 %r125, %r81, %r124; + .loc 1 77 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:77:24 + add.f32 %r126, %r125, 0f358637BD; + .loc 1 78 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:78:32 + rsqrt.approx.ftz.f32 %r127, %r126; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + bar.sync 0; + shr.u32 %r128, %r27, 3; + add.s32 %r129, %r94, %r128; + st.shared.b32 [%r129], %r127; + bar.sync 0; + shl.b32 %r130, %r29, 2; + add.s32 %r131, %r94, %r130; + ld.shared.b32 %r132, [%r131]; + .loc 1 80 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:35 + mul.wide.u32 %rd94, %r117, 2; + add.s64 %rd38, %rd83, %rd94; + add.s64 %rd22, %rd38, 2; + add.s64 %rd24, %rd38, 66; + add.s64 %rd26, %rd38, 130; + mul.wide.u32 %rd95, %r118, 2; + add.s64 %rd28, %rd83, %rd95; + .loc 1 80 85 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:85 + // begin inline asm + mov.u64 %rd23, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd23, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs6, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs6 }, [ %rd22 + 0 ], %rd23; + // end inline asm + // begin inline asm + mov.u64 %rd25, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd25, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs7, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs7 }, [ %rd24 + 0 ], %rd25; + // end inline asm + // begin inline asm + mov.u64 %rd27, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd27, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs8, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs8 }, [ %rd26 + 0 ], %rd27; + // end inline asm + // begin inline asm + mov.u64 %rd29, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd29, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs9, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd28 + 0 ], %rd29; + // end inline asm + .loc 1 90 53 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:53 + or.b32 %r133, %r122, %r117; + .loc 1 90 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:35 + mad.wide.s32 %rd30, %r133, 2, %rd82; + add.s64 %rd32, %rd93, 64; + add.s64 %rd34, %rd93, 128; + add.s64 %rd36, %rd93, 192; + .loc 1 90 64 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:64 + // begin inline asm + mov.u64 %rd31, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd31, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs10, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs10 }, [ %rd30 + 0 ], %rd31; + // end inline asm + // begin inline asm + mov.u64 %rd33, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd33, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs11, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd32 + 0 ], %rd33; + // end inline asm + // begin inline asm + mov.u64 %rd35, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd35, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs12, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd34 + 0 ], %rd35; + // end inline asm + // begin inline asm + mov.u64 %rd37, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd37, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs13, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd36 + 0 ], %rd37; + // end inline asm + .loc 1 98 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:35 + add.s64 %rd40, %rd38, 64; + add.s64 %rd42, %rd38, 128; + add.s64 %rd44, %rd38, 192; + .loc 1 98 81 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:81 + // begin inline asm + mov.u64 %rd39, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd39, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs14, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs14 }, [ %rd38 + 0 ], %rd39; + // end inline asm + // begin inline asm + mov.u64 %rd41, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd41, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs15, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd40 + 0 ], %rd41; + // end inline asm + // begin inline asm + mov.u64 %rd43, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd43, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs16, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs16 }, [ %rd42 + 0 ], %rd43; + // end inline asm + // begin inline asm + mov.u64 %rd45, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd45, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs17, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd44 + 0 ], %rd45; + // end inline asm + .loc 1 111 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:111:24 + mul.f32 %r134, %r127, %r50; + mul.f32 %r135, %r127, %r51; + mul.f32 %r136, %r127, %r52; + mul.f32 %r137, %r127, %r53; + .loc 1 113 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:113:24 + mul.f32 %r138, %r134, %r82; + mul.f32 %r139, %r135, %r83; + mul.f32 %r140, %r136, %r84; + mul.f32 %r141, %r137, %r85; + .loc 1 116 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:116:24 + mul.f32 %r142, %r138, %r8; + mul.f32 %r143, %r139, %r9; + mul.f32 %r144, %r140, %r10; + mul.f32 %r145, %r141, %r11; + bar.sync 0; + st.shared.b32 [%r95], %r142; + st.shared.b32 [%r97+1024], %r143; + st.shared.b32 [%r99+2048], %r144; + st.shared.b32 [%r101+3072], %r145; + bar.sync 0; + ld.shared.b32 %r146, [%r105]; + ld.shared.b32 %r147, [%r108]; + ld.shared.b32 %r148, [%r111]; + ld.shared.b32 %r149, [%r114]; + .loc 1 121 42 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:42 + or.b32 %r150, %r34, 4193; + .loc 1 121 60 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:60 + add.s32 %r151, %r133, 4097; + add.s32 %r152, %r133, 4129; + add.s32 %r153, %r133, 4161; + add.s32 %r154, %r122, %r150; + .loc 1 121 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:35 + mad.wide.s32 %rd46, %r151, 2, %rd82; + mad.wide.s32 %rd48, %r152, 2, %rd82; + mad.wide.s32 %rd50, %r153, 2, %rd82; + mad.wide.s32 %rd52, %r154, 2, %rd82; + .loc 1 121 71 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:71 + // begin inline asm + mov.u64 %rd47, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd47, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs18, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs18 }, [ %rd46 + 0 ], %rd47; + // end inline asm + // begin inline asm + mov.u64 %rd49, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd49, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs19, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd48 + 0 ], %rd49; + // end inline asm + // begin inline asm + mov.u64 %rd51, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd51, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs20, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs20 }, [ %rd50 + 0 ], %rd51; + // end inline asm + // begin inline asm + mov.u64 %rd53, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd53, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs21, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs21 }, [ %rd52 + 0 ], %rd53; + // end inline asm + .loc 1 123 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:123:24 + div.full.f32 %r155, %r68, %r124; + .loc 1 124 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:124:24 + add.f32 %r156, %r155, 0f358637BD; + .loc 1 125 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:125:32 + rsqrt.approx.ftz.f32 %r157, %r156; + .loc 1 121 132 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:132 + cvt.f32.bf16 %r158, %rs21; + cvt.f32.bf16 %r159, %rs20; + cvt.f32.bf16 %r160, %rs19; + cvt.f32.bf16 %r161, %rs18; + .loc 1 72 129 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129 + cvt.f32.bf16 %r162, %rs5; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + mul.f32 %r163, %r132, %r162; + .loc 1 80 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146 + cvt.f32.bf16 %r164, %rs9; + .loc 1 84 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17 + neg.f32 %r165, %r163; + fma.rn.f32 %r166, %r165, %r164, 0f00000000; + .loc 1 90 125 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125 + cvt.f32.bf16 %r167, %rs13; + .loc 1 97 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24 + mul.f32 %r168, %r132, %r167; + .loc 1 98 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142 + cvt.f32.bf16 %r169, %rs17; + .loc 1 100 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24 + mul.f32 %r170, %r168, %r169; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r171, %r170, %r166, %p3; + .loc 1 119 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24 + fma.rn.f32 %r172, %r115, %r171, %r149; + .loc 1 72 129 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129 + cvt.f32.bf16 %r173, %rs4; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + mul.f32 %r174, %r132, %r173; + .loc 1 80 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146 + cvt.f32.bf16 %r175, %rs8; + .loc 1 84 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17 + neg.f32 %r176, %r174; + fma.rn.f32 %r177, %r176, %r175, 0f00000000; + .loc 1 90 125 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125 + cvt.f32.bf16 %r178, %rs12; + .loc 1 97 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24 + mul.f32 %r179, %r132, %r178; + .loc 1 98 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142 + cvt.f32.bf16 %r180, %rs16; + .loc 1 100 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24 + mul.f32 %r181, %r179, %r180; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r182, %r181, %r177, %p3; + .loc 1 119 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24 + fma.rn.f32 %r183, %r112, %r182, %r148; + .loc 1 72 129 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129 + cvt.f32.bf16 %r184, %rs3; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + mul.f32 %r185, %r132, %r184; + .loc 1 80 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146 + cvt.f32.bf16 %r186, %rs7; + .loc 1 84 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17 + neg.f32 %r187, %r185; + fma.rn.f32 %r188, %r187, %r186, 0f00000000; + .loc 1 90 125 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125 + cvt.f32.bf16 %r189, %rs11; + .loc 1 97 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24 + mul.f32 %r190, %r132, %r189; + .loc 1 98 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142 + cvt.f32.bf16 %r191, %rs15; + .loc 1 100 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24 + mul.f32 %r192, %r190, %r191; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r193, %r192, %r188, %p3; + .loc 1 119 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24 + fma.rn.f32 %r194, %r109, %r193, %r147; + .loc 1 72 129 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129 + cvt.f32.bf16 %r195, %rs1; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + mul.f32 %r196, %r132, %r195; + .loc 1 80 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146 + cvt.f32.bf16 %r197, %rs6; + .loc 1 84 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17 + neg.f32 %r198, %r196; + fma.rn.f32 %r199, %r198, %r197, 0f00000000; + .loc 1 90 125 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125 + cvt.f32.bf16 %r200, %rs10; + .loc 1 97 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24 + mul.f32 %r201, %r132, %r200; + .loc 1 98 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142 + cvt.f32.bf16 %r202, %rs14; + .loc 1 100 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24 + mul.f32 %r203, %r201, %r202; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r204, %r203, %r199, %p3; + .loc 1 119 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24 + fma.rn.f32 %r205, %r106, %r204, %r146; + .loc 1 66 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:43 + mov.b32 {%rs46, %rs47}, %r19; + .loc 1 66 96 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:96 + cvt.f32.bf16 %r206, %rs47; + cvt.f32.bf16 %r207, %rs46; + .loc 1 66 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:43 + mov.b32 {%rs48, %rs49}, %r18; + .loc 1 66 96 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:96 + cvt.f32.bf16 %r208, %rs49; + cvt.f32.bf16 %r209, %rs48; + .loc 1 65 69 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69 + mov.b32 {%rs50, %rs51}, %r17; + .loc 1 65 123 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:123 + cvt.f32.bf16 %r210, %rs51; + cvt.f32.bf16 %r211, %rs50; + .loc 1 65 69 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69 + mov.b32 {%rs52, %rs53}, %r16; + .loc 1 65 123 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:123 + cvt.f32.bf16 %r212, %rs53; + cvt.f32.bf16 %r213, %rs52; + .loc 1 126 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24 + bar.sync 0; + st.shared.b32 [%r129], %r157; + bar.sync 0; + ld.shared.b32 %r214, [%r131]; + mul.f32 %r215, %r214, %r161; + mul.f32 %r216, %r214, %r160; + mul.f32 %r217, %r214, %r159; + mul.f32 %r218, %r214, %r158; + .loc 1 127 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:35 + add.s64 %rd70, %rd86, %rd94; + add.s64 %rd54, %rd70, 2; + add.s64 %rd56, %rd70, 66; + add.s64 %rd58, %rd70, 130; + add.s64 %rd60, %rd86, %rd95; + .loc 1 127 85 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:85 + // begin inline asm + mov.u64 %rd55, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd55, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs22, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs22 }, [ %rd54 + 0 ], %rd55; + // end inline asm + // begin inline asm + mov.u64 %rd57, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd57, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs23, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs23 }, [ %rd56 + 0 ], %rd57; + // end inline asm + // begin inline asm + mov.u64 %rd59, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd59, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs24, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs24 }, [ %rd58 + 0 ], %rd59; + // end inline asm + // begin inline asm + mov.u64 %rd61, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd61, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs25, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs25 }, [ %rd60 + 0 ], %rd61; + // end inline asm + .loc 1 127 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:146 + cvt.f32.bf16 %r219, %rs22; + cvt.f32.bf16 %r220, %rs23; + cvt.f32.bf16 %r221, %rs24; + cvt.f32.bf16 %r222, %rs25; + .loc 1 131 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:131:17 + neg.f32 %r223, %r215; + fma.rn.f32 %r224, %r223, %r219, 0f00000000; + neg.f32 %r225, %r216; + fma.rn.f32 %r226, %r225, %r220, 0f00000000; + neg.f32 %r227, %r217; + fma.rn.f32 %r228, %r227, %r221, 0f00000000; + neg.f32 %r229, %r218; + fma.rn.f32 %r230, %r229, %r222, 0f00000000; + .loc 1 134 60 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:60 + add.s32 %r231, %r133, 4096; + add.s32 %r232, %r133, 4128; + add.s32 %r233, %r133, 4160; + add.s32 %r234, %r133, 4192; + .loc 1 134 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:35 + mad.wide.s32 %rd62, %r231, 2, %rd82; + mad.wide.s32 %rd64, %r232, 2, %rd82; + mad.wide.s32 %rd66, %r233, 2, %rd82; + mad.wide.s32 %rd68, %r234, 2, %rd82; + .loc 1 134 71 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:71 + // begin inline asm + mov.u64 %rd63, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd63, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs26, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs26 }, [ %rd62 + 0 ], %rd63; + // end inline asm + // begin inline asm + mov.u64 %rd65, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd65, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs27, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs27 }, [ %rd64 + 0 ], %rd65; + // end inline asm + // begin inline asm + mov.u64 %rd67, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd67, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs28, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs28 }, [ %rd66 + 0 ], %rd67; + // end inline asm + // begin inline asm + mov.u64 %rd69, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd69, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs29, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs29 }, [ %rd68 + 0 ], %rd69; + // end inline asm + .loc 1 134 132 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:132 + cvt.f32.bf16 %r235, %rs26; + cvt.f32.bf16 %r236, %rs27; + cvt.f32.bf16 %r237, %rs28; + cvt.f32.bf16 %r238, %rs29; + .loc 1 139 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:139:24 + mul.f32 %r239, %r214, %r235; + mul.f32 %r240, %r214, %r236; + mul.f32 %r241, %r214, %r237; + mul.f32 %r242, %r214, %r238; + .loc 1 140 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:35 + add.s64 %rd72, %rd70, 64; + add.s64 %rd74, %rd70, 128; + add.s64 %rd76, %rd70, 192; + .loc 1 140 81 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:81 + // begin inline asm + mov.u64 %rd71, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd71, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs30, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs30 }, [ %rd70 + 0 ], %rd71; + // end inline asm + // begin inline asm + mov.u64 %rd73, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd73, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs31, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs31 }, [ %rd72 + 0 ], %rd73; + // end inline asm + // begin inline asm + mov.u64 %rd75, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd75, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs32, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs32 }, [ %rd74 + 0 ], %rd75; + // end inline asm + // begin inline asm + mov.u64 %rd77, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd77, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs33, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs33 }, [ %rd76 + 0 ], %rd77; + // end inline asm + .loc 1 140 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:142 + cvt.f32.bf16 %r243, %rs30; + cvt.f32.bf16 %r244, %rs31; + cvt.f32.bf16 %r245, %rs32; + cvt.f32.bf16 %r246, %rs33; + .loc 1 142 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:142:24 + mul.f32 %r247, %r239, %r243; + mul.f32 %r248, %r240, %r244; + mul.f32 %r249, %r241, %r245; + mul.f32 %r250, %r242, %r246; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r251, %r247, %r224, %p3; + selp.f32 %r252, %r248, %r226, %p3; + selp.f32 %r253, %r249, %r228, %p3; + selp.f32 %r254, %r250, %r230, %p3; + .loc 1 151 25 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:151:25 + mul.f32 %r255, %r157, %r213; + mul.f32 %r256, %r157, %r212; + mul.f32 %r257, %r157, %r211; + mul.f32 %r258, %r157, %r210; + .loc 1 153 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:153:26 + mul.f32 %r259, %r255, %r209; + mul.f32 %r260, %r256, %r208; + mul.f32 %r261, %r257, %r207; + mul.f32 %r262, %r258, %r206; + .loc 1 156 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26 + mul.f32 %r263, %r259, %r8; + mul.f32 %r264, %r260, %r9; + mul.f32 %r265, %r261, %r10; + mul.f32 %r266, %r262, %r11; + bar.sync 0; + st.shared.b32 [%r95], %r263; + st.shared.b32 [%r97+1024], %r264; + st.shared.b32 [%r99+2048], %r265; + st.shared.b32 [%r101+3072], %r266; + bar.sync 0; + ld.shared.b32 %r267, [%r105]; + ld.shared.b32 %r268, [%r108]; + ld.shared.b32 %r269, [%r111]; + ld.shared.b32 %r270, [%r114]; + .loc 1 159 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:159:26 + fma.rn.f32 %r271, %r106, %r251, %r267; + fma.rn.f32 %r272, %r109, %r252, %r268; + fma.rn.f32 %r273, %r112, %r253, %r269; + fma.rn.f32 %r274, %r115, %r254, %r270; + .loc 1 161 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:43 + shl.b32 %r275, %r30, 7; + .loc 1 161 39 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:39 + or.b32 %r276, %r275, %r33; + .loc 1 161 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:32 + mul.wide.s32 %rd96, %r276, 2; + add.s64 %rd78, %rd80, %rd96; + .loc 1 161 55 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:55 + cvt.rn.bf16.f32 %rs54, %r205; + cvt.rn.bf16.f32 %rs55, %r194; + cvt.rn.bf16.f32 %rs56, %r183; + cvt.rn.bf16.f32 %rs57, %r172; + bar.sync 0; + and.b32 %r277, %r26, 15; + shl.b32 %r278, %r277, 7; + shl.b32 %r279, %r277, 3; + and.b32 %r280, %r34, 24; + shr.u32 %r281, %r26, 2; + and.b32 %r282, %r281, 4; + shr.u32 %r283, %r26, 4; + and.b32 %r284, %r283, 2; + or.b32 %r285, %r278, %r282; + or.b32 %r286, %r285, %r284; + xor.b32 %r287, %r279, %r280; + or.b32 %r288, %r286, %r287; + add.s32 %r289, %r94, %r288; + st.shared.b16 [%r289], %rs54; + xor.b32 %r290, %r288, 32; + add.s32 %r291, %r94, %r290; + st.shared.b16 [%r291], %rs55; + xor.b32 %r292, %r288, 64; + add.s32 %r293, %r94, %r292; + st.shared.b16 [%r293], %rs56; + xor.b32 %r294, %r288, 96; + add.s32 %r295, %r94, %r294; + st.shared.b16 [%r295], %rs57; + bar.sync 0; + shl.b32 %r296, %r26, 2; + and.b32 %r297, %r296, 1016; + shr.u32 %r298, %r27, 2; + shl.b32 %r299, %r26, 1; + and.b32 %r300, %r299, 2; + xor.b32 %r301, %r297, %r298; + or.b32 %r302, %r301, %r300; + add.s32 %r303, %r94, %r302; + ld.shared.b16 %rs58, [%r303]; + ld.shared.b16 %rs59, [%r303+4]; + xor.b32 %r304, %r302, 64; + add.s32 %r305, %r94, %r304; + ld.shared.b16 %rs60, [%r305+1024]; + ld.shared.b16 %rs61, [%r305+1028]; + mov.b32 %r20, {%rs58, %rs60}; + mov.b32 %r21, {%rs59, %rs61}; + // begin inline asm + @%p1 st.global.v2.b32 [ %rd78 + 0 ], { %r20, %r21 }; + // end inline asm + .loc 1 162 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:32 + add.s64 %rd79, %rd81, %rd96; + .loc 1 162 56 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:56 + cvt.rn.bf16.f32 %rs62, %r271; + cvt.rn.bf16.f32 %rs63, %r272; + cvt.rn.bf16.f32 %rs64, %r273; + cvt.rn.bf16.f32 %rs65, %r274; + bar.sync 0; + st.shared.b16 [%r289], %rs62; + st.shared.b16 [%r291], %rs63; + st.shared.b16 [%r293], %rs64; + st.shared.b16 [%r295], %rs65; + bar.sync 0; + ld.shared.b16 %rs66, [%r303]; + ld.shared.b16 %rs67, [%r303+4]; + ld.shared.b16 %rs68, [%r305+1024]; + ld.shared.b16 %rs69, [%r305+1028]; + mov.b32 %r22, {%rs66, %rs68}; + mov.b32 %r23, {%rs67, %rs69}; + // begin inline asm + @%p1 st.global.v2.b32 [ %rd79 + 0 ], { %r22, %r23 }; + // end inline asm + .loc 1 53 4 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:4 + ret; +$L__tmp24: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 456 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x1c1 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 98 +.b8 118 +.b8 113 +.b8 104 +.b8 106 +.b8 116 +.b8 121 +.b8 103 +.b8 55 +.b8 102 +.b8 118 +.b8 120 +.b8 122 +.b8 119 +.b8 116 +.b8 98 +.b8 116 +.b8 116 +.b8 52 +.b8 118 +.b8 114 +.b8 100 +.b8 107 +.b8 98 +.b8 110 +.b8 98 +.b8 54 +.b8 110 +.b8 51 +.b8 50 +.b8 102 +.b8 110 +.b8 114 +.b8 105 +.b8 106 +.b8 106 +.b8 112 +.b8 108 +.b8 51 +.b8 118 +.b8 118 +.b8 52 +.b8 99 +.b8 102 +.b8 113 +.b8 100 +.b8 52 +.b8 109 +.b8 122 +.b8 110 +.b8 114 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 98 +.b8 118 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x6d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 114 +.b8 109 +.b8 115 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 100 +.b8 100 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 110 +.b8 101 +.b8 103 +.b8 95 +.b8 115 +.b8 112 +.b8 108 +.b8 105 +.b8 116 +.b8 95 +.b8 115 +.b8 112 +.b8 108 +.b8 105 +.b8 116 +.b8 95 +.b8 119 +.b8 105 +.b8 116 +.b8 104 +.b8 95 +.b8 115 +.b8 105 +.b8 122 +.b8 101 +.b8 115 +.b8 95 +.b8 115 +.b8 116 +.b8 97 +.b8 99 +.b8 107 +.b8 95 +.b8 117 +.b8 110 +.b8 98 +.b8 105 +.b8 110 +.b8 100 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x151:0x7a DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x166:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 51 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x17e:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 4 // Abbrev [4] 0x198:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp12 // DW_AT_low_pc +.b64 $L__tmp23 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 52 // DW_AT_call_line +.b8 27 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x1b0:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp12 // DW_AT_low_pc +.b64 $L__tmp23 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source new file mode 100644 index 0000000000000000000000000000000000000000..35b704f94f17e6e09792453584cc30d2e765f42b --- /dev/null +++ b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source @@ -0,0 +1,972 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc213 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0) +#loc215 = loc(unknown) +#loc218 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0) +#loc222 = loc("in_out_ptr0"(#loc)) +#loc223 = loc("in_out_ptr1"(#loc)) +#loc224 = loc("in_ptr0"(#loc)) +#loc225 = loc("in_ptr1"(#loc)) +#loc226 = loc("in_ptr2"(#loc)) +#loc227 = loc("in_ptr3"(#loc)) +#loc228 = loc("in_ptr4"(#loc)) +#loc229 = loc("xnumel"(#loc)) +#loc230 = loc("r0_numel"(#loc)) +#loc432 = loc("input"(#loc213)) +#loc433 = loc("a"(#loc218)) +#loc434 = loc("b"(#loc218)) +module { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 73728 : i32 loc(#loc231) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc232) + %xoffset = tt.get_program_id x : i32 loc(#loc233) + %xoffset_2 = arith.constant 8 : i32 loc(#loc234) + %xoffset_3 = arith.constant 8 : i32 loc(#loc234) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc234) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc235) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc236) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<8x1xi32> loc(#loc237) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<8x1xi32> loc(#loc237) + %xmask = arith.constant true loc(#loc238) + %xmask_8 = arith.constant dense : tensor<8x128xi1> loc(#loc238) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc239) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc240) + %x0 = arith.constant 32 : i32 loc(#loc241) + %x0_10 = arith.constant 32 : i32 loc(#loc241) + %x0_11 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc241) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<8x1xi32> loc(#loc241) + %x1 = arith.constant 32 : i32 loc(#loc242) + %x1_13 = arith.constant 32 : i32 loc(#loc242) + %x1_14 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc242) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<8x1xi32> loc(#loc242) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc243) + %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc243) + %_tmp10 = arith.constant 0.000000e+00 : f32 loc(#loc244) + %_tmp10_17 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc244) + %c0_i32 = arith.constant 0 : i32 loc(#loc15) + %c128_i32 = arith.constant 128 : i32 loc(#loc15) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15) + %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc15) + %3 = ub.poison : i32 loc(#loc15) + %_tmp10_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_16, %_tmp10_24 = %_tmp10_17) -> (tensor<8x128xf32>, tensor<8x128xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc246) + %r0_index_25 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc246) + %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc247) + %r0_mask_26 = arith.cmpi slt, %r0_index_25, %r0_mask : tensor<1x128xi32> loc(#loc247) + %tmp0 = arith.constant 4096 : i32 loc(#loc248) + %tmp0_27 = arith.constant 4096 : i32 loc(#loc248) + %tmp0_28 = arith.constant dense<4096> : tensor<1x128xi32> loc(#loc248) + %tmp0_29 = arith.addi %tmp0_28, %r0_index_25 : tensor<1x128xi32> loc(#loc248) + %tmp0_30 = arith.constant 128 : i32 loc(#loc249) + %tmp0_31 = arith.constant 128 : i32 loc(#loc249) + %tmp0_32 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc249) + %tmp0_33 = arith.muli %tmp0_32, %x0_12 : tensor<8x1xi32> loc(#loc249) + %tmp0_34 = tt.broadcast %tmp0_29 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc250) + %tmp0_35 = tt.broadcast %tmp0_33 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc250) + %tmp0_36 = arith.addi %tmp0_34, %tmp0_35 : tensor<8x128xi32> loc(#loc250) + %tmp0_37 = arith.constant 36864 : i32 loc(#loc251) + %tmp0_38 = arith.constant 36864 : i32 loc(#loc251) + %tmp0_39 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc251) + %tmp0_40 = arith.muli %tmp0_39, %x1_15 : tensor<8x1xi32> loc(#loc251) + %tmp0_41 = tt.broadcast %tmp0_40 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc252) + %tmp0_42 = arith.addi %tmp0_36, %tmp0_41 : tensor<8x128xi32> loc(#loc252) + %tmp0_43 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc253) + %tmp0_44 = tt.addptr %tmp0_43, %tmp0_42 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc253) + %tmp0_45 = arith.constant 0.000000e+00 : f32 loc(#loc254) + %tmp0_46 = tt.broadcast %r0_mask_26 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc254) + %tmp0_47 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc254) + %tmp0_48 = arith.truncf %tmp0_47 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc254) + %tmp0_49 = tt.load %tmp0_44, %tmp0_46, %tmp0_48 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc254) + %tmp0_50 = arith.extf %tmp0_49 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc255) + %tmp6 = arith.constant 128 : i32 loc(#loc256) + %tmp6_51 = arith.constant 128 : i32 loc(#loc256) + %tmp6_52 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc256) + %tmp6_53 = arith.muli %tmp6_52, %x0_12 : tensor<8x1xi32> loc(#loc256) + %tmp6_54 = tt.broadcast %r0_index_25 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc257) + %tmp6_55 = tt.broadcast %tmp6_53 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc257) + %tmp6_56 = arith.addi %tmp6_54, %tmp6_55 : tensor<8x128xi32> loc(#loc257) + %tmp6_57 = arith.constant 36864 : i32 loc(#loc258) + %tmp6_58 = arith.constant 36864 : i32 loc(#loc258) + %tmp6_59 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc258) + %tmp6_60 = arith.muli %tmp6_59, %x1_15 : tensor<8x1xi32> loc(#loc258) + %tmp6_61 = tt.broadcast %tmp6_60 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc259) + %tmp6_62 = arith.addi %tmp6_56, %tmp6_61 : tensor<8x128xi32> loc(#loc259) + %tmp6_63 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc260) + %tmp6_64 = tt.addptr %tmp6_63, %tmp6_62 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc260) + %tmp6_65 = arith.constant 0.000000e+00 : f32 loc(#loc261) + %tmp6_66 = tt.broadcast %r0_mask_26 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc261) + %tmp6_67 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc261) + %tmp6_68 = arith.truncf %tmp6_67 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc261) + %tmp6_69 = tt.load %tmp6_64, %tmp6_66, %tmp6_68 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc261) + %tmp6_70 = arith.extf %tmp6_69 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc262) + %tmp2 = arith.mulf %tmp0_50, %tmp0_50 : tensor<8x128xf32> loc(#loc263) + %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<8x128xf32> loc(#loc264) + %_tmp4_71 = tt.broadcast %r0_mask_26 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc265) + %_tmp4_72 = arith.select %_tmp4_71, %tmp5, %_tmp4_23 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc265) + %tmp8 = arith.mulf %tmp6_70, %tmp6_70 : tensor<8x128xf32> loc(#loc266) + %tmp11 = arith.addf %_tmp10_24, %tmp8 : tensor<8x128xf32> loc(#loc267) + %_tmp10_73 = tt.broadcast %r0_mask_26 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc268) + %_tmp10_74 = arith.select %_tmp10_73, %tmp11, %_tmp10_24 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc268) + scf.yield %_tmp4_72, %_tmp10_74 : tensor<8x128xf32>, tensor<8x128xf32> loc(#loc39) + } loc(#loc435) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S8_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#0) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc269) + %tmp4_19 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc270) + %tmp10 = tt.call @"triton.language.standard.sum__fp32S8_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#1) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc271) + %tmp10_20 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc272) + %c0_i32_21 = arith.constant 0 : i32 loc(#loc44) + %c128_i32_22 = arith.constant 128 : i32 loc(#loc44) + %4 = arith.bitcast %c0_i32_21 : i32 to i32 loc(#loc44) + %5 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc44) + %6 = arith.bitcast %c128_i32_22 : i32 to i32 loc(#loc44) + %7 = ub.poison : i32 loc(#loc44) + scf.for %r0_offset = %4 to %5 step %6 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc273) + %r0_index_23 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc273) + %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc274) + %r0_mask_24 = arith.cmpi slt, %r0_index_23, %r0_mask : tensor<1x128xi32> loc(#loc274) + %r0_3 = arith.constant 2 : i32 loc(#loc275) + %r0_3_25 = arith.constant 2 : i32 loc(#loc275) + %r0_3_26 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc275) + %r0_3_27 = arith.remsi %r0_index_23, %r0_3_26 : tensor<1x128xi32> loc(#loc275) + %r0_4 = arith.constant 2 : i32 loc(#loc276) + %r0_4_28 = arith.constant 2 : i32 loc(#loc276) + %r0_4_29 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc276) + %r0_4_30 = arith.divsi %r0_index_23, %r0_4_29 : tensor<1x128xi32> loc(#loc276) + %tmp50 = arith.constant 128 : i32 loc(#loc277) + %tmp50_31 = arith.constant 128 : i32 loc(#loc277) + %tmp50_32 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc277) + %tmp50_33 = arith.muli %tmp50_32, %x0_12 : tensor<8x1xi32> loc(#loc277) + %tmp50_34 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc278) + %tmp50_35 = tt.broadcast %tmp50_33 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc278) + %tmp50_36 = arith.addi %tmp50_34, %tmp50_35 : tensor<8x128xi32> loc(#loc278) + %tmp50_37 = arith.constant 36864 : i32 loc(#loc279) + %tmp50_38 = arith.constant 36864 : i32 loc(#loc279) + %tmp50_39 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc279) + %tmp50_40 = arith.muli %tmp50_39, %x1_15 : tensor<8x1xi32> loc(#loc279) + %tmp50_41 = tt.broadcast %tmp50_40 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc280) + %tmp50_42 = arith.addi %tmp50_36, %tmp50_41 : tensor<8x128xi32> loc(#loc280) + %tmp50_43 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc281) + %tmp50_44 = tt.addptr %tmp50_43, %tmp50_42 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc281) + %tmp50_45 = arith.constant 0.000000e+00 : f32 loc(#loc282) + %tmp50_46 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc282) + %tmp50_47 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc282) + %tmp50_48 = arith.truncf %tmp50_47 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc282) + %tmp50_49 = tt.load %tmp50_44, %tmp50_46, %tmp50_48 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc282) + %tmp50_50 = arith.extf %tmp50_49 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc283) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc284) + %tmp58_51 = tt.addptr %tmp58, %r0_index_23 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc284) + %tmp58_52 = arith.constant 0.000000e+00 : f32 loc(#loc285) + %tmp58_53 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> loc(#loc285) + %tmp58_54 = arith.truncf %tmp58_53 : tensor<1x128xf32> to tensor<1x128xbf16> loc(#loc285) + %tmp58_55 = tt.load %tmp58_51, %r0_mask_24, %tmp58_54 evictionPolicy = evict_last : tensor<1x128x!tt.ptr> loc(#loc285) + %tmp58_56 = arith.extf %tmp58_55 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc286) + %tmp63 = arith.constant 128 : i32 loc(#loc287) + %tmp63_57 = arith.constant 128 : i32 loc(#loc287) + %tmp63_58 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc287) + %tmp63_59 = arith.muli %tmp63_58, %x1_15 : tensor<8x1xi32> loc(#loc287) + %tmp63_60 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc288) + %tmp63_61 = tt.broadcast %tmp63_59 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc288) + %tmp63_62 = arith.addi %tmp63_60, %tmp63_61 : tensor<8x128xi32> loc(#loc288) + %tmp63_63 = tt.splat %in_ptr2 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc289) + %tmp63_64 = tt.addptr %tmp63_63, %tmp63_62 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc289) + %tmp63_65 = arith.constant 0.000000e+00 : f32 loc(#loc290) + %tmp63_66 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc290) + %tmp63_67 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc290) + %tmp63_68 = tt.load %tmp63_64, %tmp63_66, %tmp63_67 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc290) + %tmp66 = arith.constant 128 : i32 loc(#loc291) + %tmp66_69 = arith.constant 128 : i32 loc(#loc291) + %tmp66_70 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc291) + %tmp66_71 = arith.muli %tmp66_70, %x1_15 : tensor<8x1xi32> loc(#loc291) + %tmp66_72 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc292) + %tmp66_73 = tt.broadcast %tmp66_71 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc292) + %tmp66_74 = arith.addi %tmp66_72, %tmp66_73 : tensor<8x128xi32> loc(#loc292) + %tmp66_75 = tt.splat %in_ptr3 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc293) + %tmp66_76 = tt.addptr %tmp66_75, %tmp66_74 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc293) + %tmp66_77 = arith.constant 0.000000e+00 : f32 loc(#loc294) + %tmp66_78 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc294) + %tmp66_79 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc294) + %tmp66_80 = tt.load %tmp66_76, %tmp66_78, %tmp66_79 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc294) + %tmp96 = arith.constant 4096 : i32 loc(#loc295) + %tmp96_81 = arith.constant 4096 : i32 loc(#loc295) + %tmp96_82 = arith.constant dense<4096> : tensor<1x128xi32> loc(#loc295) + %tmp96_83 = arith.addi %tmp96_82, %r0_index_23 : tensor<1x128xi32> loc(#loc295) + %tmp96_84 = arith.constant 128 : i32 loc(#loc296) + %tmp96_85 = arith.constant 128 : i32 loc(#loc296) + %tmp96_86 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc296) + %tmp96_87 = arith.muli %tmp96_86, %x0_12 : tensor<8x1xi32> loc(#loc296) + %tmp96_88 = tt.broadcast %tmp96_83 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc297) + %tmp96_89 = tt.broadcast %tmp96_87 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc297) + %tmp96_90 = arith.addi %tmp96_88, %tmp96_89 : tensor<8x128xi32> loc(#loc297) + %tmp96_91 = arith.constant 36864 : i32 loc(#loc298) + %tmp96_92 = arith.constant 36864 : i32 loc(#loc298) + %tmp96_93 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc298) + %tmp96_94 = arith.muli %tmp96_93, %x1_15 : tensor<8x1xi32> loc(#loc298) + %tmp96_95 = tt.broadcast %tmp96_94 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc299) + %tmp96_96 = arith.addi %tmp96_90, %tmp96_95 : tensor<8x128xi32> loc(#loc299) + %tmp96_97 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc300) + %tmp96_98 = tt.addptr %tmp96_97, %tmp96_96 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc300) + %tmp96_99 = arith.constant 0.000000e+00 : f32 loc(#loc301) + %tmp96_100 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc301) + %tmp96_101 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc301) + %tmp96_102 = arith.truncf %tmp96_101 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc301) + %tmp96_103 = tt.load %tmp96_98, %tmp96_100, %tmp96_102 evictionPolicy = evict_first : tensor<8x128x!tt.ptr> loc(#loc301) + %tmp96_104 = arith.extf %tmp96_103 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc302) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc303) + %tmp102_105 = tt.addptr %tmp102, %r0_index_23 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc303) + %tmp102_106 = arith.constant 0.000000e+00 : f32 loc(#loc304) + %tmp102_107 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> loc(#loc304) + %tmp102_108 = arith.truncf %tmp102_107 : tensor<1x128xf32> to tensor<1x128xbf16> loc(#loc304) + %tmp102_109 = tt.load %tmp102_105, %r0_mask_24, %tmp102_108 evictionPolicy = evict_last : tensor<1x128x!tt.ptr> loc(#loc304) + %tmp102_110 = arith.extf %tmp102_109 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc305) + %tmp13 = arith.constant 0 : i64 loc(#loc306) + %tmp13_111 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc306) + %tmp14 = arith.extsi %r0_3_27 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc307) + %tmp14_112 = arith.constant dense<0> : tensor<1x128xi64> loc(#loc307) + %tmp14_113 = arith.cmpi sge, %tmp14, %tmp14_112 : tensor<1x128xi64> loc(#loc307) + %tmp15 = arith.constant 1 : i64 loc(#loc308) + %tmp15_114 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc308) + %tmp16 = arith.extsi %r0_3_27 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc309) + %tmp16_115 = arith.constant dense<1> : tensor<1x128xi64> loc(#loc309) + %tmp16_116 = arith.cmpi slt, %tmp16, %tmp16_115 : tensor<1x128xi64> loc(#loc309) + %tmp17 = arith.constant 2 : i32 loc(#loc310) + %tmp17_117 = arith.constant 2 : i32 loc(#loc310) + %tmp17_118 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc310) + %tmp17_119 = arith.muli %tmp17_118, %r0_4_30 : tensor<1x128xi32> loc(#loc310) + %tmp17_120 = arith.constant 1 : i32 loc(#loc311) + %tmp17_121 = arith.constant 1 : i32 loc(#loc311) + %tmp17_122 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc311) + %tmp17_123 = arith.addi %tmp17_122, %tmp17_119 : tensor<1x128xi32> loc(#loc311) + %tmp17_124 = arith.constant 128 : i32 loc(#loc312) + %tmp17_125 = arith.constant 128 : i32 loc(#loc312) + %tmp17_126 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc312) + %tmp17_127 = arith.muli %tmp17_126, %x0_12 : tensor<8x1xi32> loc(#loc312) + %tmp17_128 = tt.broadcast %tmp17_123 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc313) + %tmp17_129 = tt.broadcast %tmp17_127 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc313) + %tmp17_130 = arith.addi %tmp17_128, %tmp17_129 : tensor<8x128xi32> loc(#loc313) + %tmp17_131 = arith.constant 36864 : i32 loc(#loc314) + %tmp17_132 = arith.constant 36864 : i32 loc(#loc314) + %tmp17_133 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc314) + %tmp17_134 = arith.muli %tmp17_133, %x1_15 : tensor<8x1xi32> loc(#loc314) + %tmp17_135 = tt.broadcast %tmp17_134 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc315) + %tmp17_136 = arith.addi %tmp17_130, %tmp17_135 : tensor<8x128xi32> loc(#loc315) + %tmp17_137 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc316) + %tmp17_138 = tt.addptr %tmp17_137, %tmp17_136 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc316) + %tmp17_139 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x128xi1> loc(#loc317) + %tmp17_140 = arith.constant 0.000000e+00 : f32 loc(#loc318) + %tmp17_141 = tt.broadcast %tmp17_139 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc318) + %tmp17_142 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc318) + %tmp17_143 = arith.truncf %tmp17_142 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc318) + %tmp17_144 = tt.load %tmp17_138, %tmp17_141, %tmp17_143 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc318) + %tmp17_145 = arith.extf %tmp17_144 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc319) + %tmp19 = arith.constant 1.280000e+02 : f32 loc(#loc320) + %tmp20 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc321) + %tmp20_146 = arith.divf %tmp10_20, %tmp20 : tensor<8x1xf32> loc(#loc321) + %tmp21 = arith.constant 9.99999997E-7 : f32 loc(#loc322) + %tmp22 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc323) + %tmp22_147 = arith.addf %tmp20_146, %tmp22 : tensor<8x1xf32> loc(#loc323) + %tmp23 = tt.extern_elementwise %tmp22_147 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc324) + %tmp24 = tt.broadcast %tmp23 : tensor<8x1xf32> -> tensor<8x128xf32> loc(#loc325) + %tmp24_148 = arith.mulf %tmp17_145, %tmp24 : tensor<8x128xf32> loc(#loc325) + %tmp25 = arith.constant 2 : i32 loc(#loc326) + %tmp25_149 = arith.constant 2 : i32 loc(#loc326) + %tmp25_150 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc326) + %tmp25_151 = arith.muli %tmp25_150, %r0_4_30 : tensor<1x128xi32> loc(#loc326) + %tmp25_152 = arith.constant 1 : i32 loc(#loc327) + %tmp25_153 = arith.constant 1 : i32 loc(#loc327) + %tmp25_154 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc327) + %tmp25_155 = arith.addi %tmp25_154, %tmp25_151 : tensor<1x128xi32> loc(#loc327) + %tmp25_156 = tt.broadcast %tmp25_155 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc328) + %tmp25_157 = tt.splat %in_ptr1 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc329) + %tmp25_158 = tt.addptr %tmp25_157, %tmp25_156 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc329) + %tmp25_159 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x128xi1> loc(#loc330) + %tmp25_160 = arith.constant 0.000000e+00 : f32 loc(#loc331) + %tmp25_161 = tt.broadcast %tmp25_159 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc331) + %tmp25_162 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc331) + %tmp25_163 = arith.truncf %tmp25_162 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc331) + %tmp25_164 = tt.load %tmp25_158, %tmp25_161, %tmp25_163 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc331) + %tmp25_165 = arith.extf %tmp25_164 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc332) + %tmp27 = arith.mulf %tmp24_148, %tmp25_165 : tensor<8x128xf32> loc(#loc333) + %tmp29 = arith.constant 0.000000e+00 : f32 loc(#loc334) + %tmp29_166 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc334) + %tmp29_167 = arith.subf %tmp29_166, %tmp27 : tensor<8x128xf32> loc(#loc334) + %tmp30 = arith.constant 0.000000e+00 : f32 loc(#loc335) + %tmp30_168 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc335) + %tmp31 = tt.broadcast %tmp16_116 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc336) + %tmp31_169 = arith.select %tmp31, %tmp29_167, %tmp30_168 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc336) + %tmp32 = arith.extsi %r0_3_27 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc337) + %tmp32_170 = arith.constant dense<1> : tensor<1x128xi64> loc(#loc337) + %tmp32_171 = arith.cmpi sge, %tmp32, %tmp32_170 : tensor<1x128xi64> loc(#loc337) + %tmp33 = arith.constant 2 : i64 loc(#loc338) + %tmp33_172 = arith.constant dense<2> : tensor<1x1xi64> loc(#loc338) + %tmp34 = arith.extsi %r0_3_27 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc339) + %tmp34_173 = arith.constant dense<2> : tensor<1x128xi64> loc(#loc339) + %tmp34_174 = arith.cmpi slt, %tmp34, %tmp34_173 : tensor<1x128xi64> loc(#loc339) + %tmp35 = arith.constant 2 : i32 loc(#loc340) + %tmp35_175 = arith.constant 2 : i32 loc(#loc340) + %tmp35_176 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc340) + %tmp35_177 = arith.muli %tmp35_176, %r0_4_30 : tensor<1x128xi32> loc(#loc340) + %tmp35_178 = arith.constant 128 : i32 loc(#loc341) + %tmp35_179 = arith.constant 128 : i32 loc(#loc341) + %tmp35_180 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc341) + %tmp35_181 = arith.muli %tmp35_180, %x0_12 : tensor<8x1xi32> loc(#loc341) + %tmp35_182 = tt.broadcast %tmp35_177 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc342) + %tmp35_183 = tt.broadcast %tmp35_181 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc342) + %tmp35_184 = arith.addi %tmp35_182, %tmp35_183 : tensor<8x128xi32> loc(#loc342) + %tmp35_185 = arith.constant 36864 : i32 loc(#loc343) + %tmp35_186 = arith.constant 36864 : i32 loc(#loc343) + %tmp35_187 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc343) + %tmp35_188 = arith.muli %tmp35_187, %x1_15 : tensor<8x1xi32> loc(#loc343) + %tmp35_189 = tt.broadcast %tmp35_188 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc344) + %tmp35_190 = arith.addi %tmp35_184, %tmp35_189 : tensor<8x128xi32> loc(#loc344) + %tmp35_191 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc345) + %tmp35_192 = tt.addptr %tmp35_191, %tmp35_190 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc345) + %tmp35_193 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x128xi1> loc(#loc346) + %tmp35_194 = arith.constant 0.000000e+00 : f32 loc(#loc347) + %tmp35_195 = tt.broadcast %tmp35_193 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc347) + %tmp35_196 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc347) + %tmp35_197 = arith.truncf %tmp35_196 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc347) + %tmp35_198 = tt.load %tmp35_192, %tmp35_195, %tmp35_197 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc347) + %tmp35_199 = arith.extf %tmp35_198 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc348) + %tmp37 = arith.constant 1.280000e+02 : f32 loc(#loc349) + %tmp38 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc350) + %tmp38_200 = arith.divf %tmp10_20, %tmp38 : tensor<8x1xf32> loc(#loc350) + %tmp39 = arith.constant 9.99999997E-7 : f32 loc(#loc351) + %tmp40 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc352) + %tmp40_201 = arith.addf %tmp38_200, %tmp40 : tensor<8x1xf32> loc(#loc352) + %tmp41 = tt.extern_elementwise %tmp40_201 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc353) + %tmp42 = tt.broadcast %tmp41 : tensor<8x1xf32> -> tensor<8x128xf32> loc(#loc354) + %tmp42_202 = arith.mulf %tmp35_199, %tmp42 : tensor<8x128xf32> loc(#loc354) + %tmp43 = arith.constant 2 : i32 loc(#loc355) + %tmp43_203 = arith.constant 2 : i32 loc(#loc355) + %tmp43_204 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc355) + %tmp43_205 = arith.muli %tmp43_204, %r0_4_30 : tensor<1x128xi32> loc(#loc355) + %tmp43_206 = tt.broadcast %tmp43_205 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc356) + %tmp43_207 = tt.splat %in_ptr1 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc357) + %tmp43_208 = tt.addptr %tmp43_207, %tmp43_206 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc357) + %tmp43_209 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x128xi1> loc(#loc358) + %tmp43_210 = arith.constant 0.000000e+00 : f32 loc(#loc359) + %tmp43_211 = tt.broadcast %tmp43_209 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc359) + %tmp43_212 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc359) + %tmp43_213 = arith.truncf %tmp43_212 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc359) + %tmp43_214 = tt.load %tmp43_208, %tmp43_211, %tmp43_213 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc359) + %tmp43_215 = arith.extf %tmp43_214 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc360) + %tmp45 = arith.mulf %tmp42_202, %tmp43_215 : tensor<8x128xf32> loc(#loc361) + %tmp47 = arith.constant 0.000000e+00 : f32 loc(#loc362) + %tmp47_216 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc362) + %tmp48 = tt.broadcast %tmp32_171 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc363) + %tmp48_217 = arith.select %tmp48, %tmp45, %tmp47_216 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc363) + %tmp49 = tt.broadcast %tmp16_116 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc364) + %tmp49_218 = arith.select %tmp49, %tmp31_169, %tmp48_217 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc364) + %tmp52 = arith.constant 1.280000e+02 : f32 loc(#loc365) + %tmp53 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc366) + %tmp53_219 = arith.divf %tmp10_20, %tmp53 : tensor<8x1xf32> loc(#loc366) + %tmp54 = arith.constant 9.99999997E-7 : f32 loc(#loc367) + %tmp55 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc368) + %tmp55_220 = arith.addf %tmp53_219, %tmp55 : tensor<8x1xf32> loc(#loc368) + %tmp56 = tt.extern_elementwise %tmp55_220 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc369) + %tmp57 = tt.broadcast %tmp56 : tensor<8x1xf32> -> tensor<8x128xf32> loc(#loc370) + %tmp57_221 = arith.mulf %tmp50_50, %tmp57 : tensor<8x128xf32> loc(#loc370) + %tmp60 = tt.broadcast %tmp58_56 : tensor<1x128xf32> -> tensor<8x128xf32> loc(#loc371) + %tmp60_222 = arith.mulf %tmp57_221, %tmp60 : tensor<8x128xf32> loc(#loc371) + %tmp64 = arith.mulf %tmp60_222, %tmp63_68 : tensor<8x128xf32> loc(#loc372) + %tmp67 = arith.mulf %tmp49_218, %tmp66_80 : tensor<8x128xf32> loc(#loc373) + %tmp68 = arith.addf %tmp64, %tmp67 : tensor<8x128xf32> loc(#loc374) + %tmp70 = arith.constant 2 : i32 loc(#loc375) + %tmp70_223 = arith.constant 2 : i32 loc(#loc375) + %tmp70_224 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc375) + %tmp70_225 = arith.muli %tmp70_224, %r0_4_30 : tensor<1x128xi32> loc(#loc375) + %tmp70_226 = arith.constant 4097 : i32 loc(#loc376) + %tmp70_227 = arith.constant 4097 : i32 loc(#loc376) + %tmp70_228 = arith.constant dense<4097> : tensor<1x128xi32> loc(#loc376) + %tmp70_229 = arith.addi %tmp70_228, %tmp70_225 : tensor<1x128xi32> loc(#loc376) + %tmp70_230 = arith.constant 128 : i32 loc(#loc377) + %tmp70_231 = arith.constant 128 : i32 loc(#loc377) + %tmp70_232 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc377) + %tmp70_233 = arith.muli %tmp70_232, %x0_12 : tensor<8x1xi32> loc(#loc377) + %tmp70_234 = tt.broadcast %tmp70_229 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc378) + %tmp70_235 = tt.broadcast %tmp70_233 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc378) + %tmp70_236 = arith.addi %tmp70_234, %tmp70_235 : tensor<8x128xi32> loc(#loc378) + %tmp70_237 = arith.constant 36864 : i32 loc(#loc379) + %tmp70_238 = arith.constant 36864 : i32 loc(#loc379) + %tmp70_239 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc379) + %tmp70_240 = arith.muli %tmp70_239, %x1_15 : tensor<8x1xi32> loc(#loc379) + %tmp70_241 = tt.broadcast %tmp70_240 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc380) + %tmp70_242 = arith.addi %tmp70_236, %tmp70_241 : tensor<8x128xi32> loc(#loc380) + %tmp70_243 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc381) + %tmp70_244 = tt.addptr %tmp70_243, %tmp70_242 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc381) + %tmp70_245 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x128xi1> loc(#loc382) + %tmp70_246 = arith.constant 0.000000e+00 : f32 loc(#loc383) + %tmp70_247 = tt.broadcast %tmp70_245 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc383) + %tmp70_248 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc383) + %tmp70_249 = arith.truncf %tmp70_248 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc383) + %tmp70_250 = tt.load %tmp70_244, %tmp70_247, %tmp70_249 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc383) + %tmp70_251 = arith.extf %tmp70_250 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc384) + %tmp72 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc385) + %tmp72_252 = arith.divf %tmp4_19, %tmp72 : tensor<8x1xf32> loc(#loc385) + %tmp73 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc386) + %tmp73_253 = arith.addf %tmp72_252, %tmp73 : tensor<8x1xf32> loc(#loc386) + %tmp74 = tt.extern_elementwise %tmp73_253 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc387) + %tmp75 = tt.broadcast %tmp74 : tensor<8x1xf32> -> tensor<8x128xf32> loc(#loc388) + %tmp75_254 = arith.mulf %tmp70_251, %tmp75 : tensor<8x128xf32> loc(#loc388) + %tmp76 = arith.constant 2 : i32 loc(#loc389) + %tmp76_255 = arith.constant 2 : i32 loc(#loc389) + %tmp76_256 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc389) + %tmp76_257 = arith.muli %tmp76_256, %r0_4_30 : tensor<1x128xi32> loc(#loc389) + %tmp76_258 = arith.constant 1 : i32 loc(#loc390) + %tmp76_259 = arith.constant 1 : i32 loc(#loc390) + %tmp76_260 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc390) + %tmp76_261 = arith.addi %tmp76_260, %tmp76_257 : tensor<1x128xi32> loc(#loc390) + %tmp76_262 = tt.broadcast %tmp76_261 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc391) + %tmp76_263 = tt.splat %in_ptr4 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc392) + %tmp76_264 = tt.addptr %tmp76_263, %tmp76_262 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc392) + %tmp76_265 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x128xi1> loc(#loc393) + %tmp76_266 = arith.constant 0.000000e+00 : f32 loc(#loc394) + %tmp76_267 = tt.broadcast %tmp76_265 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc394) + %tmp76_268 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc394) + %tmp76_269 = arith.truncf %tmp76_268 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc394) + %tmp76_270 = tt.load %tmp76_264, %tmp76_267, %tmp76_269 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc394) + %tmp76_271 = arith.extf %tmp76_270 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc395) + %tmp78 = arith.mulf %tmp75_254, %tmp76_271 : tensor<8x128xf32> loc(#loc396) + %tmp80 = arith.constant 0.000000e+00 : f32 loc(#loc397) + %tmp80_272 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc397) + %tmp80_273 = arith.subf %tmp80_272, %tmp78 : tensor<8x128xf32> loc(#loc397) + %tmp81 = arith.constant 0.000000e+00 : f32 loc(#loc398) + %tmp81_274 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc398) + %tmp82 = tt.broadcast %tmp16_116 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc399) + %tmp82_275 = arith.select %tmp82, %tmp80_273, %tmp81_274 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc399) + %tmp83 = arith.constant 2 : i32 loc(#loc400) + %tmp83_276 = arith.constant 2 : i32 loc(#loc400) + %tmp83_277 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc400) + %tmp83_278 = arith.muli %tmp83_277, %r0_4_30 : tensor<1x128xi32> loc(#loc400) + %tmp83_279 = arith.constant 4096 : i32 loc(#loc401) + %tmp83_280 = arith.constant 4096 : i32 loc(#loc401) + %tmp83_281 = arith.constant dense<4096> : tensor<1x128xi32> loc(#loc401) + %tmp83_282 = arith.addi %tmp83_281, %tmp83_278 : tensor<1x128xi32> loc(#loc401) + %tmp83_283 = arith.constant 128 : i32 loc(#loc402) + %tmp83_284 = arith.constant 128 : i32 loc(#loc402) + %tmp83_285 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc402) + %tmp83_286 = arith.muli %tmp83_285, %x0_12 : tensor<8x1xi32> loc(#loc402) + %tmp83_287 = tt.broadcast %tmp83_282 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc403) + %tmp83_288 = tt.broadcast %tmp83_286 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc403) + %tmp83_289 = arith.addi %tmp83_287, %tmp83_288 : tensor<8x128xi32> loc(#loc403) + %tmp83_290 = arith.constant 36864 : i32 loc(#loc404) + %tmp83_291 = arith.constant 36864 : i32 loc(#loc404) + %tmp83_292 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc404) + %tmp83_293 = arith.muli %tmp83_292, %x1_15 : tensor<8x1xi32> loc(#loc404) + %tmp83_294 = tt.broadcast %tmp83_293 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc405) + %tmp83_295 = arith.addi %tmp83_289, %tmp83_294 : tensor<8x128xi32> loc(#loc405) + %tmp83_296 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc406) + %tmp83_297 = tt.addptr %tmp83_296, %tmp83_295 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc406) + %tmp83_298 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x128xi1> loc(#loc407) + %tmp83_299 = arith.constant 0.000000e+00 : f32 loc(#loc408) + %tmp83_300 = tt.broadcast %tmp83_298 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc408) + %tmp83_301 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc408) + %tmp83_302 = arith.truncf %tmp83_301 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc408) + %tmp83_303 = tt.load %tmp83_297, %tmp83_300, %tmp83_302 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc408) + %tmp83_304 = arith.extf %tmp83_303 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc409) + %tmp85 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc410) + %tmp85_305 = arith.divf %tmp4_19, %tmp85 : tensor<8x1xf32> loc(#loc410) + %tmp86 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc411) + %tmp86_306 = arith.addf %tmp85_305, %tmp86 : tensor<8x1xf32> loc(#loc411) + %tmp87 = tt.extern_elementwise %tmp86_306 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc412) + %tmp88 = tt.broadcast %tmp87 : tensor<8x1xf32> -> tensor<8x128xf32> loc(#loc413) + %tmp88_307 = arith.mulf %tmp83_304, %tmp88 : tensor<8x128xf32> loc(#loc413) + %tmp89 = arith.constant 2 : i32 loc(#loc414) + %tmp89_308 = arith.constant 2 : i32 loc(#loc414) + %tmp89_309 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc414) + %tmp89_310 = arith.muli %tmp89_309, %r0_4_30 : tensor<1x128xi32> loc(#loc414) + %tmp89_311 = tt.broadcast %tmp89_310 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc415) + %tmp89_312 = tt.splat %in_ptr4 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc416) + %tmp89_313 = tt.addptr %tmp89_312, %tmp89_311 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc416) + %tmp89_314 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x128xi1> loc(#loc417) + %tmp89_315 = arith.constant 0.000000e+00 : f32 loc(#loc418) + %tmp89_316 = tt.broadcast %tmp89_314 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc418) + %tmp89_317 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc418) + %tmp89_318 = arith.truncf %tmp89_317 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc418) + %tmp89_319 = tt.load %tmp89_313, %tmp89_316, %tmp89_318 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc418) + %tmp89_320 = arith.extf %tmp89_319 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc419) + %tmp91 = arith.mulf %tmp88_307, %tmp89_320 : tensor<8x128xf32> loc(#loc420) + %tmp93 = arith.constant 0.000000e+00 : f32 loc(#loc421) + %tmp93_321 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc421) + %tmp94 = tt.broadcast %tmp32_171 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc422) + %tmp94_322 = arith.select %tmp94, %tmp91, %tmp93_321 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc422) + %tmp95 = tt.broadcast %tmp16_116 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc423) + %tmp95_323 = arith.select %tmp95, %tmp82_275, %tmp94_322 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc423) + %tmp98 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc424) + %tmp98_324 = arith.divf %tmp4_19, %tmp98 : tensor<8x1xf32> loc(#loc424) + %tmp99 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc425) + %tmp99_325 = arith.addf %tmp98_324, %tmp99 : tensor<8x1xf32> loc(#loc425) + %tmp100 = tt.extern_elementwise %tmp99_325 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc426) + %tmp101 = tt.broadcast %tmp100 : tensor<8x1xf32> -> tensor<8x128xf32> loc(#loc427) + %tmp101_326 = arith.mulf %tmp96_104, %tmp101 : tensor<8x128xf32> loc(#loc427) + %tmp104 = tt.broadcast %tmp102_110 : tensor<1x128xf32> -> tensor<8x128xf32> loc(#loc428) + %tmp104_327 = arith.mulf %tmp101_326, %tmp104 : tensor<8x128xf32> loc(#loc428) + %tmp107 = arith.mulf %tmp104_327, %tmp63_68 : tensor<8x128xf32> loc(#loc429) + %tmp109 = arith.mulf %tmp95_323, %tmp66_80 : tensor<8x128xf32> loc(#loc430) + %tmp110 = arith.addf %tmp107, %tmp109 : tensor<8x128xf32> loc(#loc431) + %c128_i32_328 = arith.constant 128 : i32 loc(#loc204) + %c128_i32_329 = arith.constant 128 : i32 loc(#loc204) + %cst = arith.constant dense<128> : tensor<8x1xi32> loc(#loc204) + %8 = arith.muli %cst, %xindex_7 : tensor<8x1xi32> loc(#loc204) + %9 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc205) + %10 = tt.broadcast %8 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc205) + %11 = arith.addi %9, %10 : tensor<8x128xi32> loc(#loc205) + %12 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc206) + %13 = tt.addptr %12, %11 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc206) + %14 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc207) + %15 = arith.truncf %tmp68 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc207) + tt.store %13, %15, %14 : tensor<8x128x!tt.ptr> loc(#loc207) + %c128_i32_330 = arith.constant 128 : i32 loc(#loc208) + %c128_i32_331 = arith.constant 128 : i32 loc(#loc208) + %cst_332 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc208) + %16 = arith.muli %cst_332, %xindex_7 : tensor<8x1xi32> loc(#loc208) + %17 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc209) + %18 = tt.broadcast %16 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc209) + %19 = arith.addi %17, %18 : tensor<8x128xi32> loc(#loc209) + %20 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc210) + %21 = tt.addptr %20, %19 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc210) + %22 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc211) + %23 = arith.truncf %tmp110 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc211) + tt.store %21, %23, %22 : tensor<8x128x!tt.ptr> loc(#loc211) + } loc(#loc44) + tt.return loc(#loc212) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S8_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x128xf32> loc("input"(#loc213))) -> tensor<8xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc214) + tt.reduce.return %2 : f32 loc(#loc214) + }) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc214) + tt.return %0 : tensor<8xf32> loc(#loc216) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8xf32> loc(#loc217) + tt.return %1 : tensor<8xf32> loc(#loc217) + } loc(#loc213) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc218)), %b: f32 loc("b"(#loc218))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc219) + tt.return %0 : f32 loc(#loc220) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc221) + tt.return %1 : f32 loc(#loc221) + } loc(#loc218) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":25:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":30:43) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":32:44) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:45) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:56) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:46) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:42) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:53) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:64) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":68:35) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":69:25) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":70:35) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:52) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:63) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":74:16) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":76:16) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:57) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:55) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:63) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:95) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":85:42) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":88:35) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":89:24) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:37) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:48) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:59) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":92:16) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":93:25) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":94:16) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":95:24) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":96:32) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:53) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:59) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:91) +#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":102:42) +#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":106:16) +#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":107:25) +#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":108:16) +#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":109:24) +#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":110:32) +#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:44) +#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc149 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:55) +#loc150 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc151 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:66) +#loc152 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc153 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc154 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:81) +#loc155 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc156 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc157 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc158 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc159 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc160 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc161 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:57) +#loc162 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:55) +#loc163 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:63) +#loc164 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc165 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:95) +#loc166 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc167 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc168 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc169 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc170 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":132:42) +#loc171 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc172 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:44) +#loc173 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc174 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:55) +#loc175 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc176 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:66) +#loc177 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc178 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc179 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:81) +#loc180 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc181 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc182 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":136:24) +#loc183 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":137:24) +#loc184 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":138:32) +#loc185 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc186 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:53) +#loc187 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:59) +#loc188 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc189 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:91) +#loc190 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc191 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc192 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc193 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":144:42) +#loc194 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc195 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc196 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":148:24) +#loc197 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":149:24) +#loc198 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":150:33) +#loc199 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc200 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc201 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc202 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc203 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc204 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc205 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc206 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc207 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc208 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:43) +#loc209 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:39) +#loc210 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc211 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc212 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc214 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc216 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11) +#loc217 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4) +#loc219 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc220 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11) +#loc221 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4) +#loc231 = loc("xnumel"(#loc1)) +#loc232 = loc("r0_numel"(#loc2)) +#loc233 = loc("xoffset"(#loc3)) +#loc234 = loc("xoffset"(#loc4)) +#loc235 = loc("xindex"(#loc5)) +#loc236 = loc("xindex"(#loc6)) +#loc237 = loc("xindex"(#loc7)) +#loc238 = loc("xmask"(#loc8)) +#loc239 = loc("r0_base"(#loc9)) +#loc240 = loc("r0_base"(#loc10)) +#loc241 = loc("x0"(#loc11)) +#loc242 = loc("x1"(#loc12)) +#loc243 = loc("_tmp4"(#loc13)) +#loc244 = loc("_tmp10"(#loc14)) +#loc245 = loc("_tmp4"(#loc15)) +#loc246 = loc("r0_index"(#loc16)) +#loc247 = loc("r0_mask"(#loc17)) +#loc248 = loc("tmp0"(#loc18)) +#loc249 = loc("tmp0"(#loc19)) +#loc250 = loc("tmp0"(#loc20)) +#loc251 = loc("tmp0"(#loc21)) +#loc252 = loc("tmp0"(#loc22)) +#loc253 = loc("tmp0"(#loc23)) +#loc254 = loc("tmp0"(#loc24)) +#loc255 = loc("tmp0"(#loc25)) +#loc256 = loc("tmp6"(#loc26)) +#loc257 = loc("tmp6"(#loc27)) +#loc258 = loc("tmp6"(#loc28)) +#loc259 = loc("tmp6"(#loc29)) +#loc260 = loc("tmp6"(#loc30)) +#loc261 = loc("tmp6"(#loc31)) +#loc262 = loc("tmp6"(#loc32)) +#loc263 = loc("tmp2"(#loc33)) +#loc264 = loc("tmp5"(#loc34)) +#loc265 = loc("_tmp4"(#loc35)) +#loc266 = loc("tmp8"(#loc36)) +#loc267 = loc("tmp11"(#loc37)) +#loc268 = loc("_tmp10"(#loc38)) +#loc269 = loc("tmp4"(#loc40)) +#loc270 = loc("tmp4"(#loc41)) +#loc271 = loc("tmp10"(#loc42)) +#loc272 = loc("tmp10"(#loc43)) +#loc273 = loc("r0_index"(#loc45)) +#loc274 = loc("r0_mask"(#loc46)) +#loc275 = loc("r0_3"(#loc47)) +#loc276 = loc("r0_4"(#loc48)) +#loc277 = loc("tmp50"(#loc49)) +#loc278 = loc("tmp50"(#loc50)) +#loc279 = loc("tmp50"(#loc51)) +#loc280 = loc("tmp50"(#loc52)) +#loc281 = loc("tmp50"(#loc53)) +#loc282 = loc("tmp50"(#loc54)) +#loc283 = loc("tmp50"(#loc55)) +#loc284 = loc("tmp58"(#loc56)) +#loc285 = loc("tmp58"(#loc57)) +#loc286 = loc("tmp58"(#loc58)) +#loc287 = loc("tmp63"(#loc59)) +#loc288 = loc("tmp63"(#loc60)) +#loc289 = loc("tmp63"(#loc61)) +#loc290 = loc("tmp63"(#loc62)) +#loc291 = loc("tmp66"(#loc63)) +#loc292 = loc("tmp66"(#loc64)) +#loc293 = loc("tmp66"(#loc65)) +#loc294 = loc("tmp66"(#loc66)) +#loc295 = loc("tmp96"(#loc67)) +#loc296 = loc("tmp96"(#loc68)) +#loc297 = loc("tmp96"(#loc69)) +#loc298 = loc("tmp96"(#loc70)) +#loc299 = loc("tmp96"(#loc71)) +#loc300 = loc("tmp96"(#loc72)) +#loc301 = loc("tmp96"(#loc73)) +#loc302 = loc("tmp96"(#loc74)) +#loc303 = loc("tmp102"(#loc75)) +#loc304 = loc("tmp102"(#loc76)) +#loc305 = loc("tmp102"(#loc77)) +#loc306 = loc("tmp13"(#loc78)) +#loc307 = loc("tmp14"(#loc79)) +#loc308 = loc("tmp15"(#loc80)) +#loc309 = loc("tmp16"(#loc81)) +#loc310 = loc("tmp17"(#loc82)) +#loc311 = loc("tmp17"(#loc83)) +#loc312 = loc("tmp17"(#loc84)) +#loc313 = loc("tmp17"(#loc85)) +#loc314 = loc("tmp17"(#loc86)) +#loc315 = loc("tmp17"(#loc87)) +#loc316 = loc("tmp17"(#loc88)) +#loc317 = loc("tmp17"(#loc89)) +#loc318 = loc("tmp17"(#loc90)) +#loc319 = loc("tmp17"(#loc91)) +#loc320 = loc("tmp19"(#loc92)) +#loc321 = loc("tmp20"(#loc93)) +#loc322 = loc("tmp21"(#loc94)) +#loc323 = loc("tmp22"(#loc95)) +#loc324 = loc("tmp23"(#loc96)) +#loc325 = loc("tmp24"(#loc97)) +#loc326 = loc("tmp25"(#loc98)) +#loc327 = loc("tmp25"(#loc99)) +#loc328 = loc("tmp25"(#loc100)) +#loc329 = loc("tmp25"(#loc101)) +#loc330 = loc("tmp25"(#loc102)) +#loc331 = loc("tmp25"(#loc103)) +#loc332 = loc("tmp25"(#loc104)) +#loc333 = loc("tmp27"(#loc105)) +#loc334 = loc("tmp29"(#loc106)) +#loc335 = loc("tmp30"(#loc107)) +#loc336 = loc("tmp31"(#loc108)) +#loc337 = loc("tmp32"(#loc109)) +#loc338 = loc("tmp33"(#loc110)) +#loc339 = loc("tmp34"(#loc111)) +#loc340 = loc("tmp35"(#loc112)) +#loc341 = loc("tmp35"(#loc113)) +#loc342 = loc("tmp35"(#loc114)) +#loc343 = loc("tmp35"(#loc115)) +#loc344 = loc("tmp35"(#loc116)) +#loc345 = loc("tmp35"(#loc117)) +#loc346 = loc("tmp35"(#loc118)) +#loc347 = loc("tmp35"(#loc119)) +#loc348 = loc("tmp35"(#loc120)) +#loc349 = loc("tmp37"(#loc121)) +#loc350 = loc("tmp38"(#loc122)) +#loc351 = loc("tmp39"(#loc123)) +#loc352 = loc("tmp40"(#loc124)) +#loc353 = loc("tmp41"(#loc125)) +#loc354 = loc("tmp42"(#loc126)) +#loc355 = loc("tmp43"(#loc127)) +#loc356 = loc("tmp43"(#loc128)) +#loc357 = loc("tmp43"(#loc129)) +#loc358 = loc("tmp43"(#loc130)) +#loc359 = loc("tmp43"(#loc131)) +#loc360 = loc("tmp43"(#loc132)) +#loc361 = loc("tmp45"(#loc133)) +#loc362 = loc("tmp47"(#loc134)) +#loc363 = loc("tmp48"(#loc135)) +#loc364 = loc("tmp49"(#loc136)) +#loc365 = loc("tmp52"(#loc137)) +#loc366 = loc("tmp53"(#loc138)) +#loc367 = loc("tmp54"(#loc139)) +#loc368 = loc("tmp55"(#loc140)) +#loc369 = loc("tmp56"(#loc141)) +#loc370 = loc("tmp57"(#loc142)) +#loc371 = loc("tmp60"(#loc143)) +#loc372 = loc("tmp64"(#loc144)) +#loc373 = loc("tmp67"(#loc145)) +#loc374 = loc("tmp68"(#loc146)) +#loc375 = loc("tmp70"(#loc147)) +#loc376 = loc("tmp70"(#loc148)) +#loc377 = loc("tmp70"(#loc149)) +#loc378 = loc("tmp70"(#loc150)) +#loc379 = loc("tmp70"(#loc151)) +#loc380 = loc("tmp70"(#loc152)) +#loc381 = loc("tmp70"(#loc153)) +#loc382 = loc("tmp70"(#loc154)) +#loc383 = loc("tmp70"(#loc155)) +#loc384 = loc("tmp70"(#loc156)) +#loc385 = loc("tmp72"(#loc157)) +#loc386 = loc("tmp73"(#loc158)) +#loc387 = loc("tmp74"(#loc159)) +#loc388 = loc("tmp75"(#loc160)) +#loc389 = loc("tmp76"(#loc161)) +#loc390 = loc("tmp76"(#loc162)) +#loc391 = loc("tmp76"(#loc163)) +#loc392 = loc("tmp76"(#loc164)) +#loc393 = loc("tmp76"(#loc165)) +#loc394 = loc("tmp76"(#loc166)) +#loc395 = loc("tmp76"(#loc167)) +#loc396 = loc("tmp78"(#loc168)) +#loc397 = loc("tmp80"(#loc169)) +#loc398 = loc("tmp81"(#loc170)) +#loc399 = loc("tmp82"(#loc171)) +#loc400 = loc("tmp83"(#loc172)) +#loc401 = loc("tmp83"(#loc173)) +#loc402 = loc("tmp83"(#loc174)) +#loc403 = loc("tmp83"(#loc175)) +#loc404 = loc("tmp83"(#loc176)) +#loc405 = loc("tmp83"(#loc177)) +#loc406 = loc("tmp83"(#loc178)) +#loc407 = loc("tmp83"(#loc179)) +#loc408 = loc("tmp83"(#loc180)) +#loc409 = loc("tmp83"(#loc181)) +#loc410 = loc("tmp85"(#loc182)) +#loc411 = loc("tmp86"(#loc183)) +#loc412 = loc("tmp87"(#loc184)) +#loc413 = loc("tmp88"(#loc185)) +#loc414 = loc("tmp89"(#loc186)) +#loc415 = loc("tmp89"(#loc187)) +#loc416 = loc("tmp89"(#loc188)) +#loc417 = loc("tmp89"(#loc189)) +#loc418 = loc("tmp89"(#loc190)) +#loc419 = loc("tmp89"(#loc191)) +#loc420 = loc("tmp91"(#loc192)) +#loc421 = loc("tmp93"(#loc193)) +#loc422 = loc("tmp94"(#loc194)) +#loc423 = loc("tmp95"(#loc195)) +#loc424 = loc("tmp98"(#loc196)) +#loc425 = loc("tmp99"(#loc197)) +#loc426 = loc("tmp100"(#loc198)) +#loc427 = loc("tmp101"(#loc199)) +#loc428 = loc("tmp104"(#loc200)) +#loc429 = loc("tmp107"(#loc201)) +#loc430 = loc("tmp109"(#loc202)) +#loc431 = loc("tmp110"(#loc203)) +#loc435 = loc("_tmp10"(#loc245)) diff --git a/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..0da48da30f25503de803c35d455f5e28fea932dd --- /dev/null +++ b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir @@ -0,0 +1,487 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 8], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc1 = loc(unknown) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc130 = loc("in_out_ptr0"(#loc)) +#loc131 = loc("in_out_ptr1"(#loc)) +#loc132 = loc("in_ptr0"(#loc)) +#loc133 = loc("in_ptr1"(#loc)) +#loc134 = loc("in_ptr2"(#loc)) +#loc135 = loc("in_ptr3"(#loc)) +#loc136 = loc("in_ptr4"(#loc)) +#loc137 = loc("xnumel"(#loc)) +#loc138 = loc("r0_numel"(#loc)) +#loc166 = loc("tmp4"(#loc30)) +#loc168 = loc("tmp10"(#loc33)) +#loc259 = loc(callsite(#loc1 at #loc166)) +#loc261 = loc(callsite(#loc1 at #loc168)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4097> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x128xbf16, #blocked1> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<1> : tensor<1x128xi64, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<2> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<36864> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<36864> : tensor<8x1xi32, #blocked1> loc(#loc1) + %cst_6 = arith.constant dense<128> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<128> : tensor<8x1xi32, #blocked1> loc(#loc1) + %cst_8 = arith.constant dense<4096> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_9 = arith.constant dense<4096> : tensor<1x128xi32, #blocked1> loc(#loc1) + %cst_10 = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_11 = arith.constant dense<128> : tensor<1x128xi32, #blocked1> loc(#loc1) + %cst_12 = arith.constant dense<32> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_13 = arith.constant dense<32> : tensor<8x1xi32, #blocked1> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %cst_14 = arith.constant dense<0.000000e+00> : tensor<8x128xbf16, #blocked1> loc(#loc1) + %cst_15 = arith.constant dense<0.000000e+00> : tensor<8x128xbf16, #blocked> loc(#loc1) + %cst_16 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32, #blocked1> loc(#loc1) + %cst_17 = arith.constant dense<1.280000e+02> : tensor<8x1xf32, #blocked1> loc(#loc1) + %cst_18 = arith.constant dense<0.000000e+00> : tensor<8x128xf32, #blocked> loc(#loc1) + %cst_19 = arith.constant dense<0.000000e+00> : tensor<8x128xf32, #blocked1> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc139) + %xoffset_20 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc140) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc141) + %xindex_21 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc141) + %xindex_22 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xi32, #blocked1> loc(#loc141) + %xindex_23 = tt.expand_dims %xindex_21 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc141) + %xindex_24 = tt.splat %xoffset_20 : i32 -> tensor<8x1xi32, #blocked1> loc(#loc142) + %xindex_25 = tt.splat %xoffset_20 : i32 -> tensor<8x1xi32, #blocked> loc(#loc142) + %xindex_26 = arith.addi %xindex_24, %xindex_22 : tensor<8x1xi32, #blocked1> loc(#loc142) + %xindex_27 = arith.addi %xindex_25, %xindex_23 : tensor<8x1xi32, #blocked> loc(#loc142) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc143) + %r0_base_28 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc143) + %r0_base_29 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x128xi32, #blocked1> loc(#loc143) + %r0_base_30 = tt.expand_dims %r0_base_28 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc143) + %x0 = arith.remsi %xindex_26, %cst_13 : tensor<8x1xi32, #blocked1> loc(#loc144) + %x0_31 = arith.remsi %xindex_27, %cst_12 : tensor<8x1xi32, #blocked> loc(#loc144) + %x1 = arith.divsi %xindex_26, %cst_13 : tensor<8x1xi32, #blocked1> loc(#loc145) + %x1_32 = arith.divsi %xindex_27, %cst_12 : tensor<8x1xi32, #blocked> loc(#loc145) + %r0_mask = arith.cmpi slt, %r0_base_29, %cst_11 : tensor<1x128xi32, #blocked1> loc(#loc146) + %r0_mask_33 = arith.cmpi slt, %r0_base_30, %cst_10 : tensor<1x128xi32, #blocked> loc(#loc146) + %tmp0 = arith.addi %r0_base_29, %cst_9 : tensor<1x128xi32, #blocked1> loc(#loc147) + %tmp0_34 = arith.muli %x0, %cst_7 : tensor<8x1xi32, #blocked1> loc(#loc148) + %tmp0_35 = arith.muli %x0_31, %cst_6 : tensor<8x1xi32, #blocked> loc(#loc148) + %tmp0_36 = tt.broadcast %tmp0 : tensor<1x128xi32, #blocked1> -> tensor<8x128xi32, #blocked1> loc(#loc149) + %tmp0_37 = tt.broadcast %tmp0_34 : tensor<8x1xi32, #blocked1> -> tensor<8x128xi32, #blocked1> loc(#loc149) + %tmp0_38 = tt.broadcast %tmp0_35 : tensor<8x1xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc149) + %tmp0_39 = arith.addi %tmp0_36, %tmp0_37 : tensor<8x128xi32, #blocked1> loc(#loc149) + %tmp0_40 = arith.muli %x1, %cst_5 : tensor<8x1xi32, #blocked1> loc(#loc150) + %tmp0_41 = arith.muli %x1_32, %cst_4 : tensor<8x1xi32, #blocked> loc(#loc150) + %tmp0_42 = tt.broadcast %tmp0_40 : tensor<8x1xi32, #blocked1> -> tensor<8x128xi32, #blocked1> loc(#loc151) + %tmp0_43 = tt.broadcast %tmp0_41 : tensor<8x1xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc151) + %tmp0_44 = arith.addi %tmp0_39, %tmp0_42 : tensor<8x128xi32, #blocked1> loc(#loc151) + %tmp0_45 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr, #blocked1> loc(#loc152) + %tmp0_46 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr, #blocked> loc(#loc152) + %tmp0_47 = tt.addptr %tmp0_45, %tmp0_44 : tensor<8x128x!tt.ptr, #blocked1>, tensor<8x128xi32, #blocked1> loc(#loc152) + %tmp0_48 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked1> -> tensor<8x128xi1, #blocked1> loc(#loc153) + %tmp0_49 = tt.load %tmp0_47, %tmp0_48, %cst_14 evictionPolicy = evict_last : tensor<8x128x!tt.ptr, #blocked1> loc(#loc153) + %tmp0_50 = arith.extf %tmp0_49 : tensor<8x128xbf16, #blocked1> to tensor<8x128xf32, #blocked1> loc(#loc154) + %tmp6 = tt.broadcast %r0_base_29 : tensor<1x128xi32, #blocked1> -> tensor<8x128xi32, #blocked1> loc(#loc155) + %tmp6_51 = arith.addi %tmp6, %tmp0_37 : tensor<8x128xi32, #blocked1> loc(#loc155) + %tmp6_52 = arith.addi %tmp6_51, %tmp0_42 : tensor<8x128xi32, #blocked1> loc(#loc156) + %tmp6_53 = tt.addptr %tmp0_45, %tmp6_52 : tensor<8x128x!tt.ptr, #blocked1>, tensor<8x128xi32, #blocked1> loc(#loc157) + %tmp6_54 = tt.load %tmp6_53, %tmp0_48, %cst_14 evictionPolicy = evict_last : tensor<8x128x!tt.ptr, #blocked1> loc(#loc158) + %tmp6_55 = arith.extf %tmp6_54 : tensor<8x128xbf16, #blocked1> to tensor<8x128xf32, #blocked1> loc(#loc159) + %tmp2 = arith.mulf %tmp0_50, %tmp0_50 : tensor<8x128xf32, #blocked1> loc(#loc160) + %tmp5 = arith.addf %tmp2, %cst_19 : tensor<8x128xf32, #blocked1> loc(#loc161) + %_tmp4 = arith.select %tmp0_48, %tmp5, %cst_19 : tensor<8x128xi1, #blocked1>, tensor<8x128xf32, #blocked1> loc(#loc162) + %tmp8 = arith.mulf %tmp6_55, %tmp6_55 : tensor<8x128xf32, #blocked1> loc(#loc163) + %tmp11 = arith.addf %tmp8, %cst_19 : tensor<8x128xf32, #blocked1> loc(#loc164) + %_tmp10 = arith.select %tmp0_48, %tmp11, %cst_19 : tensor<8x128xi1, #blocked1>, tensor<8x128xf32, #blocked1> loc(#loc165) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_127: f32 loc(callsite(#loc1 at #loc166)), %tmp4_128: f32 loc(callsite(#loc1 at #loc166))): + %tmp4_129 = arith.addf %tmp4_127, %tmp4_128 : f32 loc(#loc264) + tt.reduce.return %tmp4_129 : f32 loc(#loc258) + }) : (tensor<8x128xf32, #blocked1>) -> tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc258) + %tmp4_56 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xf32, #blocked1> loc(#loc167) + %tmp10 = "tt.reduce"(%_tmp10) <{axis = 1 : i32}> ({ + ^bb0(%tmp10_127: f32 loc(callsite(#loc1 at #loc168)), %tmp10_128: f32 loc(callsite(#loc1 at #loc168))): + %tmp10_129 = arith.addf %tmp10_127, %tmp10_128 : f32 loc(#loc265) + tt.reduce.return %tmp10_129 : f32 loc(#loc260) + }) : (tensor<8x128xf32, #blocked1>) -> tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc260) + %tmp10_57 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xf32, #blocked1> loc(#loc169) + %r0_3 = arith.remsi %r0_base_30, %cst_3 : tensor<1x128xi32, #blocked> loc(#loc170) + %r0_4 = arith.divsi %r0_base_30, %cst_3 : tensor<1x128xi32, #blocked> loc(#loc171) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x128x!tt.ptr, #blocked> loc(#loc172) + %tmp58_58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x128x!tt.ptr, #blocked1> loc(#loc172) + %tmp58_59 = tt.addptr %tmp58_58, %r0_base_29 : tensor<1x128x!tt.ptr, #blocked1>, tensor<1x128xi32, #blocked1> loc(#loc172) + %tmp58_60 = tt.load %tmp58_59, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x128x!tt.ptr, #blocked1> loc(#loc173) + %tmp58_61 = arith.extf %tmp58_60 : tensor<1x128xbf16, #blocked1> to tensor<1x128xf32, #blocked1> loc(#loc174) + %tmp63 = arith.muli %x1, %cst_7 : tensor<8x1xi32, #blocked1> loc(#loc175) + %tmp63_62 = tt.broadcast %tmp63 : tensor<8x1xi32, #blocked1> -> tensor<8x128xi32, #blocked1> loc(#loc176) + %tmp63_63 = arith.addi %tmp6, %tmp63_62 : tensor<8x128xi32, #blocked1> loc(#loc176) + %tmp63_64 = tt.splat %in_ptr2 : !tt.ptr -> tensor<8x128x!tt.ptr, #blocked1> loc(#loc177) + %tmp63_65 = tt.addptr %tmp63_64, %tmp63_63 : tensor<8x128x!tt.ptr, #blocked1>, tensor<8x128xi32, #blocked1> loc(#loc177) + %tmp63_66 = tt.load %tmp63_65, %tmp0_48, %cst_19 evictionPolicy = evict_last : tensor<8x128x!tt.ptr, #blocked1> loc(#loc178) + %tmp66 = tt.splat %in_ptr3 : !tt.ptr -> tensor<8x128x!tt.ptr, #blocked1> loc(#loc179) + %tmp66_67 = tt.addptr %tmp66, %tmp63_63 : tensor<8x128x!tt.ptr, #blocked1>, tensor<8x128xi32, #blocked1> loc(#loc179) + %tmp66_68 = tt.load %tmp66_67, %tmp0_48, %cst_19 evictionPolicy = evict_last : tensor<8x128x!tt.ptr, #blocked1> loc(#loc180) + %tmp66_69 = ttg.convert_layout %tmp66_68 : tensor<8x128xf32, #blocked1> -> tensor<8x128xf32, #blocked> loc(#loc180) + %tmp96 = tt.load %tmp0_47, %tmp0_48, %cst_14 evictionPolicy = evict_first : tensor<8x128x!tt.ptr, #blocked1> loc(#loc181) + %tmp96_70 = arith.extf %tmp96 : tensor<8x128xbf16, #blocked1> to tensor<8x128xf32, #blocked1> loc(#loc182) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x128x!tt.ptr, #blocked> loc(#loc183) + %tmp102_71 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x128x!tt.ptr, #blocked1> loc(#loc183) + %tmp102_72 = tt.addptr %tmp102_71, %r0_base_29 : tensor<1x128x!tt.ptr, #blocked1>, tensor<1x128xi32, #blocked1> loc(#loc183) + %tmp102_73 = tt.load %tmp102_72, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x128x!tt.ptr, #blocked1> loc(#loc184) + %tmp102_74 = arith.extf %tmp102_73 : tensor<1x128xbf16, #blocked1> to tensor<1x128xf32, #blocked1> loc(#loc185) + %tmp16 = arith.extsi %r0_3 : tensor<1x128xi32, #blocked> to tensor<1x128xi64, #blocked> loc(#loc186) + %tmp16_75 = arith.cmpi slt, %tmp16, %cst_2 : tensor<1x128xi64, #blocked> loc(#loc186) + %tmp17 = arith.muli %r0_4, %cst_3 : tensor<1x128xi32, #blocked> loc(#loc187) + %tmp17_76 = arith.addi %tmp17, %cst_1 : tensor<1x128xi32, #blocked> loc(#loc188) + %tmp17_77 = tt.broadcast %tmp17_76 : tensor<1x128xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc189) + %tmp17_78 = arith.addi %tmp17_77, %tmp0_38 : tensor<8x128xi32, #blocked> loc(#loc189) + %tmp17_79 = arith.addi %tmp17_78, %tmp0_43 : tensor<8x128xi32, #blocked> loc(#loc190) + %tmp17_80 = tt.addptr %tmp0_46, %tmp17_79 : tensor<8x128x!tt.ptr, #blocked>, tensor<8x128xi32, #blocked> loc(#loc191) + %tmp17_81 = arith.andi %r0_mask_33, %tmp16_75 : tensor<1x128xi1, #blocked> loc(#loc192) + %tmp17_82 = tt.broadcast %tmp17_81 : tensor<1x128xi1, #blocked> -> tensor<8x128xi1, #blocked> loc(#loc193) + %tmp17_83 = tt.load %tmp17_80, %tmp17_82, %cst_15 evictionPolicy = evict_last : tensor<8x128x!tt.ptr, #blocked> loc(#loc193) + %tmp17_84 = arith.extf %tmp17_83 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc194) + %tmp20 = arith.divf %tmp10_57, %cst_17 : tensor<8x1xf32, #blocked1> loc(#loc195) + %tmp22 = arith.addf %tmp20, %cst_16 : tensor<8x1xf32, #blocked1> loc(#loc196) + %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32, #blocked1>) -> tensor<8x1xf32, #blocked1> loc(#loc197) + %tmp24 = ttg.convert_layout %tmp23 : tensor<8x1xf32, #blocked1> -> tensor<8x1xf32, #blocked> loc(#loc198) + %tmp24_85 = tt.broadcast %tmp24 : tensor<8x1xf32, #blocked> -> tensor<8x128xf32, #blocked> loc(#loc198) + %tmp24_86 = tt.broadcast %tmp23 : tensor<8x1xf32, #blocked1> -> tensor<8x128xf32, #blocked1> loc(#loc198) + %tmp24_87 = arith.mulf %tmp17_84, %tmp24_85 : tensor<8x128xf32, #blocked> loc(#loc198) + %tmp25 = tt.addptr %tmp58, %tmp17_76 : tensor<1x128x!tt.ptr, #blocked>, tensor<1x128xi32, #blocked> loc(#loc199) + %tmp25_88 = tt.broadcast %tmp25 : tensor<1x128x!tt.ptr, #blocked> -> tensor<8x128x!tt.ptr, #blocked> loc(#loc199) + %tmp25_89 = tt.load %tmp25_88, %tmp17_82, %cst_15 evictionPolicy = evict_last : tensor<8x128x!tt.ptr, #blocked> loc(#loc200) + %tmp25_90 = arith.extf %tmp25_89 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc201) + %tmp27 = arith.mulf %tmp24_87, %tmp25_90 : tensor<8x128xf32, #blocked> loc(#loc202) + %tmp29 = arith.subf %cst_18, %tmp27 : tensor<8x128xf32, #blocked> loc(#loc203) + %tmp31 = tt.broadcast %tmp16_75 : tensor<1x128xi1, #blocked> -> tensor<8x128xi1, #blocked> loc(#loc204) + %tmp32 = arith.cmpi sge, %tmp16, %cst_2 : tensor<1x128xi64, #blocked> loc(#loc205) + %tmp35 = tt.broadcast %tmp17 : tensor<1x128xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc206) + %tmp35_91 = arith.addi %tmp35, %tmp0_38 : tensor<8x128xi32, #blocked> loc(#loc206) + %tmp35_92 = arith.addi %tmp35_91, %tmp0_43 : tensor<8x128xi32, #blocked> loc(#loc207) + %tmp35_93 = tt.addptr %tmp0_46, %tmp35_92 : tensor<8x128x!tt.ptr, #blocked>, tensor<8x128xi32, #blocked> loc(#loc208) + %tmp35_94 = arith.andi %r0_mask_33, %tmp32 : tensor<1x128xi1, #blocked> loc(#loc209) + %tmp35_95 = tt.broadcast %tmp35_94 : tensor<1x128xi1, #blocked> -> tensor<8x128xi1, #blocked> loc(#loc210) + %tmp35_96 = tt.load %tmp35_93, %tmp35_95, %cst_15 evictionPolicy = evict_last : tensor<8x128x!tt.ptr, #blocked> loc(#loc210) + %tmp35_97 = arith.extf %tmp35_96 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc211) + %tmp42 = arith.mulf %tmp35_97, %tmp24_85 : tensor<8x128xf32, #blocked> loc(#loc212) + %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x128x!tt.ptr, #blocked>, tensor<1x128xi32, #blocked> loc(#loc213) + %tmp43_98 = tt.broadcast %tmp43 : tensor<1x128x!tt.ptr, #blocked> -> tensor<8x128x!tt.ptr, #blocked> loc(#loc213) + %tmp43_99 = tt.load %tmp43_98, %tmp35_95, %cst_15 evictionPolicy = evict_last : tensor<8x128x!tt.ptr, #blocked> loc(#loc214) + %tmp43_100 = arith.extf %tmp43_99 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc215) + %tmp45 = arith.mulf %tmp42, %tmp43_100 : tensor<8x128xf32, #blocked> loc(#loc216) + %tmp48 = tt.broadcast %tmp32 : tensor<1x128xi1, #blocked> -> tensor<8x128xi1, #blocked> loc(#loc217) + %tmp48_101 = arith.select %tmp48, %tmp45, %cst_18 : tensor<8x128xi1, #blocked>, tensor<8x128xf32, #blocked> loc(#loc217) + %tmp49 = arith.select %tmp31, %tmp29, %tmp48_101 : tensor<8x128xi1, #blocked>, tensor<8x128xf32, #blocked> loc(#loc262) + %tmp57 = arith.mulf %tmp6_55, %tmp24_86 : tensor<8x128xf32, #blocked1> loc(#loc219) + %tmp60 = tt.broadcast %tmp58_61 : tensor<1x128xf32, #blocked1> -> tensor<8x128xf32, #blocked1> loc(#loc220) + %tmp60_102 = arith.mulf %tmp57, %tmp60 : tensor<8x128xf32, #blocked1> loc(#loc220) + %tmp64 = arith.mulf %tmp60_102, %tmp63_66 : tensor<8x128xf32, #blocked1> loc(#loc221) + %tmp64_103 = ttg.convert_layout %tmp64 : tensor<8x128xf32, #blocked1> -> tensor<8x128xf32, #blocked> loc(#loc221) + %tmp67 = arith.mulf %tmp49, %tmp66_69 : tensor<8x128xf32, #blocked> loc(#loc222) + %tmp68 = arith.addf %tmp64_103, %tmp67 : tensor<8x128xf32, #blocked> loc(#loc223) + %tmp70 = arith.addi %tmp17, %cst : tensor<1x128xi32, #blocked> loc(#loc224) + %tmp70_104 = tt.broadcast %tmp70 : tensor<1x128xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc225) + %tmp70_105 = arith.addi %tmp70_104, %tmp0_38 : tensor<8x128xi32, #blocked> loc(#loc225) + %tmp70_106 = arith.addi %tmp70_105, %tmp0_43 : tensor<8x128xi32, #blocked> loc(#loc226) + %tmp70_107 = tt.addptr %tmp0_46, %tmp70_106 : tensor<8x128x!tt.ptr, #blocked>, tensor<8x128xi32, #blocked> loc(#loc227) + %tmp70_108 = tt.load %tmp70_107, %tmp17_82, %cst_15 evictionPolicy = evict_last : tensor<8x128x!tt.ptr, #blocked> loc(#loc228) + %tmp70_109 = arith.extf %tmp70_108 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc229) + %tmp72 = arith.divf %tmp4_56, %cst_17 : tensor<8x1xf32, #blocked1> loc(#loc230) + %tmp73 = arith.addf %tmp72, %cst_16 : tensor<8x1xf32, #blocked1> loc(#loc231) + %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32, #blocked1>) -> tensor<8x1xf32, #blocked1> loc(#loc232) + %tmp75 = ttg.convert_layout %tmp74 : tensor<8x1xf32, #blocked1> -> tensor<8x1xf32, #blocked> loc(#loc233) + %tmp75_110 = tt.broadcast %tmp75 : tensor<8x1xf32, #blocked> -> tensor<8x128xf32, #blocked> loc(#loc233) + %tmp75_111 = tt.broadcast %tmp74 : tensor<8x1xf32, #blocked1> -> tensor<8x128xf32, #blocked1> loc(#loc233) + %tmp75_112 = arith.mulf %tmp70_109, %tmp75_110 : tensor<8x128xf32, #blocked> loc(#loc233) + %tmp76 = tt.addptr %tmp102, %tmp17_76 : tensor<1x128x!tt.ptr, #blocked>, tensor<1x128xi32, #blocked> loc(#loc234) + %tmp76_113 = tt.broadcast %tmp76 : tensor<1x128x!tt.ptr, #blocked> -> tensor<8x128x!tt.ptr, #blocked> loc(#loc234) + %tmp76_114 = tt.load %tmp76_113, %tmp17_82, %cst_15 evictionPolicy = evict_last : tensor<8x128x!tt.ptr, #blocked> loc(#loc235) + %tmp76_115 = arith.extf %tmp76_114 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc236) + %tmp78 = arith.mulf %tmp75_112, %tmp76_115 : tensor<8x128xf32, #blocked> loc(#loc237) + %tmp80 = arith.subf %cst_18, %tmp78 : tensor<8x128xf32, #blocked> loc(#loc238) + %tmp83 = arith.addi %tmp17, %cst_8 : tensor<1x128xi32, #blocked> loc(#loc239) + %tmp83_116 = tt.broadcast %tmp83 : tensor<1x128xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc240) + %tmp83_117 = arith.addi %tmp83_116, %tmp0_38 : tensor<8x128xi32, #blocked> loc(#loc240) + %tmp83_118 = arith.addi %tmp83_117, %tmp0_43 : tensor<8x128xi32, #blocked> loc(#loc241) + %tmp83_119 = tt.addptr %tmp0_46, %tmp83_118 : tensor<8x128x!tt.ptr, #blocked>, tensor<8x128xi32, #blocked> loc(#loc242) + %tmp83_120 = tt.load %tmp83_119, %tmp35_95, %cst_15 evictionPolicy = evict_last : tensor<8x128x!tt.ptr, #blocked> loc(#loc243) + %tmp83_121 = arith.extf %tmp83_120 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc244) + %tmp88 = arith.mulf %tmp83_121, %tmp75_110 : tensor<8x128xf32, #blocked> loc(#loc245) + %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x128x!tt.ptr, #blocked>, tensor<1x128xi32, #blocked> loc(#loc246) + %tmp89_122 = tt.broadcast %tmp89 : tensor<1x128x!tt.ptr, #blocked> -> tensor<8x128x!tt.ptr, #blocked> loc(#loc246) + %tmp89_123 = tt.load %tmp89_122, %tmp35_95, %cst_15 evictionPolicy = evict_last : tensor<8x128x!tt.ptr, #blocked> loc(#loc247) + %tmp89_124 = arith.extf %tmp89_123 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc248) + %tmp91 = arith.mulf %tmp88, %tmp89_124 : tensor<8x128xf32, #blocked> loc(#loc249) + %tmp94 = arith.select %tmp48, %tmp91, %cst_18 : tensor<8x128xi1, #blocked>, tensor<8x128xf32, #blocked> loc(#loc250) + %tmp95 = arith.select %tmp31, %tmp80, %tmp94 : tensor<8x128xi1, #blocked>, tensor<8x128xf32, #blocked> loc(#loc263) + %tmp101 = arith.mulf %tmp96_70, %tmp75_111 : tensor<8x128xf32, #blocked1> loc(#loc253) + %tmp104 = tt.broadcast %tmp102_74 : tensor<1x128xf32, #blocked1> -> tensor<8x128xf32, #blocked1> loc(#loc254) + %tmp104_125 = arith.mulf %tmp101, %tmp104 : tensor<8x128xf32, #blocked1> loc(#loc254) + %tmp107 = arith.mulf %tmp104_125, %tmp63_66 : tensor<8x128xf32, #blocked1> loc(#loc255) + %tmp107_126 = ttg.convert_layout %tmp107 : tensor<8x128xf32, #blocked1> -> tensor<8x128xf32, #blocked> loc(#loc255) + %tmp109 = arith.mulf %tmp95, %tmp66_69 : tensor<8x128xf32, #blocked> loc(#loc256) + %tmp110 = arith.addf %tmp107_126, %tmp109 : tensor<8x128xf32, #blocked> loc(#loc257) + %0 = arith.muli %xindex_26, %cst_7 : tensor<8x1xi32, #blocked1> loc(#loc123) + %1 = tt.broadcast %0 : tensor<8x1xi32, #blocked1> -> tensor<8x128xi32, #blocked1> loc(#loc124) + %2 = arith.addi %tmp6, %1 : tensor<8x128xi32, #blocked1> loc(#loc124) + %3 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr, #blocked1> loc(#loc125) + %4 = tt.addptr %3, %2 : tensor<8x128x!tt.ptr, #blocked1>, tensor<8x128xi32, #blocked1> loc(#loc125) + %5 = arith.truncf %tmp68 : tensor<8x128xf32, #blocked> to tensor<8x128xbf16, #blocked> loc(#loc126) + %6 = ttg.convert_layout %5 : tensor<8x128xbf16, #blocked> -> tensor<8x128xbf16, #blocked1> loc(#loc126) + tt.store %4, %6, %tmp0_48 : tensor<8x128x!tt.ptr, #blocked1> loc(#loc126) + %7 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<8x128x!tt.ptr, #blocked1> loc(#loc127) + %8 = tt.addptr %7, %2 : tensor<8x128x!tt.ptr, #blocked1>, tensor<8x128xi32, #blocked1> loc(#loc127) + %9 = arith.truncf %tmp110 : tensor<8x128xf32, #blocked> to tensor<8x128xbf16, #blocked> loc(#loc128) + %10 = ttg.convert_layout %9 : tensor<8x128xbf16, #blocked> -> tensor<8x128xbf16, #blocked1> loc(#loc128) + tt.store %8, %10, %tmp0_48 : tensor<8x128x!tt.ptr, #blocked1> loc(#loc128) + tt.return loc(#loc129) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc29 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc31 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc139 = loc("xoffset"(#loc2)) +#loc140 = loc("xoffset"(#loc3)) +#loc141 = loc("xindex"(#loc4)) +#loc142 = loc("xindex"(#loc5)) +#loc143 = loc("r0_base"(#loc6)) +#loc144 = loc("x0"(#loc7)) +#loc145 = loc("x1"(#loc8)) +#loc146 = loc("r0_mask"(#loc9)) +#loc147 = loc("tmp0"(#loc10)) +#loc148 = loc("tmp0"(#loc11)) +#loc149 = loc("tmp0"(#loc12)) +#loc150 = loc("tmp0"(#loc13)) +#loc151 = loc("tmp0"(#loc14)) +#loc152 = loc("tmp0"(#loc15)) +#loc153 = loc("tmp0"(#loc16)) +#loc154 = loc("tmp0"(#loc17)) +#loc155 = loc("tmp6"(#loc18)) +#loc156 = loc("tmp6"(#loc19)) +#loc157 = loc("tmp6"(#loc20)) +#loc158 = loc("tmp6"(#loc21)) +#loc159 = loc("tmp6"(#loc22)) +#loc160 = loc("tmp2"(#loc23)) +#loc161 = loc("tmp5"(#loc24)) +#loc162 = loc("_tmp4"(#loc25)) +#loc163 = loc("tmp8"(#loc26)) +#loc164 = loc("tmp11"(#loc27)) +#loc165 = loc("_tmp10"(#loc28)) +#loc167 = loc("tmp4"(#loc32)) +#loc169 = loc("tmp10"(#loc34)) +#loc170 = loc("r0_3"(#loc35)) +#loc171 = loc("r0_4"(#loc36)) +#loc172 = loc("tmp58"(#loc37)) +#loc173 = loc("tmp58"(#loc38)) +#loc174 = loc("tmp58"(#loc39)) +#loc175 = loc("tmp63"(#loc40)) +#loc176 = loc("tmp63"(#loc41)) +#loc177 = loc("tmp63"(#loc42)) +#loc178 = loc("tmp63"(#loc43)) +#loc179 = loc("tmp66"(#loc44)) +#loc180 = loc("tmp66"(#loc45)) +#loc181 = loc("tmp96"(#loc46)) +#loc182 = loc("tmp96"(#loc47)) +#loc183 = loc("tmp102"(#loc48)) +#loc184 = loc("tmp102"(#loc49)) +#loc185 = loc("tmp102"(#loc50)) +#loc186 = loc("tmp16"(#loc51)) +#loc187 = loc("tmp17"(#loc52)) +#loc188 = loc("tmp17"(#loc53)) +#loc189 = loc("tmp17"(#loc54)) +#loc190 = loc("tmp17"(#loc55)) +#loc191 = loc("tmp17"(#loc56)) +#loc192 = loc("tmp17"(#loc57)) +#loc193 = loc("tmp17"(#loc58)) +#loc194 = loc("tmp17"(#loc59)) +#loc195 = loc("tmp20"(#loc60)) +#loc196 = loc("tmp22"(#loc61)) +#loc197 = loc("tmp23"(#loc62)) +#loc198 = loc("tmp24"(#loc63)) +#loc199 = loc("tmp25"(#loc64)) +#loc200 = loc("tmp25"(#loc65)) +#loc201 = loc("tmp25"(#loc66)) +#loc202 = loc("tmp27"(#loc67)) +#loc203 = loc("tmp29"(#loc68)) +#loc204 = loc("tmp31"(#loc69)) +#loc205 = loc("tmp32"(#loc70)) +#loc206 = loc("tmp35"(#loc71)) +#loc207 = loc("tmp35"(#loc72)) +#loc208 = loc("tmp35"(#loc73)) +#loc209 = loc("tmp35"(#loc74)) +#loc210 = loc("tmp35"(#loc75)) +#loc211 = loc("tmp35"(#loc76)) +#loc212 = loc("tmp42"(#loc77)) +#loc213 = loc("tmp43"(#loc78)) +#loc214 = loc("tmp43"(#loc79)) +#loc215 = loc("tmp43"(#loc80)) +#loc216 = loc("tmp45"(#loc81)) +#loc217 = loc("tmp48"(#loc82)) +#loc218 = loc("tmp49"(#loc83)) +#loc219 = loc("tmp57"(#loc84)) +#loc220 = loc("tmp60"(#loc85)) +#loc221 = loc("tmp64"(#loc86)) +#loc222 = loc("tmp67"(#loc87)) +#loc223 = loc("tmp68"(#loc88)) +#loc224 = loc("tmp70"(#loc89)) +#loc225 = loc("tmp70"(#loc90)) +#loc226 = loc("tmp70"(#loc91)) +#loc227 = loc("tmp70"(#loc92)) +#loc228 = loc("tmp70"(#loc93)) +#loc229 = loc("tmp70"(#loc94)) +#loc230 = loc("tmp72"(#loc95)) +#loc231 = loc("tmp73"(#loc96)) +#loc232 = loc("tmp74"(#loc97)) +#loc233 = loc("tmp75"(#loc98)) +#loc234 = loc("tmp76"(#loc99)) +#loc235 = loc("tmp76"(#loc100)) +#loc236 = loc("tmp76"(#loc101)) +#loc237 = loc("tmp78"(#loc102)) +#loc238 = loc("tmp80"(#loc103)) +#loc239 = loc("tmp83"(#loc104)) +#loc240 = loc("tmp83"(#loc105)) +#loc241 = loc("tmp83"(#loc106)) +#loc242 = loc("tmp83"(#loc107)) +#loc243 = loc("tmp83"(#loc108)) +#loc244 = loc("tmp83"(#loc109)) +#loc245 = loc("tmp88"(#loc110)) +#loc246 = loc("tmp89"(#loc111)) +#loc247 = loc("tmp89"(#loc112)) +#loc248 = loc("tmp89"(#loc113)) +#loc249 = loc("tmp91"(#loc114)) +#loc250 = loc("tmp94"(#loc115)) +#loc251 = loc("tmp95"(#loc116)) +#loc252 = loc("tmp82"(#loc117)) +#loc253 = loc("tmp101"(#loc118)) +#loc254 = loc("tmp104"(#loc119)) +#loc255 = loc("tmp107"(#loc120)) +#loc256 = loc("tmp109"(#loc121)) +#loc257 = loc("tmp110"(#loc122)) +#loc258 = loc(callsite(#loc29 at #loc166)) +#loc260 = loc(callsite(#loc29 at #loc168)) +#loc262 = loc(fused[#loc218, #loc204]) +#loc263 = loc(fused[#loc251, #loc252]) +#loc264 = loc(callsite(#loc31 at #loc258)) +#loc265 = loc(callsite(#loc31 at #loc260)) diff --git a/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..c72b2af97edc08cdcc9a95b74c00153c29f759d2 --- /dev/null +++ b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir @@ -0,0 +1,457 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc1 = loc(unknown) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc132 = loc("in_out_ptr0"(#loc)) +#loc133 = loc("in_out_ptr1"(#loc)) +#loc134 = loc("in_ptr0"(#loc)) +#loc135 = loc("in_ptr1"(#loc)) +#loc136 = loc("in_ptr2"(#loc)) +#loc137 = loc("in_ptr3"(#loc)) +#loc138 = loc("in_ptr4"(#loc)) +#loc139 = loc("xnumel"(#loc)) +#loc140 = loc("r0_numel"(#loc)) +#loc170 = loc("tmp4"(#loc32)) +#loc172 = loc("tmp10"(#loc35)) +#loc263 = loc(callsite(#loc1 at #loc170)) +#loc265 = loc(callsite(#loc1 at #loc172)) +module { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<1x128xbf16> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<8x128xbf16> loc(#loc1) + %cst_1 = arith.constant dense<4097> : tensor<1x128xi32> loc(#loc1) + %cst_2 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc1) + %cst_3 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc1) + %cst_5 = arith.constant dense<1> : tensor<1x128xi64> loc(#loc1) + %cst_6 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc1) + %cst_7 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc1) + %cst_8 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc1) + %cst_9 = arith.constant dense<4096> : tensor<1x128xi32> loc(#loc1) + %cst_10 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc1) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc1) + %cst_12 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc141) + %xoffset_13 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc142) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc143) + %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc144) + %xindex_15 = tt.splat %xoffset_13 : i32 -> tensor<8x1xi32> loc(#loc145) + %xindex_16 = arith.addi %xindex_15, %xindex_14 : tensor<8x1xi32> loc(#loc145) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc146) + %r0_base_17 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc147) + %x0 = arith.remsi %xindex_16, %cst_12 : tensor<8x1xi32> loc(#loc148) + %x1 = arith.divsi %xindex_16, %cst_12 : tensor<8x1xi32> loc(#loc149) + %r0_mask = arith.cmpi slt, %r0_base_17, %cst_10 : tensor<1x128xi32> loc(#loc150) + %tmp0 = arith.addi %r0_base_17, %cst_9 : tensor<1x128xi32> loc(#loc151) + %tmp0_18 = arith.muli %x0, %cst_8 : tensor<8x1xi32> loc(#loc152) + %tmp0_19 = tt.broadcast %tmp0 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc153) + %tmp0_20 = tt.broadcast %tmp0_18 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc153) + %tmp0_21 = arith.addi %tmp0_19, %tmp0_20 : tensor<8x128xi32> loc(#loc153) + %tmp0_22 = arith.muli %x1, %cst_7 : tensor<8x1xi32> loc(#loc154) + %tmp0_23 = tt.broadcast %tmp0_22 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc155) + %tmp0_24 = arith.addi %tmp0_21, %tmp0_23 : tensor<8x128xi32> loc(#loc155) + %tmp0_25 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc156) + %tmp0_26 = tt.addptr %tmp0_25, %tmp0_24 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc156) + %tmp0_27 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc157) + %tmp0_28 = tt.load %tmp0_26, %tmp0_27, %cst_0 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc157) + %tmp0_29 = arith.extf %tmp0_28 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc158) + %tmp6 = tt.broadcast %r0_base_17 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc159) + %tmp6_30 = arith.addi %tmp6, %tmp0_20 : tensor<8x128xi32> loc(#loc159) + %tmp6_31 = arith.addi %tmp6_30, %tmp0_23 : tensor<8x128xi32> loc(#loc160) + %tmp6_32 = tt.addptr %tmp0_25, %tmp6_31 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc161) + %tmp6_33 = tt.load %tmp6_32, %tmp0_27, %cst_0 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc162) + %tmp6_34 = arith.extf %tmp6_33 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc163) + %tmp2 = arith.mulf %tmp0_29, %tmp0_29 : tensor<8x128xf32> loc(#loc164) + %tmp5 = arith.addf %tmp2, %cst_11 : tensor<8x128xf32> loc(#loc165) + %_tmp4 = arith.select %tmp0_27, %tmp5, %cst_11 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc166) + %tmp8 = arith.mulf %tmp6_34, %tmp6_34 : tensor<8x128xf32> loc(#loc167) + %tmp11 = arith.addf %tmp8, %cst_11 : tensor<8x128xf32> loc(#loc168) + %_tmp10 = arith.select %tmp0_27, %tmp11, %cst_11 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc169) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_98: f32 loc(callsite(#loc1 at #loc170)), %tmp4_99: f32 loc(callsite(#loc1 at #loc170))): + %tmp4_100 = arith.addf %tmp4_98, %tmp4_99 : f32 loc(#loc266) + tt.reduce.return %tmp4_100 : f32 loc(#loc262) + }) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc262) + %tmp4_35 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc171) + %tmp10 = "tt.reduce"(%_tmp10) <{axis = 1 : i32}> ({ + ^bb0(%tmp10_98: f32 loc(callsite(#loc1 at #loc172)), %tmp10_99: f32 loc(callsite(#loc1 at #loc172))): + %tmp10_100 = arith.addf %tmp10_98, %tmp10_99 : f32 loc(#loc267) + tt.reduce.return %tmp10_100 : f32 loc(#loc264) + }) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc264) + %tmp10_36 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc173) + %r0_3 = arith.remsi %r0_base_17, %cst_6 : tensor<1x128xi32> loc(#loc174) + %r0_4 = arith.divsi %r0_base_17, %cst_6 : tensor<1x128xi32> loc(#loc175) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc176) + %tmp58_37 = tt.addptr %tmp58, %r0_base_17 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc176) + %tmp58_38 = tt.load %tmp58_37, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x128x!tt.ptr> loc(#loc177) + %tmp58_39 = arith.extf %tmp58_38 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc178) + %tmp63 = arith.muli %x1, %cst_8 : tensor<8x1xi32> loc(#loc179) + %tmp63_40 = tt.broadcast %tmp63 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc180) + %tmp63_41 = arith.addi %tmp6, %tmp63_40 : tensor<8x128xi32> loc(#loc180) + %tmp63_42 = tt.splat %in_ptr2 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc181) + %tmp63_43 = tt.addptr %tmp63_42, %tmp63_41 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc181) + %tmp63_44 = tt.load %tmp63_43, %tmp0_27, %cst_11 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc182) + %tmp66 = tt.splat %in_ptr3 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc183) + %tmp66_45 = tt.addptr %tmp66, %tmp63_41 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc183) + %tmp66_46 = tt.load %tmp66_45, %tmp0_27, %cst_11 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc184) + %tmp96 = tt.load %tmp0_26, %tmp0_27, %cst_0 evictionPolicy = evict_first : tensor<8x128x!tt.ptr> loc(#loc185) + %tmp96_47 = arith.extf %tmp96 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc186) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc187) + %tmp102_48 = tt.addptr %tmp102, %r0_base_17 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc187) + %tmp102_49 = tt.load %tmp102_48, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x128x!tt.ptr> loc(#loc188) + %tmp102_50 = arith.extf %tmp102_49 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc189) + %tmp16 = arith.extsi %r0_3 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc190) + %tmp16_51 = arith.cmpi slt, %tmp16, %cst_5 : tensor<1x128xi64> loc(#loc190) + %tmp17 = arith.muli %r0_4, %cst_6 : tensor<1x128xi32> loc(#loc191) + %tmp17_52 = arith.addi %tmp17, %cst_4 : tensor<1x128xi32> loc(#loc192) + %tmp17_53 = tt.broadcast %tmp17_52 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc193) + %tmp17_54 = arith.addi %tmp17_53, %tmp0_20 : tensor<8x128xi32> loc(#loc193) + %tmp17_55 = arith.addi %tmp17_54, %tmp0_23 : tensor<8x128xi32> loc(#loc194) + %tmp17_56 = tt.addptr %tmp0_25, %tmp17_55 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc195) + %tmp17_57 = arith.andi %r0_mask, %tmp16_51 : tensor<1x128xi1> loc(#loc196) + %tmp17_58 = tt.broadcast %tmp17_57 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc197) + %tmp17_59 = tt.load %tmp17_56, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc197) + %tmp17_60 = arith.extf %tmp17_59 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc198) + %tmp20 = arith.divf %tmp10_36, %cst_3 : tensor<8x1xf32> loc(#loc199) + %tmp22 = arith.addf %tmp20, %cst_2 : tensor<8x1xf32> loc(#loc200) + %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc201) + %tmp24 = tt.broadcast %tmp23 : tensor<8x1xf32> -> tensor<8x128xf32> loc(#loc202) + %tmp24_61 = arith.mulf %tmp17_60, %tmp24 : tensor<8x128xf32> loc(#loc202) + %tmp25 = tt.addptr %tmp58, %tmp17_52 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc203) + %tmp25_62 = tt.broadcast %tmp25 : tensor<1x128x!tt.ptr> -> tensor<8x128x!tt.ptr> loc(#loc203) + %tmp25_63 = tt.load %tmp25_62, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc204) + %tmp25_64 = arith.extf %tmp25_63 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc205) + %tmp27 = arith.mulf %tmp24_61, %tmp25_64 : tensor<8x128xf32> loc(#loc206) + %tmp29 = arith.subf %cst_11, %tmp27 : tensor<8x128xf32> loc(#loc207) + %tmp31 = tt.broadcast %tmp16_51 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc208) + %tmp31_65 = arith.select %tmp31, %tmp29, %cst_11 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc208) + %tmp32 = arith.cmpi sge, %tmp16, %cst_5 : tensor<1x128xi64> loc(#loc209) + %tmp35 = tt.broadcast %tmp17 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc210) + %tmp35_66 = arith.addi %tmp35, %tmp0_20 : tensor<8x128xi32> loc(#loc210) + %tmp35_67 = arith.addi %tmp35_66, %tmp0_23 : tensor<8x128xi32> loc(#loc211) + %tmp35_68 = tt.addptr %tmp0_25, %tmp35_67 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc212) + %tmp35_69 = arith.andi %r0_mask, %tmp32 : tensor<1x128xi1> loc(#loc213) + %tmp35_70 = tt.broadcast %tmp35_69 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc214) + %tmp35_71 = tt.load %tmp35_68, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc214) + %tmp35_72 = arith.extf %tmp35_71 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc215) + %tmp42 = arith.mulf %tmp35_72, %tmp24 : tensor<8x128xf32> loc(#loc216) + %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc217) + %tmp43_73 = tt.broadcast %tmp43 : tensor<1x128x!tt.ptr> -> tensor<8x128x!tt.ptr> loc(#loc217) + %tmp43_74 = tt.load %tmp43_73, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc218) + %tmp43_75 = arith.extf %tmp43_74 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc219) + %tmp45 = arith.mulf %tmp42, %tmp43_75 : tensor<8x128xf32> loc(#loc220) + %tmp48 = tt.broadcast %tmp32 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc221) + %tmp48_76 = arith.select %tmp48, %tmp45, %cst_11 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc221) + %tmp49 = arith.select %tmp31, %tmp31_65, %tmp48_76 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc222) + %tmp57 = arith.mulf %tmp6_34, %tmp24 : tensor<8x128xf32> loc(#loc223) + %tmp60 = tt.broadcast %tmp58_39 : tensor<1x128xf32> -> tensor<8x128xf32> loc(#loc224) + %tmp60_77 = arith.mulf %tmp57, %tmp60 : tensor<8x128xf32> loc(#loc224) + %tmp64 = arith.mulf %tmp60_77, %tmp63_44 : tensor<8x128xf32> loc(#loc225) + %tmp67 = arith.mulf %tmp49, %tmp66_46 : tensor<8x128xf32> loc(#loc226) + %tmp68 = arith.addf %tmp64, %tmp67 : tensor<8x128xf32> loc(#loc227) + %tmp70 = arith.addi %tmp17, %cst_1 : tensor<1x128xi32> loc(#loc228) + %tmp70_78 = tt.broadcast %tmp70 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc229) + %tmp70_79 = arith.addi %tmp70_78, %tmp0_20 : tensor<8x128xi32> loc(#loc229) + %tmp70_80 = arith.addi %tmp70_79, %tmp0_23 : tensor<8x128xi32> loc(#loc230) + %tmp70_81 = tt.addptr %tmp0_25, %tmp70_80 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc231) + %tmp70_82 = tt.load %tmp70_81, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc232) + %tmp70_83 = arith.extf %tmp70_82 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc233) + %tmp72 = arith.divf %tmp4_35, %cst_3 : tensor<8x1xf32> loc(#loc234) + %tmp73 = arith.addf %tmp72, %cst_2 : tensor<8x1xf32> loc(#loc235) + %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc236) + %tmp75 = tt.broadcast %tmp74 : tensor<8x1xf32> -> tensor<8x128xf32> loc(#loc237) + %tmp75_84 = arith.mulf %tmp70_83, %tmp75 : tensor<8x128xf32> loc(#loc237) + %tmp76 = tt.addptr %tmp102, %tmp17_52 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc238) + %tmp76_85 = tt.broadcast %tmp76 : tensor<1x128x!tt.ptr> -> tensor<8x128x!tt.ptr> loc(#loc238) + %tmp76_86 = tt.load %tmp76_85, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc239) + %tmp76_87 = arith.extf %tmp76_86 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc240) + %tmp78 = arith.mulf %tmp75_84, %tmp76_87 : tensor<8x128xf32> loc(#loc241) + %tmp80 = arith.subf %cst_11, %tmp78 : tensor<8x128xf32> loc(#loc242) + %tmp82 = arith.select %tmp31, %tmp80, %cst_11 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc243) + %tmp83 = arith.addi %tmp17, %cst_9 : tensor<1x128xi32> loc(#loc244) + %tmp83_88 = tt.broadcast %tmp83 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc245) + %tmp83_89 = arith.addi %tmp83_88, %tmp0_20 : tensor<8x128xi32> loc(#loc245) + %tmp83_90 = arith.addi %tmp83_89, %tmp0_23 : tensor<8x128xi32> loc(#loc246) + %tmp83_91 = tt.addptr %tmp0_25, %tmp83_90 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc247) + %tmp83_92 = tt.load %tmp83_91, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc248) + %tmp83_93 = arith.extf %tmp83_92 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc249) + %tmp88 = arith.mulf %tmp83_93, %tmp75 : tensor<8x128xf32> loc(#loc250) + %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc251) + %tmp89_94 = tt.broadcast %tmp89 : tensor<1x128x!tt.ptr> -> tensor<8x128x!tt.ptr> loc(#loc251) + %tmp89_95 = tt.load %tmp89_94, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc252) + %tmp89_96 = arith.extf %tmp89_95 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc253) + %tmp91 = arith.mulf %tmp88, %tmp89_96 : tensor<8x128xf32> loc(#loc254) + %tmp94 = arith.select %tmp48, %tmp91, %cst_11 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc255) + %tmp95 = arith.select %tmp31, %tmp82, %tmp94 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc256) + %tmp101 = arith.mulf %tmp96_47, %tmp75 : tensor<8x128xf32> loc(#loc257) + %tmp104 = tt.broadcast %tmp102_50 : tensor<1x128xf32> -> tensor<8x128xf32> loc(#loc258) + %tmp104_97 = arith.mulf %tmp101, %tmp104 : tensor<8x128xf32> loc(#loc258) + %tmp107 = arith.mulf %tmp104_97, %tmp63_44 : tensor<8x128xf32> loc(#loc259) + %tmp109 = arith.mulf %tmp95, %tmp66_46 : tensor<8x128xf32> loc(#loc260) + %tmp110 = arith.addf %tmp107, %tmp109 : tensor<8x128xf32> loc(#loc261) + %0 = arith.muli %xindex_16, %cst_8 : tensor<8x1xi32> loc(#loc125) + %1 = tt.broadcast %0 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc126) + %2 = arith.addi %tmp6, %1 : tensor<8x128xi32> loc(#loc126) + %3 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc127) + %4 = tt.addptr %3, %2 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc127) + %5 = arith.truncf %tmp68 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc128) + tt.store %4, %5, %tmp0_27 : tensor<8x128x!tt.ptr> loc(#loc128) + %6 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc129) + %7 = tt.addptr %6, %2 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc129) + %8 = arith.truncf %tmp110 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc130) + tt.store %7, %8, %tmp0_27 : tensor<8x128x!tt.ptr> loc(#loc130) + tt.return loc(#loc131) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc31 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc141 = loc("xoffset"(#loc2)) +#loc142 = loc("xoffset"(#loc3)) +#loc143 = loc("xindex"(#loc4)) +#loc144 = loc("xindex"(#loc5)) +#loc145 = loc("xindex"(#loc6)) +#loc146 = loc("r0_base"(#loc7)) +#loc147 = loc("r0_base"(#loc8)) +#loc148 = loc("x0"(#loc9)) +#loc149 = loc("x1"(#loc10)) +#loc150 = loc("r0_mask"(#loc11)) +#loc151 = loc("tmp0"(#loc12)) +#loc152 = loc("tmp0"(#loc13)) +#loc153 = loc("tmp0"(#loc14)) +#loc154 = loc("tmp0"(#loc15)) +#loc155 = loc("tmp0"(#loc16)) +#loc156 = loc("tmp0"(#loc17)) +#loc157 = loc("tmp0"(#loc18)) +#loc158 = loc("tmp0"(#loc19)) +#loc159 = loc("tmp6"(#loc20)) +#loc160 = loc("tmp6"(#loc21)) +#loc161 = loc("tmp6"(#loc22)) +#loc162 = loc("tmp6"(#loc23)) +#loc163 = loc("tmp6"(#loc24)) +#loc164 = loc("tmp2"(#loc25)) +#loc165 = loc("tmp5"(#loc26)) +#loc166 = loc("_tmp4"(#loc27)) +#loc167 = loc("tmp8"(#loc28)) +#loc168 = loc("tmp11"(#loc29)) +#loc169 = loc("_tmp10"(#loc30)) +#loc171 = loc("tmp4"(#loc34)) +#loc173 = loc("tmp10"(#loc36)) +#loc174 = loc("r0_3"(#loc37)) +#loc175 = loc("r0_4"(#loc38)) +#loc176 = loc("tmp58"(#loc39)) +#loc177 = loc("tmp58"(#loc40)) +#loc178 = loc("tmp58"(#loc41)) +#loc179 = loc("tmp63"(#loc42)) +#loc180 = loc("tmp63"(#loc43)) +#loc181 = loc("tmp63"(#loc44)) +#loc182 = loc("tmp63"(#loc45)) +#loc183 = loc("tmp66"(#loc46)) +#loc184 = loc("tmp66"(#loc47)) +#loc185 = loc("tmp96"(#loc48)) +#loc186 = loc("tmp96"(#loc49)) +#loc187 = loc("tmp102"(#loc50)) +#loc188 = loc("tmp102"(#loc51)) +#loc189 = loc("tmp102"(#loc52)) +#loc190 = loc("tmp16"(#loc53)) +#loc191 = loc("tmp17"(#loc54)) +#loc192 = loc("tmp17"(#loc55)) +#loc193 = loc("tmp17"(#loc56)) +#loc194 = loc("tmp17"(#loc57)) +#loc195 = loc("tmp17"(#loc58)) +#loc196 = loc("tmp17"(#loc59)) +#loc197 = loc("tmp17"(#loc60)) +#loc198 = loc("tmp17"(#loc61)) +#loc199 = loc("tmp20"(#loc62)) +#loc200 = loc("tmp22"(#loc63)) +#loc201 = loc("tmp23"(#loc64)) +#loc202 = loc("tmp24"(#loc65)) +#loc203 = loc("tmp25"(#loc66)) +#loc204 = loc("tmp25"(#loc67)) +#loc205 = loc("tmp25"(#loc68)) +#loc206 = loc("tmp27"(#loc69)) +#loc207 = loc("tmp29"(#loc70)) +#loc208 = loc("tmp31"(#loc71)) +#loc209 = loc("tmp32"(#loc72)) +#loc210 = loc("tmp35"(#loc73)) +#loc211 = loc("tmp35"(#loc74)) +#loc212 = loc("tmp35"(#loc75)) +#loc213 = loc("tmp35"(#loc76)) +#loc214 = loc("tmp35"(#loc77)) +#loc215 = loc("tmp35"(#loc78)) +#loc216 = loc("tmp42"(#loc79)) +#loc217 = loc("tmp43"(#loc80)) +#loc218 = loc("tmp43"(#loc81)) +#loc219 = loc("tmp43"(#loc82)) +#loc220 = loc("tmp45"(#loc83)) +#loc221 = loc("tmp48"(#loc84)) +#loc222 = loc("tmp49"(#loc85)) +#loc223 = loc("tmp57"(#loc86)) +#loc224 = loc("tmp60"(#loc87)) +#loc225 = loc("tmp64"(#loc88)) +#loc226 = loc("tmp67"(#loc89)) +#loc227 = loc("tmp68"(#loc90)) +#loc228 = loc("tmp70"(#loc91)) +#loc229 = loc("tmp70"(#loc92)) +#loc230 = loc("tmp70"(#loc93)) +#loc231 = loc("tmp70"(#loc94)) +#loc232 = loc("tmp70"(#loc95)) +#loc233 = loc("tmp70"(#loc96)) +#loc234 = loc("tmp72"(#loc97)) +#loc235 = loc("tmp73"(#loc98)) +#loc236 = loc("tmp74"(#loc99)) +#loc237 = loc("tmp75"(#loc100)) +#loc238 = loc("tmp76"(#loc101)) +#loc239 = loc("tmp76"(#loc102)) +#loc240 = loc("tmp76"(#loc103)) +#loc241 = loc("tmp78"(#loc104)) +#loc242 = loc("tmp80"(#loc105)) +#loc243 = loc("tmp82"(#loc106)) +#loc244 = loc("tmp83"(#loc107)) +#loc245 = loc("tmp83"(#loc108)) +#loc246 = loc("tmp83"(#loc109)) +#loc247 = loc("tmp83"(#loc110)) +#loc248 = loc("tmp83"(#loc111)) +#loc249 = loc("tmp83"(#loc112)) +#loc250 = loc("tmp88"(#loc113)) +#loc251 = loc("tmp89"(#loc114)) +#loc252 = loc("tmp89"(#loc115)) +#loc253 = loc("tmp89"(#loc116)) +#loc254 = loc("tmp91"(#loc117)) +#loc255 = loc("tmp94"(#loc118)) +#loc256 = loc("tmp95"(#loc119)) +#loc257 = loc("tmp101"(#loc120)) +#loc258 = loc("tmp104"(#loc121)) +#loc259 = loc("tmp107"(#loc122)) +#loc260 = loc("tmp109"(#loc123)) +#loc261 = loc("tmp110"(#loc124)) +#loc262 = loc(callsite(#loc31 at #loc170)) +#loc264 = loc(callsite(#loc31 at #loc172)) +#loc266 = loc(callsite(#loc33 at #loc262)) +#loc267 = loc(callsite(#loc33 at #loc264)) diff --git a/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/__grp__triton_poi_fused_mul_silu_split_0.json b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/__grp__triton_poi_fused_mul_silu_split_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8cb5fbe038a5186e47a489e40bb8c19b5ea19bb2 --- /dev/null +++ b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/__grp__triton_poi_fused_mul_silu_split_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_mul_silu_split_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.source", "triton_poi_fused_mul_silu_split_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.ttir", "triton_poi_fused_mul_silu_split_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.ttgir", "triton_poi_fused_mul_silu_split_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.llir", "triton_poi_fused_mul_silu_split_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.ptx", "triton_poi_fused_mul_silu_split_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.cubin", "triton_poi_fused_mul_silu_split_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.json"}} \ No newline at end of file diff --git a/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.cubin b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..3519c5b9375dd04c14d6f407d6979d21a342579a Binary files /dev/null and b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.cubin differ diff --git a/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.json b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.json new file mode 100644 index 0000000000000000000000000000000000000000..02f1bc3c82364cc1aec481190a7930b0c283a089 --- /dev/null +++ b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.json @@ -0,0 +1 @@ +{"hash": "f0dfcefe16760783e7f30461c72de35988a92d8dad09b3695094292ac5bf286a", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_mul_silu_split_0"} \ No newline at end of file diff --git a/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.llir b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..749bdf56e71c5fab670af172c19630702b230ada --- /dev/null +++ b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.llir @@ -0,0 +1,176 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_mul_silu_split_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %7 = shl i32 %6, 10, !dbg !8 + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %9 = shl nuw nsw i32 %8, 3, !dbg !9 + %10 = and i32 %9, 1016, !dbg !9 + %11 = or disjoint i32 %10, %7, !dbg !10 + %12 = srem i32 %11, 12288, !dbg !11 + %13 = sub nsw i32 %11, %12, !dbg !11 + %14 = add i32 %13, %11, !dbg !11 + %15 = sext i32 %14 to i64, !dbg !12 + %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !12 + %17 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %16) #3, !dbg !13 + %18 = extractvalue { i32, i32, i32, i32 } %17, 0, !dbg !13 + %19 = bitcast i32 %18 to <2 x bfloat>, !dbg !13 + %20 = extractvalue { i32, i32, i32, i32 } %17, 1, !dbg !13 + %21 = bitcast i32 %20 to <2 x bfloat>, !dbg !13 + %22 = extractvalue { i32, i32, i32, i32 } %17, 2, !dbg !13 + %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !13 + %24 = extractvalue { i32, i32, i32, i32 } %17, 3, !dbg !13 + %25 = bitcast i32 %24 to <2 x bfloat>, !dbg !13 + %26 = add i32 %14, 12288, !dbg !14 + %27 = sext i32 %26 to i64, !dbg !15 + %28 = getelementptr bfloat, ptr addrspace(1) %0, i64 %27, !dbg !15 + %29 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %28) #3, !dbg !16 + %30 = extractvalue { i32, i32, i32, i32 } %29, 0, !dbg !16 + %31 = bitcast i32 %30 to <2 x bfloat>, !dbg !16 + %32 = extractvalue { i32, i32, i32, i32 } %29, 1, !dbg !16 + %33 = bitcast i32 %32 to <2 x bfloat>, !dbg !16 + %34 = extractvalue { i32, i32, i32, i32 } %29, 2, !dbg !16 + %35 = bitcast i32 %34 to <2 x bfloat>, !dbg !16 + %36 = extractvalue { i32, i32, i32, i32 } %29, 3, !dbg !16 + %37 = bitcast i32 %36 to <2 x bfloat>, !dbg !16 + %38 = sext i32 %11 to i64, !dbg !17 + %39 = getelementptr bfloat, ptr addrspace(1) %1, i64 %38, !dbg !17 + %40 = fpext <2 x bfloat> %19 to <2 x float>, !dbg !18 + %41 = fpext <2 x bfloat> %31 to <2 x float>, !dbg !19 + %42 = extractelement <2 x float> %40, i64 0, !dbg !20 + %43 = fsub float 0.000000e+00, %42, !dbg !20 + %44 = extractelement <2 x float> %40, i64 1, !dbg !20 + %45 = fsub float 0.000000e+00, %44, !dbg !20 + %46 = fmul float %43, 0x3FF7154760000000, !dbg !25 + %47 = tail call float @llvm.nvvm.ex2.approx.f(float %46), !dbg !25 + %48 = fmul float %45, 0x3FF7154760000000, !dbg !25 + %49 = tail call float @llvm.nvvm.ex2.approx.f(float %48), !dbg !25 + %50 = fadd float %47, 1.000000e+00, !dbg !26 + %51 = fadd float %49, 1.000000e+00, !dbg !26 + %52 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %50), !dbg !27 + %53 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %51), !dbg !27 + %54 = insertelement <2 x float> poison, float %52, i64 0, !dbg !28 + %55 = insertelement <2 x float> %54, float %53, i64 1, !dbg !28 + %56 = fmul <2 x float> %55, %40, !dbg !28 + %57 = fmul <2 x float> %56, %41, !dbg !29 + %58 = fptrunc <2 x float> %57 to <2 x bfloat>, !dbg !30 + %59 = fpext <2 x bfloat> %21 to <2 x float>, !dbg !18 + %60 = fpext <2 x bfloat> %33 to <2 x float>, !dbg !19 + %61 = extractelement <2 x float> %59, i64 0, !dbg !20 + %62 = fsub float 0.000000e+00, %61, !dbg !20 + %63 = extractelement <2 x float> %59, i64 1, !dbg !20 + %64 = fsub float 0.000000e+00, %63, !dbg !20 + %65 = fmul float %62, 0x3FF7154760000000, !dbg !25 + %66 = tail call float @llvm.nvvm.ex2.approx.f(float %65), !dbg !25 + %67 = fmul float %64, 0x3FF7154760000000, !dbg !25 + %68 = tail call float @llvm.nvvm.ex2.approx.f(float %67), !dbg !25 + %69 = fadd float %66, 1.000000e+00, !dbg !26 + %70 = fadd float %68, 1.000000e+00, !dbg !26 + %71 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %69), !dbg !27 + %72 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %70), !dbg !27 + %73 = insertelement <2 x float> poison, float %71, i64 0, !dbg !28 + %74 = insertelement <2 x float> %73, float %72, i64 1, !dbg !28 + %75 = fmul <2 x float> %74, %59, !dbg !28 + %76 = fmul <2 x float> %75, %60, !dbg !29 + %77 = fptrunc <2 x float> %76 to <2 x bfloat>, !dbg !30 + %78 = fpext <2 x bfloat> %23 to <2 x float>, !dbg !18 + %79 = fpext <2 x bfloat> %35 to <2 x float>, !dbg !19 + %80 = extractelement <2 x float> %78, i64 0, !dbg !20 + %81 = fsub float 0.000000e+00, %80, !dbg !20 + %82 = extractelement <2 x float> %78, i64 1, !dbg !20 + %83 = fsub float 0.000000e+00, %82, !dbg !20 + %84 = fmul float %81, 0x3FF7154760000000, !dbg !25 + %85 = tail call float @llvm.nvvm.ex2.approx.f(float %84), !dbg !25 + %86 = fmul float %83, 0x3FF7154760000000, !dbg !25 + %87 = tail call float @llvm.nvvm.ex2.approx.f(float %86), !dbg !25 + %88 = fadd float %85, 1.000000e+00, !dbg !26 + %89 = fadd float %87, 1.000000e+00, !dbg !26 + %90 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %88), !dbg !27 + %91 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %89), !dbg !27 + %92 = insertelement <2 x float> poison, float %90, i64 0, !dbg !28 + %93 = insertelement <2 x float> %92, float %91, i64 1, !dbg !28 + %94 = fmul <2 x float> %93, %78, !dbg !28 + %95 = fmul <2 x float> %94, %79, !dbg !29 + %96 = fptrunc <2 x float> %95 to <2 x bfloat>, !dbg !30 + %97 = fpext <2 x bfloat> %25 to <2 x float>, !dbg !18 + %98 = fpext <2 x bfloat> %37 to <2 x float>, !dbg !19 + %99 = extractelement <2 x float> %97, i64 0, !dbg !20 + %100 = fsub float 0.000000e+00, %99, !dbg !20 + %101 = extractelement <2 x float> %97, i64 1, !dbg !20 + %102 = fsub float 0.000000e+00, %101, !dbg !20 + %103 = fmul float %100, 0x3FF7154760000000, !dbg !25 + %104 = tail call float @llvm.nvvm.ex2.approx.f(float %103), !dbg !25 + %105 = fmul float %102, 0x3FF7154760000000, !dbg !25 + %106 = tail call float @llvm.nvvm.ex2.approx.f(float %105), !dbg !25 + %107 = fadd float %104, 1.000000e+00, !dbg !26 + %108 = fadd float %106, 1.000000e+00, !dbg !26 + %109 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %107), !dbg !27 + %110 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %108), !dbg !27 + %111 = insertelement <2 x float> poison, float %109, i64 0, !dbg !28 + %112 = insertelement <2 x float> %111, float %110, i64 1, !dbg !28 + %113 = fmul <2 x float> %112, %97, !dbg !28 + %114 = fmul <2 x float> %113, %98, !dbg !29 + %115 = fptrunc <2 x float> %114 to <2 x bfloat>, !dbg !30 + %116 = bitcast <2 x bfloat> %58 to i32, !dbg !30 + %117 = bitcast <2 x bfloat> %77 to i32, !dbg !30 + %118 = bitcast <2 x bfloat> %96 to i32, !dbg !30 + %119 = bitcast <2 x bfloat> %115 to i32, !dbg !30 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %116, i32 %117, i32 %118, i32 %119, ptr addrspace(1) %39) #3, !dbg !30 + ret void, !dbg !31 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.f(float) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #3 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_mul_silu_split_0", linkageName: "triton_poi_fused_mul_silu_split_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 35, scope: !4) +!12 = !DILocation(line: 26, column: 30, scope: !4) +!13 = !DILocation(line: 26, column: 46, scope: !4) +!14 = !DILocation(line: 27, column: 43, scope: !4) +!15 = !DILocation(line: 27, column: 30, scope: !4) +!16 = !DILocation(line: 27, column: 54, scope: !4) +!17 = !DILocation(line: 33, column: 25, scope: !4) +!18 = !DILocation(line: 26, column: 55, scope: !4) +!19 = !DILocation(line: 27, column: 63, scope: !4) +!20 = !DILocation(line: 50, column: 30, scope: !21, inlinedAt: !23) +!21 = distinct !DILexicalBlockFile(scope: !4, file: !22, discriminator: 0) +!22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!23 = !DILocation(line: 29, column: 22, scope: !24) +!24 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0) +!25 = !DILocation(line: 50, column: 29, scope: !21, inlinedAt: !23) +!26 = !DILocation(line: 50, column: 20, scope: !21, inlinedAt: !23) +!27 = !DILocation(line: 50, column: 16, scope: !21, inlinedAt: !23) +!28 = !DILocation(line: 30, column: 18, scope: !4) +!29 = !DILocation(line: 32, column: 18, scope: !4) +!30 = !DILocation(line: 33, column: 36, scope: !4) +!31 = !DILocation(line: 33, column: 4, scope: !4) diff --git a/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.ptx b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..05aaa9b7dd33c30603eeeee1bfce35c5cc1814f1 --- /dev/null +++ b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.ptx @@ -0,0 +1,539 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused_mul_silu_split_0 // -- Begin function triton_poi_fused_mul_silu_split_0 + // @triton_poi_fused_mul_silu_split_0 +.visible .entry triton_poi_fused_mul_silu_split_0( + .param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_1, + .param .u32 triton_poi_fused_mul_silu_split_0_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_4 +) +.reqntid 128 +{ + .reg .b16 %rs<17>; + .reg .b32 %r<99>; + .reg .b64 %rd<6>; + .loc 1 18 0 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:18:0 + +// %bb.0: + ld.param.b64 %rd4, [triton_poi_fused_mul_silu_split_0_param_0]; + ld.param.b64 %rd5, [triton_poi_fused_mul_silu_split_0_param_1]; +$L__tmp0: + .loc 1 20 28 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:20:28 + mov.u32 %r13, %ctaid.x; + .loc 1 20 33 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:20:33 + shl.b32 %r14, %r13, 10; + .loc 1 21 36 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:21:36 + mov.u32 %r15, %tid.x; + shl.b32 %r16, %r15, 3; + and.b32 %r17, %r16, 1016; + .loc 1 21 23 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:21:23 + or.b32 %r18, %r17, %r14; + .loc 1 26 35 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:26:35 + mul.hi.s32 %r19, %r18, 715827883; + shr.u32 %r20, %r19, 31; + shr.u32 %r21, %r19, 11; + add.s32 %r22, %r21, %r20; + mad.lo.s32 %r23, %r22, 12288, %r18; + .loc 1 26 30 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:26:30 + mad.wide.s32 %rd1, %r23, 2, %rd4; + .loc 1 26 46 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:26:46 + // begin inline asm + mov.u32 %r1, 0x0; + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + ld.global.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ]; + // end inline asm + .loc 1 27 43 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:27:43 + add.s32 %r24, %r23, 12288; + .loc 1 27 30 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:27:30 + mad.wide.s32 %rd2, %r24, 2, %rd4; + .loc 1 27 54 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:27:54 + // begin inline asm + mov.u32 %r5, 0x0; + mov.u32 %r6, 0x0; + mov.u32 %r7, 0x0; + mov.u32 %r8, 0x0; + ld.global.v4.b32 { %r5, %r6, %r7, %r8 }, [ %rd2 + 0 ]; + // end inline asm + .loc 1 33 25 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:33:25 + mad.wide.s32 %rd3, %r18, 2, %rd5; + .loc 1 26 55 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:26:55 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r25, %rs2; + cvt.f32.bf16 %r26, %rs1; + .loc 1 27 63 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:27:63 + mov.b32 {%rs3, %rs4}, %r5; + cvt.f32.bf16 %r27, %rs4; + cvt.f32.bf16 %r28, %rs3; + mov.b32 %r29, 0f00000000; +$L__tmp1: + .loc 2 50 30 // standard.py:50:30 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ] + sub.f32 %r30, %r29, %r26; + sub.f32 %r31, %r29, %r25; + .loc 2 50 29 // standard.py:50:29 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ] + mul.f32 %r32, %r30, 0f3FB8AA3B; + ex2.approx.f32 %r33, %r32; + mul.f32 %r34, %r31, 0f3FB8AA3B; + ex2.approx.f32 %r35, %r34; + .loc 2 50 20 // standard.py:50:20 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ] + add.f32 %r36, %r33, 0f3F800000; + add.f32 %r37, %r35, 0f3F800000; + mov.b32 %r38, 0f3F800000; + .loc 2 50 16 // standard.py:50:16 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ] + div.full.f32 %r39, %r38, %r36; + div.full.f32 %r40, %r38, %r37; +$L__tmp2: + .loc 1 30 18 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:30:18 + mul.f32 %r41, %r40, %r25; + mul.f32 %r42, %r39, %r26; + .loc 1 32 18 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:32:18 + mul.f32 %r43, %r42, %r28; + mul.f32 %r44, %r41, %r27; + .loc 1 33 36 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:33:36 + cvt.rn.bf16x2.f32 %r9, %r44, %r43; + .loc 1 26 55 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:26:55 + mov.b32 {%rs5, %rs6}, %r2; + cvt.f32.bf16 %r45, %rs6; + cvt.f32.bf16 %r46, %rs5; + .loc 1 27 63 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:27:63 + mov.b32 {%rs7, %rs8}, %r6; + cvt.f32.bf16 %r47, %rs8; + cvt.f32.bf16 %r48, %rs7; +$L__tmp3: + .loc 2 50 30 // standard.py:50:30 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ] + sub.f32 %r49, %r29, %r46; + sub.f32 %r50, %r29, %r45; + .loc 2 50 29 // standard.py:50:29 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ] + mul.f32 %r51, %r49, 0f3FB8AA3B; + ex2.approx.f32 %r52, %r51; + mul.f32 %r53, %r50, 0f3FB8AA3B; + ex2.approx.f32 %r54, %r53; + .loc 2 50 20 // standard.py:50:20 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ] + add.f32 %r55, %r52, 0f3F800000; + add.f32 %r56, %r54, 0f3F800000; + .loc 2 50 16 // standard.py:50:16 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ] + div.full.f32 %r57, %r38, %r55; + div.full.f32 %r58, %r38, %r56; +$L__tmp4: + .loc 1 30 18 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:30:18 + mul.f32 %r59, %r58, %r45; + mul.f32 %r60, %r57, %r46; + .loc 1 32 18 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:32:18 + mul.f32 %r61, %r60, %r48; + mul.f32 %r62, %r59, %r47; + .loc 1 33 36 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:33:36 + cvt.rn.bf16x2.f32 %r10, %r62, %r61; + .loc 1 26 55 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:26:55 + mov.b32 {%rs9, %rs10}, %r3; + cvt.f32.bf16 %r63, %rs10; + cvt.f32.bf16 %r64, %rs9; + .loc 1 27 63 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:27:63 + mov.b32 {%rs11, %rs12}, %r7; + cvt.f32.bf16 %r65, %rs12; + cvt.f32.bf16 %r66, %rs11; +$L__tmp5: + .loc 2 50 30 // standard.py:50:30 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ] + sub.f32 %r67, %r29, %r64; + sub.f32 %r68, %r29, %r63; + .loc 2 50 29 // standard.py:50:29 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ] + mul.f32 %r69, %r67, 0f3FB8AA3B; + ex2.approx.f32 %r70, %r69; + mul.f32 %r71, %r68, 0f3FB8AA3B; + ex2.approx.f32 %r72, %r71; + .loc 2 50 20 // standard.py:50:20 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ] + add.f32 %r73, %r70, 0f3F800000; + add.f32 %r74, %r72, 0f3F800000; + .loc 2 50 16 // standard.py:50:16 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ] + div.full.f32 %r75, %r38, %r73; + div.full.f32 %r76, %r38, %r74; +$L__tmp6: + .loc 1 30 18 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:30:18 + mul.f32 %r77, %r76, %r63; + mul.f32 %r78, %r75, %r64; + .loc 1 32 18 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:32:18 + mul.f32 %r79, %r78, %r66; + mul.f32 %r80, %r77, %r65; + .loc 1 33 36 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:33:36 + cvt.rn.bf16x2.f32 %r11, %r80, %r79; + .loc 1 26 55 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:26:55 + mov.b32 {%rs13, %rs14}, %r4; + cvt.f32.bf16 %r81, %rs14; + cvt.f32.bf16 %r82, %rs13; + .loc 1 27 63 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:27:63 + mov.b32 {%rs15, %rs16}, %r8; + cvt.f32.bf16 %r83, %rs16; + cvt.f32.bf16 %r84, %rs15; +$L__tmp7: + .loc 2 50 30 // standard.py:50:30 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ] + sub.f32 %r85, %r29, %r82; + sub.f32 %r86, %r29, %r81; + .loc 2 50 29 // standard.py:50:29 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ] + mul.f32 %r87, %r85, 0f3FB8AA3B; + ex2.approx.f32 %r88, %r87; + mul.f32 %r89, %r86, 0f3FB8AA3B; + ex2.approx.f32 %r90, %r89; + .loc 2 50 20 // standard.py:50:20 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ] + add.f32 %r91, %r88, 0f3F800000; + add.f32 %r92, %r90, 0f3F800000; + .loc 2 50 16 // standard.py:50:16 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ] + div.full.f32 %r93, %r38, %r91; + div.full.f32 %r94, %r38, %r92; +$L__tmp8: + .loc 1 30 18 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:30:18 + mul.f32 %r95, %r94, %r81; + mul.f32 %r96, %r93, %r82; + .loc 1 32 18 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:32:18 + mul.f32 %r97, %r96, %r84; + mul.f32 %r98, %r95, %r83; + .loc 1 33 36 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:33:36 + cvt.rn.bf16x2.f32 %r12, %r98, %r97; + // begin inline asm + st.global.v4.b32 [ %rd3 + 0 ], { %r9, %r10, %r11, %r12 }; + // end inline asm + .loc 1 33 4 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:33:4 + ret; +$L__tmp9: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 307 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x12c DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 54 +.b8 119 +.b8 54 +.b8 115 +.b8 103 +.b8 52 +.b8 118 +.b8 51 +.b8 98 +.b8 99 +.b8 105 +.b8 103 +.b8 104 +.b8 119 +.b8 111 +.b8 107 +.b8 122 +.b8 113 +.b8 54 +.b8 105 +.b8 52 +.b8 51 +.b8 116 +.b8 108 +.b8 53 +.b8 120 +.b8 107 +.b8 53 +.b8 118 +.b8 122 +.b8 55 +.b8 122 +.b8 101 +.b8 118 +.b8 117 +.b8 107 +.b8 55 +.b8 106 +.b8 104 +.b8 118 +.b8 108 +.b8 113 +.b8 121 +.b8 114 +.b8 121 +.b8 121 +.b8 117 +.b8 104 +.b8 117 +.b8 101 +.b8 111 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 54 +.b8 119 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x24 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 111 +.b8 105 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 115 +.b8 105 +.b8 108 +.b8 117 +.b8 95 +.b8 115 +.b8 112 +.b8 108 +.b8 105 +.b8 116 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x108:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x11d:0x18 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 29 // DW_AT_call_line +.b8 22 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.source b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.source new file mode 100644 index 0000000000000000000000000000000000000000..11c465fdf51c6ae2895be0136afd18f90cc2fb5a --- /dev/null +++ b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.source @@ -0,0 +1,129 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":18:0) +#loc26 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":49:0) +#loc33 = loc("in_ptr0"(#loc)) +#loc34 = loc("out_ptr0"(#loc)) +#loc35 = loc("xnumel"(#loc)) +#loc58 = loc("x"(#loc26)) +module { + tt.func public @triton_poi_fused_mul_silu_split_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 25165824 : i32 loc(#loc36) + %xoffset = tt.get_program_id x : i32 loc(#loc37) + %xoffset_1 = arith.constant 1024 : i32 loc(#loc38) + %xoffset_2 = arith.constant 1024 : i32 loc(#loc38) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc38) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc39) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc40) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc40) + %xmask = arith.constant true loc(#loc41) + %xmask_6 = arith.constant dense : tensor<1024xi1> loc(#loc41) + %x0 = arith.constant 12288 : i32 loc(#loc42) + %x0_7 = arith.constant 12288 : i32 loc(#loc42) + %x0_8 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc42) + %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<1024xi32> loc(#loc42) + %x1 = arith.constant 12288 : i32 loc(#loc43) + %x1_10 = arith.constant 12288 : i32 loc(#loc43) + %x1_11 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc43) + %x1_12 = arith.divsi %xindex_5, %x1_11 : tensor<1024xi32> loc(#loc43) + %tmp0 = arith.constant 24576 : i32 loc(#loc44) + %tmp0_13 = arith.constant 24576 : i32 loc(#loc44) + %tmp0_14 = arith.constant dense<24576> : tensor<1024xi32> loc(#loc44) + %tmp0_15 = arith.muli %tmp0_14, %x1_12 : tensor<1024xi32> loc(#loc44) + %tmp0_16 = arith.addi %x0_9, %tmp0_15 : tensor<1024xi32> loc(#loc45) + %tmp0_17 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc46) + %tmp0_18 = tt.addptr %tmp0_17, %tmp0_16 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc46) + %tmp0_19 = tt.load %tmp0_18 : tensor<1024x!tt.ptr> loc(#loc47) + %tmp0_20 = arith.extf %tmp0_19 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc48) + %tmp5 = arith.constant 12288 : i32 loc(#loc49) + %tmp5_21 = arith.constant 12288 : i32 loc(#loc49) + %tmp5_22 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc49) + %tmp5_23 = arith.addi %tmp5_22, %x0_9 : tensor<1024xi32> loc(#loc49) + %tmp5_24 = arith.constant 24576 : i32 loc(#loc50) + %tmp5_25 = arith.constant 24576 : i32 loc(#loc50) + %tmp5_26 = arith.constant dense<24576> : tensor<1024xi32> loc(#loc50) + %tmp5_27 = arith.muli %tmp5_26, %x1_12 : tensor<1024xi32> loc(#loc50) + %tmp5_28 = arith.addi %tmp5_23, %tmp5_27 : tensor<1024xi32> loc(#loc51) + %tmp5_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc52) + %tmp5_30 = tt.addptr %tmp5_29, %tmp5_28 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc52) + %tmp5_31 = tt.load %tmp5_30 : tensor<1024x!tt.ptr> loc(#loc53) + %tmp5_32 = arith.extf %tmp5_31 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc54) + %tmp2 = tt.call @triton.language.standard.sigmoid__fp32S1024S__(%tmp0_20) : (tensor<1024xf32>) -> tensor<1024xf32> loc(#loc55) + %tmp3 = arith.mulf %tmp0_20, %tmp2 : tensor<1024xf32> loc(#loc56) + %tmp6 = arith.mulf %tmp3, %tmp5_32 : tensor<1024xf32> loc(#loc57) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc23) + %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc23) + %2 = arith.truncf %tmp6 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc24) + tt.store %1, %2 : tensor<1024x!tt.ptr> loc(#loc24) + tt.return loc(#loc25) + } loc(#loc) + tt.func private @triton.language.standard.sigmoid__fp32S1024S__(%x: tensor<1024xf32> loc("x"(#loc26))) -> tensor<1024xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc27) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc27) + %0 = arith.subf %cst_0, %x : tensor<1024xf32> loc(#loc27) + %1 = math.exp %0 : tensor<1024xf32> loc(#loc28) + %c1_i32 = arith.constant 1 : i32 loc(#loc29) + %cst_1 = arith.constant 1.000000e+00 : f32 loc(#loc29) + %cst_2 = arith.constant dense<1.000000e+00> : tensor<1024xf32> loc(#loc29) + %2 = arith.addf %cst_2, %1 : tensor<1024xf32> loc(#loc29) + %c1_i32_3 = arith.constant 1 : i32 loc(#loc30) + %cst_4 = arith.constant 1.000000e+00 : f32 loc(#loc30) + %cst_5 = arith.constant dense<1.000000e+00> : tensor<1024xf32> loc(#loc30) + %3 = arith.divf %cst_5, %2 : tensor<1024xf32> loc(#loc30) + tt.return %3 : tensor<1024xf32> loc(#loc31) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1024xf32> loc(#loc32) + tt.return %4 : tensor<1024xf32> loc(#loc32) + } loc(#loc26) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":22:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":23:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":24:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:41) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:35) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:30) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:46) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:55) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:38) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:49) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:43) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:30) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:54) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:63) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":29:22) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":30:18) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":32:18) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:25) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:36) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:4) +#loc27 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30) +#loc28 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29) +#loc29 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20) +#loc30 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16) +#loc31 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:11) +#loc32 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:4) +#loc36 = loc("xnumel"(#loc1)) +#loc37 = loc("xoffset"(#loc2)) +#loc38 = loc("xoffset"(#loc3)) +#loc39 = loc("xindex"(#loc4)) +#loc40 = loc("xindex"(#loc5)) +#loc41 = loc("xmask"(#loc6)) +#loc42 = loc("x0"(#loc7)) +#loc43 = loc("x1"(#loc8)) +#loc44 = loc("tmp0"(#loc9)) +#loc45 = loc("tmp0"(#loc10)) +#loc46 = loc("tmp0"(#loc11)) +#loc47 = loc("tmp0"(#loc12)) +#loc48 = loc("tmp0"(#loc13)) +#loc49 = loc("tmp5"(#loc14)) +#loc50 = loc("tmp5"(#loc15)) +#loc51 = loc("tmp5"(#loc16)) +#loc52 = loc("tmp5"(#loc17)) +#loc53 = loc("tmp5"(#loc18)) +#loc54 = loc("tmp5"(#loc19)) +#loc55 = loc("tmp2"(#loc20)) +#loc56 = loc("tmp3"(#loc21)) +#loc57 = loc("tmp6"(#loc22)) diff --git a/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.ttgir b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..2644d72632d6b667223b47ab6251c7daf35bc79d --- /dev/null +++ b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.ttgir @@ -0,0 +1,93 @@ +#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":18:0) +#loc28 = loc("in_ptr0"(#loc)) +#loc29 = loc("out_ptr0"(#loc)) +#loc30 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_mul_silu_split_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<24576> : tensor<1024xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<12288> : tensor<1024xi32, #blocked> loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<1.000000e+00> : tensor<1024xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc31) + %xoffset_3 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc32) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc33) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32, #blocked> loc(#loc34) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32, #blocked> loc(#loc34) + %x0 = arith.remsi %xindex_5, %cst_0 : tensor<1024xi32, #blocked> loc(#loc35) + %x1 = arith.divsi %xindex_5, %cst_0 : tensor<1024xi32, #blocked> loc(#loc36) + %tmp0 = arith.muli %x1, %cst : tensor<1024xi32, #blocked> loc(#loc37) + %tmp0_6 = arith.addi %x0, %tmp0 : tensor<1024xi32, #blocked> loc(#loc38) + %tmp0_7 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc39) + %tmp0_8 = tt.addptr %tmp0_7, %tmp0_6 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc39) + %tmp0_9 = tt.load %tmp0_8 : tensor<1024x!tt.ptr, #blocked> loc(#loc40) + %tmp0_10 = arith.extf %tmp0_9 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc41) + %tmp5 = arith.addi %x0, %cst_0 : tensor<1024xi32, #blocked> loc(#loc42) + %tmp5_11 = arith.addi %tmp5, %tmp0 : tensor<1024xi32, #blocked> loc(#loc43) + %tmp5_12 = tt.addptr %tmp0_7, %tmp5_11 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc44) + %tmp5_13 = tt.load %tmp5_12 : tensor<1024x!tt.ptr, #blocked> loc(#loc45) + %tmp5_14 = arith.extf %tmp5_13 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc46) + %tmp2 = arith.subf %cst_1, %tmp0_10 : tensor<1024xf32, #blocked> loc(#loc50) + %tmp2_15 = math.exp %tmp2 : tensor<1024xf32, #blocked> loc(#loc51) + %tmp2_16 = arith.addf %tmp2_15, %cst_2 : tensor<1024xf32, #blocked> loc(#loc52) + %tmp2_17 = arith.divf %cst_2, %tmp2_16 : tensor<1024xf32, #blocked> loc(#loc53) + %tmp3 = arith.mulf %tmp0_10, %tmp2_17 : tensor<1024xf32, #blocked> loc(#loc48) + %tmp6 = arith.mulf %tmp3, %tmp5_14 : tensor<1024xf32, #blocked> loc(#loc49) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc25) + %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc25) + %2 = arith.truncf %tmp6 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> loc(#loc26) + tt.store %1, %2 : tensor<1024x!tt.ptr, #blocked> loc(#loc26) + tt.return loc(#loc27) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":23:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":24:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:41) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:35) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:30) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:46) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:55) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:38) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:30) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:54) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:63) +#loc18 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":29:22) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29) +#loc21 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":30:18) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":32:18) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:25) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:36) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:4) +#loc31 = loc("xoffset"(#loc2)) +#loc32 = loc("xoffset"(#loc3)) +#loc33 = loc("xindex"(#loc4)) +#loc34 = loc("xindex"(#loc5)) +#loc35 = loc("x0"(#loc6)) +#loc36 = loc("x1"(#loc7)) +#loc37 = loc("tmp0"(#loc8)) +#loc38 = loc("tmp0"(#loc9)) +#loc39 = loc("tmp0"(#loc10)) +#loc40 = loc("tmp0"(#loc11)) +#loc41 = loc("tmp0"(#loc12)) +#loc42 = loc("tmp5"(#loc13)) +#loc43 = loc("tmp5"(#loc14)) +#loc44 = loc("tmp5"(#loc15)) +#loc45 = loc("tmp5"(#loc16)) +#loc46 = loc("tmp5"(#loc17)) +#loc47 = loc("tmp2"(#loc19)) +#loc48 = loc("tmp3"(#loc23)) +#loc49 = loc("tmp6"(#loc24)) +#loc50 = loc(callsite(#loc18 at #loc47)) +#loc51 = loc(callsite(#loc20 at #loc47)) +#loc52 = loc(callsite(#loc21 at #loc47)) +#loc53 = loc(callsite(#loc22 at #loc47)) diff --git a/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.ttir b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..9a61f0406e185eab580c3b19e50d7d4bc20a07c5 --- /dev/null +++ b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.ttir @@ -0,0 +1,93 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":18:0) +#loc28 = loc("in_ptr0"(#loc)) +#loc29 = loc("out_ptr0"(#loc)) +#loc30 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_mul_silu_split_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %tmp2 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc50) + %tmp2_0 = arith.constant dense<1.000000e+00> : tensor<1024xf32> loc(#loc51) + %cst = arith.constant dense<24576> : tensor<1024xi32> loc(#loc3) + %cst_1 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc3) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc3) + %xoffset = tt.get_program_id x : i32 loc(#loc32) + %xoffset_2 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc33) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc34) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<1024xi32> loc(#loc35) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<1024xi32> loc(#loc35) + %x0 = arith.remsi %xindex_4, %cst_1 : tensor<1024xi32> loc(#loc36) + %x1 = arith.divsi %xindex_4, %cst_1 : tensor<1024xi32> loc(#loc37) + %tmp0 = arith.muli %x1, %cst : tensor<1024xi32> loc(#loc38) + %tmp0_5 = arith.addi %x0, %tmp0 : tensor<1024xi32> loc(#loc39) + %tmp0_6 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc40) + %tmp0_7 = tt.addptr %tmp0_6, %tmp0_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc40) + %tmp0_8 = tt.load %tmp0_7 : tensor<1024x!tt.ptr> loc(#loc41) + %tmp0_9 = arith.extf %tmp0_8 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc42) + %tmp5 = arith.addi %x0, %cst_1 : tensor<1024xi32> loc(#loc43) + %tmp5_10 = arith.addi %tmp5, %tmp0 : tensor<1024xi32> loc(#loc44) + %tmp5_11 = tt.addptr %tmp0_6, %tmp5_10 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc45) + %tmp5_12 = tt.load %tmp5_11 : tensor<1024x!tt.ptr> loc(#loc46) + %tmp5_13 = arith.extf %tmp5_12 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc47) + %tmp2_14 = arith.subf %tmp2, %tmp0_9 : tensor<1024xf32> loc(#loc50) + %tmp2_15 = math.exp %tmp2_14 : tensor<1024xf32> loc(#loc52) + %tmp2_16 = arith.addf %tmp2_15, %tmp2_0 : tensor<1024xf32> loc(#loc53) + %tmp2_17 = arith.divf %tmp2_0, %tmp2_16 : tensor<1024xf32> loc(#loc54) + %tmp3 = arith.mulf %tmp0_9, %tmp2_17 : tensor<1024xf32> loc(#loc48) + %tmp6 = arith.mulf %tmp3, %tmp5_13 : tensor<1024xf32> loc(#loc49) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc25) + %1 = tt.addptr %0, %xindex_4 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc25) + %2 = arith.truncf %tmp6 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc26) + tt.store %1, %2 : tensor<1024x!tt.ptr> loc(#loc26) + tt.return loc(#loc27) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":29:22) +#loc3 = loc(unknown) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":20:28) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":20:33) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":21:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":21:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":23:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":24:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:41) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:30) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:46) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:55) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:38) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:43) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:30) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:54) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:63) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29) +#loc21 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":30:18) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":32:18) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:25) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:36) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:4) +#loc31 = loc("tmp2"(#loc2)) +#loc32 = loc("xoffset"(#loc4)) +#loc33 = loc("xoffset"(#loc5)) +#loc34 = loc("xindex"(#loc6)) +#loc35 = loc("xindex"(#loc7)) +#loc36 = loc("x0"(#loc8)) +#loc37 = loc("x1"(#loc9)) +#loc38 = loc("tmp0"(#loc10)) +#loc39 = loc("tmp0"(#loc11)) +#loc40 = loc("tmp0"(#loc12)) +#loc41 = loc("tmp0"(#loc13)) +#loc42 = loc("tmp0"(#loc14)) +#loc43 = loc("tmp5"(#loc15)) +#loc44 = loc("tmp5"(#loc16)) +#loc45 = loc("tmp5"(#loc17)) +#loc46 = loc("tmp5"(#loc18)) +#loc47 = loc("tmp5"(#loc19)) +#loc48 = loc("tmp3"(#loc23)) +#loc49 = loc("tmp6"(#loc24)) +#loc50 = loc(callsite(#loc1 at #loc31)) +#loc51 = loc(callsite(#loc3 at #loc31)) +#loc52 = loc(callsite(#loc20 at #loc31)) +#loc53 = loc(callsite(#loc21 at #loc31)) +#loc54 = loc(callsite(#loc22 at #loc31)) diff --git a/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/__grp__triton_poi_fused_mul_silu_split_0.json b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/__grp__triton_poi_fused_mul_silu_split_0.json new file mode 100644 index 0000000000000000000000000000000000000000..aa37e98aba3afb74f4527b0c87f14a4aa1cb622d --- /dev/null +++ b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/__grp__triton_poi_fused_mul_silu_split_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_mul_silu_split_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.source", "triton_poi_fused_mul_silu_split_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.ttir", "triton_poi_fused_mul_silu_split_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.ttgir", "triton_poi_fused_mul_silu_split_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.llir", "triton_poi_fused_mul_silu_split_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.ptx", "triton_poi_fused_mul_silu_split_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.cubin", "triton_poi_fused_mul_silu_split_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.json"}} \ No newline at end of file diff --git a/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.cubin b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..93018b71b547598981c1b3c0d22570bc5a309414 Binary files /dev/null and b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.cubin differ diff --git a/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.json b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e8ba81b358700e8812c5e4f3be3c75adde0719cf --- /dev/null +++ b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.json @@ -0,0 +1 @@ +{"hash": "ffcabc751b8432e59ff4835a1a069005288ea2d2099d0cf63bc686e250b64600", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_mul_silu_split_0"} \ No newline at end of file diff --git a/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.llir b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..5cad5920af6565e198f3dc793f6a8cddac09e232 --- /dev/null +++ b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.llir @@ -0,0 +1,176 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_mul_silu_split_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %7 = shl i32 %6, 10, !dbg !8 + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %9 = shl nuw nsw i32 %8, 3, !dbg !9 + %10 = and i32 %9, 1016, !dbg !9 + %11 = or disjoint i32 %10, %7, !dbg !10 + %12 = srem i32 %11, 12288, !dbg !11 + %13 = sub nsw i32 %11, %12, !dbg !11 + %14 = add i32 %13, %11, !dbg !11 + %15 = sext i32 %14 to i64, !dbg !12 + %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !12 + %17 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %16) #3, !dbg !13 + %18 = extractvalue { i32, i32, i32, i32 } %17, 0, !dbg !13 + %19 = bitcast i32 %18 to <2 x bfloat>, !dbg !13 + %20 = extractvalue { i32, i32, i32, i32 } %17, 1, !dbg !13 + %21 = bitcast i32 %20 to <2 x bfloat>, !dbg !13 + %22 = extractvalue { i32, i32, i32, i32 } %17, 2, !dbg !13 + %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !13 + %24 = extractvalue { i32, i32, i32, i32 } %17, 3, !dbg !13 + %25 = bitcast i32 %24 to <2 x bfloat>, !dbg !13 + %26 = add i32 %14, 12288, !dbg !14 + %27 = sext i32 %26 to i64, !dbg !15 + %28 = getelementptr bfloat, ptr addrspace(1) %0, i64 %27, !dbg !15 + %29 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %28) #3, !dbg !16 + %30 = extractvalue { i32, i32, i32, i32 } %29, 0, !dbg !16 + %31 = bitcast i32 %30 to <2 x bfloat>, !dbg !16 + %32 = extractvalue { i32, i32, i32, i32 } %29, 1, !dbg !16 + %33 = bitcast i32 %32 to <2 x bfloat>, !dbg !16 + %34 = extractvalue { i32, i32, i32, i32 } %29, 2, !dbg !16 + %35 = bitcast i32 %34 to <2 x bfloat>, !dbg !16 + %36 = extractvalue { i32, i32, i32, i32 } %29, 3, !dbg !16 + %37 = bitcast i32 %36 to <2 x bfloat>, !dbg !16 + %38 = sext i32 %11 to i64, !dbg !17 + %39 = getelementptr bfloat, ptr addrspace(1) %1, i64 %38, !dbg !17 + %40 = fpext <2 x bfloat> %19 to <2 x float>, !dbg !18 + %41 = fpext <2 x bfloat> %31 to <2 x float>, !dbg !19 + %42 = extractelement <2 x float> %40, i64 0, !dbg !20 + %43 = fsub float 0.000000e+00, %42, !dbg !20 + %44 = extractelement <2 x float> %40, i64 1, !dbg !20 + %45 = fsub float 0.000000e+00, %44, !dbg !20 + %46 = fmul float %43, 0x3FF7154760000000, !dbg !25 + %47 = tail call float @llvm.nvvm.ex2.approx.f(float %46), !dbg !25 + %48 = fmul float %45, 0x3FF7154760000000, !dbg !25 + %49 = tail call float @llvm.nvvm.ex2.approx.f(float %48), !dbg !25 + %50 = fadd float %47, 1.000000e+00, !dbg !26 + %51 = fadd float %49, 1.000000e+00, !dbg !26 + %52 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %50), !dbg !27 + %53 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %51), !dbg !27 + %54 = insertelement <2 x float> poison, float %52, i64 0, !dbg !28 + %55 = insertelement <2 x float> %54, float %53, i64 1, !dbg !28 + %56 = fmul <2 x float> %55, %40, !dbg !28 + %57 = fmul <2 x float> %56, %41, !dbg !29 + %58 = fptrunc <2 x float> %57 to <2 x bfloat>, !dbg !30 + %59 = fpext <2 x bfloat> %21 to <2 x float>, !dbg !18 + %60 = fpext <2 x bfloat> %33 to <2 x float>, !dbg !19 + %61 = extractelement <2 x float> %59, i64 0, !dbg !20 + %62 = fsub float 0.000000e+00, %61, !dbg !20 + %63 = extractelement <2 x float> %59, i64 1, !dbg !20 + %64 = fsub float 0.000000e+00, %63, !dbg !20 + %65 = fmul float %62, 0x3FF7154760000000, !dbg !25 + %66 = tail call float @llvm.nvvm.ex2.approx.f(float %65), !dbg !25 + %67 = fmul float %64, 0x3FF7154760000000, !dbg !25 + %68 = tail call float @llvm.nvvm.ex2.approx.f(float %67), !dbg !25 + %69 = fadd float %66, 1.000000e+00, !dbg !26 + %70 = fadd float %68, 1.000000e+00, !dbg !26 + %71 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %69), !dbg !27 + %72 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %70), !dbg !27 + %73 = insertelement <2 x float> poison, float %71, i64 0, !dbg !28 + %74 = insertelement <2 x float> %73, float %72, i64 1, !dbg !28 + %75 = fmul <2 x float> %74, %59, !dbg !28 + %76 = fmul <2 x float> %75, %60, !dbg !29 + %77 = fptrunc <2 x float> %76 to <2 x bfloat>, !dbg !30 + %78 = fpext <2 x bfloat> %23 to <2 x float>, !dbg !18 + %79 = fpext <2 x bfloat> %35 to <2 x float>, !dbg !19 + %80 = extractelement <2 x float> %78, i64 0, !dbg !20 + %81 = fsub float 0.000000e+00, %80, !dbg !20 + %82 = extractelement <2 x float> %78, i64 1, !dbg !20 + %83 = fsub float 0.000000e+00, %82, !dbg !20 + %84 = fmul float %81, 0x3FF7154760000000, !dbg !25 + %85 = tail call float @llvm.nvvm.ex2.approx.f(float %84), !dbg !25 + %86 = fmul float %83, 0x3FF7154760000000, !dbg !25 + %87 = tail call float @llvm.nvvm.ex2.approx.f(float %86), !dbg !25 + %88 = fadd float %85, 1.000000e+00, !dbg !26 + %89 = fadd float %87, 1.000000e+00, !dbg !26 + %90 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %88), !dbg !27 + %91 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %89), !dbg !27 + %92 = insertelement <2 x float> poison, float %90, i64 0, !dbg !28 + %93 = insertelement <2 x float> %92, float %91, i64 1, !dbg !28 + %94 = fmul <2 x float> %93, %78, !dbg !28 + %95 = fmul <2 x float> %94, %79, !dbg !29 + %96 = fptrunc <2 x float> %95 to <2 x bfloat>, !dbg !30 + %97 = fpext <2 x bfloat> %25 to <2 x float>, !dbg !18 + %98 = fpext <2 x bfloat> %37 to <2 x float>, !dbg !19 + %99 = extractelement <2 x float> %97, i64 0, !dbg !20 + %100 = fsub float 0.000000e+00, %99, !dbg !20 + %101 = extractelement <2 x float> %97, i64 1, !dbg !20 + %102 = fsub float 0.000000e+00, %101, !dbg !20 + %103 = fmul float %100, 0x3FF7154760000000, !dbg !25 + %104 = tail call float @llvm.nvvm.ex2.approx.f(float %103), !dbg !25 + %105 = fmul float %102, 0x3FF7154760000000, !dbg !25 + %106 = tail call float @llvm.nvvm.ex2.approx.f(float %105), !dbg !25 + %107 = fadd float %104, 1.000000e+00, !dbg !26 + %108 = fadd float %106, 1.000000e+00, !dbg !26 + %109 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %107), !dbg !27 + %110 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %108), !dbg !27 + %111 = insertelement <2 x float> poison, float %109, i64 0, !dbg !28 + %112 = insertelement <2 x float> %111, float %110, i64 1, !dbg !28 + %113 = fmul <2 x float> %112, %97, !dbg !28 + %114 = fmul <2 x float> %113, %98, !dbg !29 + %115 = fptrunc <2 x float> %114 to <2 x bfloat>, !dbg !30 + %116 = bitcast <2 x bfloat> %58 to i32, !dbg !30 + %117 = bitcast <2 x bfloat> %77 to i32, !dbg !30 + %118 = bitcast <2 x bfloat> %96 to i32, !dbg !30 + %119 = bitcast <2 x bfloat> %115 to i32, !dbg !30 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %116, i32 %117, i32 %118, i32 %119, ptr addrspace(1) %39) #3, !dbg !30 + ret void, !dbg !31 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.f(float) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #3 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_mul_silu_split_0", linkageName: "triton_poi_fused_mul_silu_split_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 19, column: 28, scope: !4) +!8 = !DILocation(line: 19, column: 33, scope: !4) +!9 = !DILocation(line: 20, column: 36, scope: !4) +!10 = !DILocation(line: 20, column: 23, scope: !4) +!11 = !DILocation(line: 25, column: 35, scope: !4) +!12 = !DILocation(line: 25, column: 30, scope: !4) +!13 = !DILocation(line: 25, column: 46, scope: !4) +!14 = !DILocation(line: 26, column: 43, scope: !4) +!15 = !DILocation(line: 26, column: 30, scope: !4) +!16 = !DILocation(line: 26, column: 54, scope: !4) +!17 = !DILocation(line: 32, column: 25, scope: !4) +!18 = !DILocation(line: 25, column: 55, scope: !4) +!19 = !DILocation(line: 26, column: 63, scope: !4) +!20 = !DILocation(line: 50, column: 30, scope: !21, inlinedAt: !23) +!21 = distinct !DILexicalBlockFile(scope: !4, file: !22, discriminator: 0) +!22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!23 = !DILocation(line: 28, column: 22, scope: !24) +!24 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0) +!25 = !DILocation(line: 50, column: 29, scope: !21, inlinedAt: !23) +!26 = !DILocation(line: 50, column: 20, scope: !21, inlinedAt: !23) +!27 = !DILocation(line: 50, column: 16, scope: !21, inlinedAt: !23) +!28 = !DILocation(line: 29, column: 18, scope: !4) +!29 = !DILocation(line: 31, column: 18, scope: !4) +!30 = !DILocation(line: 32, column: 36, scope: !4) +!31 = !DILocation(line: 32, column: 4, scope: !4) diff --git a/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.ptx b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..63c0ecf51590d148c248f8d1d1ae3bb18ca1f538 --- /dev/null +++ b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.ptx @@ -0,0 +1,539 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused_mul_silu_split_0 // -- Begin function triton_poi_fused_mul_silu_split_0 + // @triton_poi_fused_mul_silu_split_0 +.visible .entry triton_poi_fused_mul_silu_split_0( + .param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_1, + .param .u32 triton_poi_fused_mul_silu_split_0_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_4 +) +.reqntid 128 +{ + .reg .b16 %rs<17>; + .reg .b32 %r<99>; + .reg .b64 %rd<6>; + .loc 1 18 0 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:18:0 +$L__func_begin0: + .loc 1 18 0 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:18:0 + +// %bb.0: + ld.param.b64 %rd4, [triton_poi_fused_mul_silu_split_0_param_0]; + ld.param.b64 %rd5, [triton_poi_fused_mul_silu_split_0_param_1]; +$L__tmp0: + .loc 1 19 28 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:19:28 + mov.u32 %r13, %ctaid.x; + .loc 1 19 33 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:19:33 + shl.b32 %r14, %r13, 10; + .loc 1 20 36 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:20:36 + mov.u32 %r15, %tid.x; + shl.b32 %r16, %r15, 3; + and.b32 %r17, %r16, 1016; + .loc 1 20 23 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:20:23 + or.b32 %r18, %r17, %r14; + .loc 1 25 35 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:25:35 + mul.hi.s32 %r19, %r18, 715827883; + shr.u32 %r20, %r19, 31; + shr.u32 %r21, %r19, 11; + add.s32 %r22, %r21, %r20; + mad.lo.s32 %r23, %r22, 12288, %r18; + .loc 1 25 30 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:25:30 + mad.wide.s32 %rd1, %r23, 2, %rd4; + .loc 1 25 46 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:25:46 + // begin inline asm + mov.u32 %r1, 0x0; + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + ld.global.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ]; + // end inline asm + .loc 1 26 43 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:26:43 + add.s32 %r24, %r23, 12288; + .loc 1 26 30 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:26:30 + mad.wide.s32 %rd2, %r24, 2, %rd4; + .loc 1 26 54 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:26:54 + // begin inline asm + mov.u32 %r5, 0x0; + mov.u32 %r6, 0x0; + mov.u32 %r7, 0x0; + mov.u32 %r8, 0x0; + ld.global.v4.b32 { %r5, %r6, %r7, %r8 }, [ %rd2 + 0 ]; + // end inline asm + .loc 1 32 25 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:32:25 + mad.wide.s32 %rd3, %r18, 2, %rd5; + .loc 1 25 55 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:25:55 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r25, %rs2; + cvt.f32.bf16 %r26, %rs1; + .loc 1 26 63 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:26:63 + mov.b32 {%rs3, %rs4}, %r5; + cvt.f32.bf16 %r27, %rs4; + cvt.f32.bf16 %r28, %rs3; + mov.b32 %r29, 0f00000000; +$L__tmp1: + .loc 2 50 30 // standard.py:50:30 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ] + sub.f32 %r30, %r29, %r26; + sub.f32 %r31, %r29, %r25; + .loc 2 50 29 // standard.py:50:29 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ] + mul.f32 %r32, %r30, 0f3FB8AA3B; + ex2.approx.f32 %r33, %r32; + mul.f32 %r34, %r31, 0f3FB8AA3B; + ex2.approx.f32 %r35, %r34; + .loc 2 50 20 // standard.py:50:20 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ] + add.f32 %r36, %r33, 0f3F800000; + add.f32 %r37, %r35, 0f3F800000; + mov.b32 %r38, 0f3F800000; + .loc 2 50 16 // standard.py:50:16 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ] + div.full.f32 %r39, %r38, %r36; + div.full.f32 %r40, %r38, %r37; +$L__tmp2: + .loc 1 29 18 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:29:18 + mul.f32 %r41, %r40, %r25; + mul.f32 %r42, %r39, %r26; + .loc 1 31 18 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:31:18 + mul.f32 %r43, %r42, %r28; + mul.f32 %r44, %r41, %r27; + .loc 1 32 36 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:32:36 + cvt.rn.bf16x2.f32 %r9, %r44, %r43; + .loc 1 25 55 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:25:55 + mov.b32 {%rs5, %rs6}, %r2; + cvt.f32.bf16 %r45, %rs6; + cvt.f32.bf16 %r46, %rs5; + .loc 1 26 63 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:26:63 + mov.b32 {%rs7, %rs8}, %r6; + cvt.f32.bf16 %r47, %rs8; + cvt.f32.bf16 %r48, %rs7; +$L__tmp3: + .loc 2 50 30 // standard.py:50:30 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ] + sub.f32 %r49, %r29, %r46; + sub.f32 %r50, %r29, %r45; + .loc 2 50 29 // standard.py:50:29 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ] + mul.f32 %r51, %r49, 0f3FB8AA3B; + ex2.approx.f32 %r52, %r51; + mul.f32 %r53, %r50, 0f3FB8AA3B; + ex2.approx.f32 %r54, %r53; + .loc 2 50 20 // standard.py:50:20 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ] + add.f32 %r55, %r52, 0f3F800000; + add.f32 %r56, %r54, 0f3F800000; + .loc 2 50 16 // standard.py:50:16 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ] + div.full.f32 %r57, %r38, %r55; + div.full.f32 %r58, %r38, %r56; +$L__tmp4: + .loc 1 29 18 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:29:18 + mul.f32 %r59, %r58, %r45; + mul.f32 %r60, %r57, %r46; + .loc 1 31 18 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:31:18 + mul.f32 %r61, %r60, %r48; + mul.f32 %r62, %r59, %r47; + .loc 1 32 36 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:32:36 + cvt.rn.bf16x2.f32 %r10, %r62, %r61; + .loc 1 25 55 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:25:55 + mov.b32 {%rs9, %rs10}, %r3; + cvt.f32.bf16 %r63, %rs10; + cvt.f32.bf16 %r64, %rs9; + .loc 1 26 63 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:26:63 + mov.b32 {%rs11, %rs12}, %r7; + cvt.f32.bf16 %r65, %rs12; + cvt.f32.bf16 %r66, %rs11; +$L__tmp5: + .loc 2 50 30 // standard.py:50:30 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ] + sub.f32 %r67, %r29, %r64; + sub.f32 %r68, %r29, %r63; + .loc 2 50 29 // standard.py:50:29 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ] + mul.f32 %r69, %r67, 0f3FB8AA3B; + ex2.approx.f32 %r70, %r69; + mul.f32 %r71, %r68, 0f3FB8AA3B; + ex2.approx.f32 %r72, %r71; + .loc 2 50 20 // standard.py:50:20 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ] + add.f32 %r73, %r70, 0f3F800000; + add.f32 %r74, %r72, 0f3F800000; + .loc 2 50 16 // standard.py:50:16 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ] + div.full.f32 %r75, %r38, %r73; + div.full.f32 %r76, %r38, %r74; +$L__tmp6: + .loc 1 29 18 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:29:18 + mul.f32 %r77, %r76, %r63; + mul.f32 %r78, %r75, %r64; + .loc 1 31 18 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:31:18 + mul.f32 %r79, %r78, %r66; + mul.f32 %r80, %r77, %r65; + .loc 1 32 36 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:32:36 + cvt.rn.bf16x2.f32 %r11, %r80, %r79; + .loc 1 25 55 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:25:55 + mov.b32 {%rs13, %rs14}, %r4; + cvt.f32.bf16 %r81, %rs14; + cvt.f32.bf16 %r82, %rs13; + .loc 1 26 63 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:26:63 + mov.b32 {%rs15, %rs16}, %r8; + cvt.f32.bf16 %r83, %rs16; + cvt.f32.bf16 %r84, %rs15; +$L__tmp7: + .loc 2 50 30 // standard.py:50:30 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ] + sub.f32 %r85, %r29, %r82; + sub.f32 %r86, %r29, %r81; + .loc 2 50 29 // standard.py:50:29 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ] + mul.f32 %r87, %r85, 0f3FB8AA3B; + ex2.approx.f32 %r88, %r87; + mul.f32 %r89, %r86, 0f3FB8AA3B; + ex2.approx.f32 %r90, %r89; + .loc 2 50 20 // standard.py:50:20 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ] + add.f32 %r91, %r88, 0f3F800000; + add.f32 %r92, %r90, 0f3F800000; + .loc 2 50 16 // standard.py:50:16 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ] + div.full.f32 %r93, %r38, %r91; + div.full.f32 %r94, %r38, %r92; +$L__tmp8: + .loc 1 29 18 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:29:18 + mul.f32 %r95, %r94, %r81; + mul.f32 %r96, %r93, %r82; + .loc 1 31 18 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:31:18 + mul.f32 %r97, %r96, %r84; + mul.f32 %r98, %r95, %r83; + .loc 1 32 36 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:32:36 + cvt.rn.bf16x2.f32 %r12, %r98, %r97; + // begin inline asm + st.global.v4.b32 [ %rd3 + 0 ], { %r9, %r10, %r11, %r12 }; + // end inline asm + .loc 1 32 4 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:32:4 + ret; +$L__tmp9: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 307 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x12c DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 115 +.b8 121 +.b8 97 +.b8 101 +.b8 51 +.b8 111 +.b8 107 +.b8 50 +.b8 120 +.b8 110 +.b8 122 +.b8 117 +.b8 120 +.b8 104 +.b8 106 +.b8 107 +.b8 120 +.b8 122 +.b8 104 +.b8 100 +.b8 99 +.b8 112 +.b8 99 +.b8 122 +.b8 54 +.b8 106 +.b8 99 +.b8 107 +.b8 99 +.b8 117 +.b8 51 +.b8 118 +.b8 118 +.b8 55 +.b8 101 +.b8 113 +.b8 98 +.b8 51 +.b8 112 +.b8 101 +.b8 119 +.b8 104 +.b8 114 +.b8 118 +.b8 113 +.b8 109 +.b8 105 +.b8 101 +.b8 114 +.b8 103 +.b8 102 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 115 +.b8 121 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x24 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 111 +.b8 105 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 115 +.b8 105 +.b8 108 +.b8 117 +.b8 95 +.b8 115 +.b8 112 +.b8 108 +.b8 105 +.b8 116 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x108:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x11d:0x18 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 28 // DW_AT_call_line +.b8 22 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.source b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.source new file mode 100644 index 0000000000000000000000000000000000000000..951df97979069edd2b68e5fa5cfca0b583d64891 --- /dev/null +++ b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.source @@ -0,0 +1,126 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":18:0) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":49:0) +#loc32 = loc("in_ptr0"(#loc)) +#loc33 = loc("out_ptr0"(#loc)) +#loc34 = loc("xnumel"(#loc)) +#loc56 = loc("x"(#loc25)) +module { + tt.func public @triton_poi_fused_mul_silu_split_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc35) + %xoffset_0 = arith.constant 1024 : i32 loc(#loc36) + %xoffset_1 = arith.constant 1024 : i32 loc(#loc36) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc36) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc37) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<1024xi32> loc(#loc38) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<1024xi32> loc(#loc38) + %xmask = arith.constant true loc(#loc39) + %xmask_5 = arith.constant dense : tensor<1024xi1> loc(#loc39) + %x0 = arith.constant 12288 : i32 loc(#loc40) + %x0_6 = arith.constant 12288 : i32 loc(#loc40) + %x0_7 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc40) + %x0_8 = arith.remsi %xindex_4, %x0_7 : tensor<1024xi32> loc(#loc40) + %x1 = arith.constant 12288 : i32 loc(#loc41) + %x1_9 = arith.constant 12288 : i32 loc(#loc41) + %x1_10 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc41) + %x1_11 = arith.divsi %xindex_4, %x1_10 : tensor<1024xi32> loc(#loc41) + %tmp0 = arith.constant 24576 : i32 loc(#loc42) + %tmp0_12 = arith.constant 24576 : i32 loc(#loc42) + %tmp0_13 = arith.constant dense<24576> : tensor<1024xi32> loc(#loc42) + %tmp0_14 = arith.muli %tmp0_13, %x1_11 : tensor<1024xi32> loc(#loc42) + %tmp0_15 = arith.addi %x0_8, %tmp0_14 : tensor<1024xi32> loc(#loc43) + %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc44) + %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc44) + %tmp0_18 = tt.load %tmp0_17 : tensor<1024x!tt.ptr> loc(#loc45) + %tmp0_19 = arith.extf %tmp0_18 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc46) + %tmp5 = arith.constant 12288 : i32 loc(#loc47) + %tmp5_20 = arith.constant 12288 : i32 loc(#loc47) + %tmp5_21 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc47) + %tmp5_22 = arith.addi %tmp5_21, %x0_8 : tensor<1024xi32> loc(#loc47) + %tmp5_23 = arith.constant 24576 : i32 loc(#loc48) + %tmp5_24 = arith.constant 24576 : i32 loc(#loc48) + %tmp5_25 = arith.constant dense<24576> : tensor<1024xi32> loc(#loc48) + %tmp5_26 = arith.muli %tmp5_25, %x1_11 : tensor<1024xi32> loc(#loc48) + %tmp5_27 = arith.addi %tmp5_22, %tmp5_26 : tensor<1024xi32> loc(#loc49) + %tmp5_28 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc50) + %tmp5_29 = tt.addptr %tmp5_28, %tmp5_27 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc50) + %tmp5_30 = tt.load %tmp5_29 : tensor<1024x!tt.ptr> loc(#loc51) + %tmp5_31 = arith.extf %tmp5_30 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc52) + %tmp2 = tt.call @triton.language.standard.sigmoid__fp32S1024S__(%tmp0_19) : (tensor<1024xf32>) -> tensor<1024xf32> loc(#loc53) + %tmp3 = arith.mulf %tmp0_19, %tmp2 : tensor<1024xf32> loc(#loc54) + %tmp6 = arith.mulf %tmp3, %tmp5_31 : tensor<1024xf32> loc(#loc55) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc22) + %1 = tt.addptr %0, %xindex_4 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc22) + %2 = arith.truncf %tmp6 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc23) + tt.store %1, %2 : tensor<1024x!tt.ptr> loc(#loc23) + tt.return loc(#loc24) + } loc(#loc) + tt.func private @triton.language.standard.sigmoid__fp32S1024S__(%x: tensor<1024xf32> loc("x"(#loc25))) -> tensor<1024xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc26) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc26) + %0 = arith.subf %cst_0, %x : tensor<1024xf32> loc(#loc26) + %1 = math.exp %0 : tensor<1024xf32> loc(#loc27) + %c1_i32 = arith.constant 1 : i32 loc(#loc28) + %cst_1 = arith.constant 1.000000e+00 : f32 loc(#loc28) + %cst_2 = arith.constant dense<1.000000e+00> : tensor<1024xf32> loc(#loc28) + %2 = arith.addf %cst_2, %1 : tensor<1024xf32> loc(#loc28) + %c1_i32_3 = arith.constant 1 : i32 loc(#loc29) + %cst_4 = arith.constant 1.000000e+00 : f32 loc(#loc29) + %cst_5 = arith.constant dense<1.000000e+00> : tensor<1024xf32> loc(#loc29) + %3 = arith.divf %cst_5, %2 : tensor<1024xf32> loc(#loc29) + tt.return %3 : tensor<1024xf32> loc(#loc30) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1024xf32> loc(#loc31) + tt.return %4 : tensor<1024xf32> loc(#loc31) + } loc(#loc25) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":19:28) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":19:33) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":20:36) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":20:23) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":21:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":22:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":23:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:41) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:35) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:30) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:46) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:55) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:38) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:49) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:43) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:30) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:54) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:63) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":28:22) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":29:18) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":31:18) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:25) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:36) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:4) +#loc26 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30) +#loc27 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29) +#loc28 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20) +#loc29 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16) +#loc30 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:11) +#loc31 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:4) +#loc35 = loc("xoffset"(#loc1)) +#loc36 = loc("xoffset"(#loc2)) +#loc37 = loc("xindex"(#loc3)) +#loc38 = loc("xindex"(#loc4)) +#loc39 = loc("xmask"(#loc5)) +#loc40 = loc("x0"(#loc6)) +#loc41 = loc("x1"(#loc7)) +#loc42 = loc("tmp0"(#loc8)) +#loc43 = loc("tmp0"(#loc9)) +#loc44 = loc("tmp0"(#loc10)) +#loc45 = loc("tmp0"(#loc11)) +#loc46 = loc("tmp0"(#loc12)) +#loc47 = loc("tmp5"(#loc13)) +#loc48 = loc("tmp5"(#loc14)) +#loc49 = loc("tmp5"(#loc15)) +#loc50 = loc("tmp5"(#loc16)) +#loc51 = loc("tmp5"(#loc17)) +#loc52 = loc("tmp5"(#loc18)) +#loc53 = loc("tmp2"(#loc19)) +#loc54 = loc("tmp3"(#loc20)) +#loc55 = loc("tmp6"(#loc21)) diff --git a/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.ttgir b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..548225d3ff1d57c11706f8a5caeb149b792e4c90 --- /dev/null +++ b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.ttgir @@ -0,0 +1,93 @@ +#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":18:0) +#loc28 = loc("in_ptr0"(#loc)) +#loc29 = loc("out_ptr0"(#loc)) +#loc30 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_mul_silu_split_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<24576> : tensor<1024xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<12288> : tensor<1024xi32, #blocked> loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<1.000000e+00> : tensor<1024xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc31) + %xoffset_3 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc32) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc33) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32, #blocked> loc(#loc34) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32, #blocked> loc(#loc34) + %x0 = arith.remsi %xindex_5, %cst_0 : tensor<1024xi32, #blocked> loc(#loc35) + %x1 = arith.divsi %xindex_5, %cst_0 : tensor<1024xi32, #blocked> loc(#loc36) + %tmp0 = arith.muli %x1, %cst : tensor<1024xi32, #blocked> loc(#loc37) + %tmp0_6 = arith.addi %x0, %tmp0 : tensor<1024xi32, #blocked> loc(#loc38) + %tmp0_7 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc39) + %tmp0_8 = tt.addptr %tmp0_7, %tmp0_6 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc39) + %tmp0_9 = tt.load %tmp0_8 : tensor<1024x!tt.ptr, #blocked> loc(#loc40) + %tmp0_10 = arith.extf %tmp0_9 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc41) + %tmp5 = arith.addi %x0, %cst_0 : tensor<1024xi32, #blocked> loc(#loc42) + %tmp5_11 = arith.addi %tmp5, %tmp0 : tensor<1024xi32, #blocked> loc(#loc43) + %tmp5_12 = tt.addptr %tmp0_7, %tmp5_11 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc44) + %tmp5_13 = tt.load %tmp5_12 : tensor<1024x!tt.ptr, #blocked> loc(#loc45) + %tmp5_14 = arith.extf %tmp5_13 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc46) + %tmp2 = arith.subf %cst_1, %tmp0_10 : tensor<1024xf32, #blocked> loc(#loc50) + %tmp2_15 = math.exp %tmp2 : tensor<1024xf32, #blocked> loc(#loc51) + %tmp2_16 = arith.addf %tmp2_15, %cst_2 : tensor<1024xf32, #blocked> loc(#loc52) + %tmp2_17 = arith.divf %cst_2, %tmp2_16 : tensor<1024xf32, #blocked> loc(#loc53) + %tmp3 = arith.mulf %tmp0_10, %tmp2_17 : tensor<1024xf32, #blocked> loc(#loc48) + %tmp6 = arith.mulf %tmp3, %tmp5_14 : tensor<1024xf32, #blocked> loc(#loc49) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc25) + %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc25) + %2 = arith.truncf %tmp6 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> loc(#loc26) + tt.store %1, %2 : tensor<1024x!tt.ptr, #blocked> loc(#loc26) + tt.return loc(#loc27) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":19:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":19:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":20:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":20:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":22:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":23:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:41) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:35) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:30) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:46) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:55) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:38) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:30) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:54) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:63) +#loc18 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":28:22) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29) +#loc21 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":29:18) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":31:18) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:25) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:36) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:4) +#loc31 = loc("xoffset"(#loc2)) +#loc32 = loc("xoffset"(#loc3)) +#loc33 = loc("xindex"(#loc4)) +#loc34 = loc("xindex"(#loc5)) +#loc35 = loc("x0"(#loc6)) +#loc36 = loc("x1"(#loc7)) +#loc37 = loc("tmp0"(#loc8)) +#loc38 = loc("tmp0"(#loc9)) +#loc39 = loc("tmp0"(#loc10)) +#loc40 = loc("tmp0"(#loc11)) +#loc41 = loc("tmp0"(#loc12)) +#loc42 = loc("tmp5"(#loc13)) +#loc43 = loc("tmp5"(#loc14)) +#loc44 = loc("tmp5"(#loc15)) +#loc45 = loc("tmp5"(#loc16)) +#loc46 = loc("tmp5"(#loc17)) +#loc47 = loc("tmp2"(#loc19)) +#loc48 = loc("tmp3"(#loc23)) +#loc49 = loc("tmp6"(#loc24)) +#loc50 = loc(callsite(#loc18 at #loc47)) +#loc51 = loc(callsite(#loc20 at #loc47)) +#loc52 = loc(callsite(#loc21 at #loc47)) +#loc53 = loc(callsite(#loc22 at #loc47)) diff --git a/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.ttir b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..29f3198e2e9a06f7ace4d791ebabf6c968a89561 --- /dev/null +++ b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.ttir @@ -0,0 +1,93 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":18:0) +#loc28 = loc("in_ptr0"(#loc)) +#loc29 = loc("out_ptr0"(#loc)) +#loc30 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_mul_silu_split_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %tmp2 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc50) + %tmp2_0 = arith.constant dense<1.000000e+00> : tensor<1024xf32> loc(#loc51) + %cst = arith.constant dense<24576> : tensor<1024xi32> loc(#loc3) + %cst_1 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc3) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc3) + %xoffset = tt.get_program_id x : i32 loc(#loc32) + %xoffset_2 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc33) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc34) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<1024xi32> loc(#loc35) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<1024xi32> loc(#loc35) + %x0 = arith.remsi %xindex_4, %cst_1 : tensor<1024xi32> loc(#loc36) + %x1 = arith.divsi %xindex_4, %cst_1 : tensor<1024xi32> loc(#loc37) + %tmp0 = arith.muli %x1, %cst : tensor<1024xi32> loc(#loc38) + %tmp0_5 = arith.addi %x0, %tmp0 : tensor<1024xi32> loc(#loc39) + %tmp0_6 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc40) + %tmp0_7 = tt.addptr %tmp0_6, %tmp0_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc40) + %tmp0_8 = tt.load %tmp0_7 : tensor<1024x!tt.ptr> loc(#loc41) + %tmp0_9 = arith.extf %tmp0_8 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc42) + %tmp5 = arith.addi %x0, %cst_1 : tensor<1024xi32> loc(#loc43) + %tmp5_10 = arith.addi %tmp5, %tmp0 : tensor<1024xi32> loc(#loc44) + %tmp5_11 = tt.addptr %tmp0_6, %tmp5_10 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc45) + %tmp5_12 = tt.load %tmp5_11 : tensor<1024x!tt.ptr> loc(#loc46) + %tmp5_13 = arith.extf %tmp5_12 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc47) + %tmp2_14 = arith.subf %tmp2, %tmp0_9 : tensor<1024xf32> loc(#loc50) + %tmp2_15 = math.exp %tmp2_14 : tensor<1024xf32> loc(#loc52) + %tmp2_16 = arith.addf %tmp2_15, %tmp2_0 : tensor<1024xf32> loc(#loc53) + %tmp2_17 = arith.divf %tmp2_0, %tmp2_16 : tensor<1024xf32> loc(#loc54) + %tmp3 = arith.mulf %tmp0_9, %tmp2_17 : tensor<1024xf32> loc(#loc48) + %tmp6 = arith.mulf %tmp3, %tmp5_13 : tensor<1024xf32> loc(#loc49) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc25) + %1 = tt.addptr %0, %xindex_4 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc25) + %2 = arith.truncf %tmp6 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc26) + tt.store %1, %2 : tensor<1024x!tt.ptr> loc(#loc26) + tt.return loc(#loc27) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":28:22) +#loc3 = loc(unknown) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":19:28) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":19:33) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":20:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":20:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":22:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":23:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:41) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:30) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:46) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:55) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:38) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:43) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:30) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:54) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:63) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29) +#loc21 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":29:18) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":31:18) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:25) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:36) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:4) +#loc31 = loc("tmp2"(#loc2)) +#loc32 = loc("xoffset"(#loc4)) +#loc33 = loc("xoffset"(#loc5)) +#loc34 = loc("xindex"(#loc6)) +#loc35 = loc("xindex"(#loc7)) +#loc36 = loc("x0"(#loc8)) +#loc37 = loc("x1"(#loc9)) +#loc38 = loc("tmp0"(#loc10)) +#loc39 = loc("tmp0"(#loc11)) +#loc40 = loc("tmp0"(#loc12)) +#loc41 = loc("tmp0"(#loc13)) +#loc42 = loc("tmp0"(#loc14)) +#loc43 = loc("tmp5"(#loc15)) +#loc44 = loc("tmp5"(#loc16)) +#loc45 = loc("tmp5"(#loc17)) +#loc46 = loc("tmp5"(#loc18)) +#loc47 = loc("tmp5"(#loc19)) +#loc48 = loc("tmp3"(#loc23)) +#loc49 = loc("tmp6"(#loc24)) +#loc50 = loc(callsite(#loc1 at #loc31)) +#loc51 = loc(callsite(#loc3 at #loc31)) +#loc52 = loc(callsite(#loc20 at #loc31)) +#loc53 = loc(callsite(#loc21 at #loc31)) +#loc54 = loc(callsite(#loc22 at #loc31)) diff --git a/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/__grp__triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/__grp__triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a28a49c96135d2b07c5cd24c1d5f8163df36d065 --- /dev/null +++ b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/__grp__triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.source", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttir", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttgir", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.llir", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ptx", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.cubin", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json"}} \ No newline at end of file diff --git a/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.cubin b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..6d1b1d4346a1ea336d93d649d1c97c5a552d1f27 Binary files /dev/null and b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.cubin differ diff --git a/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json new file mode 100644 index 0000000000000000000000000000000000000000..199f6604b0d27bb528577e43eddfcb99a27727a8 --- /dev/null +++ b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json @@ -0,0 +1 @@ +{"hash": "f8131c6b8caf67e5ea2bfae0e789f70c79b5f1eae13909e1ca197e40365679ca", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0"} \ No newline at end of file diff --git a/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.llir b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..614b3f766cc217d39ae9e2922c1a0f72d0fad00f --- /dev/null +++ b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.llir @@ -0,0 +1,67 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %7 = shl i32 %6, 9, !dbg !8 + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %9 = shl nuw nsw i32 %8, 1, !dbg !9 + %10 = and i32 %9, 510, !dbg !9 + %11 = or disjoint i32 %10, %7, !dbg !10 + %12 = sdiv i32 %11, 128, !dbg !11 + %13 = mul i32 %12, 128, !dbg !12 + %.decomposed = sub i32 %11, %13, !dbg !12 + %14 = srem i32 %12, 2304, !dbg !13 + %15 = sdiv i32 %11, 294912, !dbg !14 + %16 = shl nsw i32 %15, 7, !dbg !15 + %17 = add nsw i32 %16, %.decomposed, !dbg !16 + %18 = shl nsw i32 %14, 12, !dbg !17 + %19 = add nsw i32 %17, %18, !dbg !18 + %20 = sext i32 %19 to i64, !dbg !19 + %21 = getelementptr bfloat, ptr addrspace(1) %0, i64 %20, !dbg !19 + %22 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %21) #2, !dbg !20 + %23 = sext i32 %11 to i64, !dbg !21 + %24 = getelementptr bfloat, ptr addrspace(1) %1, i64 %23, !dbg !21 + tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %22, ptr addrspace(1) %24) #2, !dbg !22 + ret void, !dbg !23 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0", linkageName: "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 21, scope: !4) +!12 = !DILocation(line: 23, column: 19, scope: !4) +!13 = !DILocation(line: 24, column: 28, scope: !4) +!14 = !DILocation(line: 25, column: 19, scope: !4) +!15 = !DILocation(line: 27, column: 39, scope: !4) +!16 = !DILocation(line: 27, column: 35, scope: !4) +!17 = !DILocation(line: 27, column: 49, scope: !4) +!18 = !DILocation(line: 27, column: 44, scope: !4) +!19 = !DILocation(line: 27, column: 30, scope: !4) +!20 = !DILocation(line: 27, column: 54, scope: !4) +!21 = !DILocation(line: 28, column: 25, scope: !4) +!22 = !DILocation(line: 28, column: 36, scope: !4) +!23 = !DILocation(line: 28, column: 4, scope: !4) diff --git a/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ptx b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..e16565cd6ba1d8b88853d9005fd2702cef131fc8 --- /dev/null +++ b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ptx @@ -0,0 +1,329 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0 // -- Begin function triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0 + // @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0 +.visible .entry triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0( + .param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_1, + .param .u32 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_4 +) +.reqntid 256 +{ + .reg .b32 %r<28>; + .reg .b64 %rd<5>; + .loc 1 18 0 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:18:0 +$L__func_begin0: + .loc 1 18 0 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:18:0 + +// %bb.0: + ld.param.b64 %rd3, [triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_0]; + ld.param.b64 %rd4, [triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_1]; +$L__tmp0: + .loc 1 20 28 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:20:28 + mov.u32 %r2, %ctaid.x; + .loc 1 20 33 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:20:33 + shl.b32 %r3, %r2, 9; + .loc 1 21 36 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:21:36 + mov.u32 %r4, %tid.x; + shl.b32 %r5, %r4, 1; + and.b32 %r6, %r5, 510; + .loc 1 21 23 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:21:23 + or.b32 %r7, %r6, %r3; + .loc 1 24 21 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:24:21 + bfe.s32 %r8, %r2, 22, 1; + shr.u32 %r9, %r8, 25; + add.s32 %r10, %r7, %r9; + shr.s32 %r11, %r10, 7; + .loc 1 23 19 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:23:19 + and.b32 %r12, %r10, -128; + sub.s32 %r13, %r7, %r12; + .loc 1 24 28 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:24:28 + mul.hi.s32 %r14, %r11, 954437177; + shr.u32 %r15, %r14, 31; + shr.u32 %r16, %r14, 9; + add.s32 %r17, %r16, %r15; + mul.lo.s32 %r18, %r17, 2304; + sub.s32 %r19, %r11, %r18; + .loc 1 25 19 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:25:19 + mul.hi.s32 %r20, %r7, 954437177; + shr.u32 %r21, %r20, 31; + shr.s32 %r22, %r20, 16; + add.s32 %r23, %r22, %r21; + .loc 1 27 39 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:27:39 + shl.b32 %r24, %r23, 7; + .loc 1 27 35 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:27:35 + add.s32 %r25, %r24, %r13; + .loc 1 27 49 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:27:49 + shl.b32 %r26, %r19, 12; + .loc 1 27 44 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:27:44 + add.s32 %r27, %r25, %r26; + .loc 1 27 30 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:27:30 + mad.wide.s32 %rd1, %r27, 2, %rd3; + .loc 1 27 54 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:27:54 + // begin inline asm + mov.u32 %r1, 0x0; + ld.global.b32 { %r1 }, [ %rd1 + 0 ]; + // end inline asm + .loc 1 28 25 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:28:25 + mad.wide.s32 %rd2, %r7, 2, %rd4; + .loc 1 28 36 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:28:36 + // begin inline asm + st.global.b32 [ %rd2 + 0 ], { %r1 }; + // end inline asm + .loc 1 28 4 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:28:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 101 +.b8 105 +.b8 112 +.b8 106 +.b8 120 +.b8 97 +.b8 117 +.b8 115 +.b8 97 +.b8 117 +.b8 122 +.b8 108 +.b8 52 +.b8 109 +.b8 99 +.b8 99 +.b8 50 +.b8 51 +.b8 51 +.b8 102 +.b8 112 +.b8 101 +.b8 117 +.b8 98 +.b8 102 +.b8 115 +.b8 51 +.b8 117 +.b8 107 +.b8 53 +.b8 110 +.b8 105 +.b8 53 +.b8 98 +.b8 106 +.b8 113 +.b8 98 +.b8 108 +.b8 50 +.b8 113 +.b8 119 +.b8 116 +.b8 111 +.b8 119 +.b8 106 +.b8 119 +.b8 114 +.b8 108 +.b8 55 +.b8 99 +.b8 100 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 101 +.b8 105 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.source b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.source new file mode 100644 index 0000000000000000000000000000000000000000..9fa878988fb486d9c354c976b2e285d4281da372 --- /dev/null +++ b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.source @@ -0,0 +1,90 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":18:0) +#loc21 = loc("in_ptr0"(#loc)) +#loc22 = loc("out_ptr0"(#loc)) +#loc23 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 9437184 : i32 loc(#loc24) + %xoffset = tt.get_program_id x : i32 loc(#loc25) + %xoffset_1 = arith.constant 512 : i32 loc(#loc26) + %xoffset_2 = arith.constant 512 : i32 loc(#loc26) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc26) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc27) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc28) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc28) + %xmask = arith.constant true loc(#loc29) + %xmask_6 = arith.constant dense : tensor<512xi1> loc(#loc29) + %x0 = arith.constant 128 : i32 loc(#loc30) + %x0_7 = arith.constant 128 : i32 loc(#loc30) + %x0_8 = arith.constant dense<128> : tensor<512xi32> loc(#loc30) + %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<512xi32> loc(#loc30) + %x1 = arith.constant 128 : i32 loc(#loc31) + %x1_10 = arith.constant 128 : i32 loc(#loc31) + %x1_11 = arith.constant dense<128> : tensor<512xi32> loc(#loc31) + %x1_12 = arith.divsi %xindex_5, %x1_11 : tensor<512xi32> loc(#loc31) + %x1_13 = arith.constant 2304 : i32 loc(#loc32) + %x1_14 = arith.constant 2304 : i32 loc(#loc32) + %x1_15 = arith.constant dense<2304> : tensor<512xi32> loc(#loc32) + %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<512xi32> loc(#loc32) + %x2 = arith.constant 294912 : i32 loc(#loc33) + %x2_17 = arith.constant 294912 : i32 loc(#loc33) + %x2_18 = arith.constant dense<294912> : tensor<512xi32> loc(#loc33) + %x2_19 = arith.divsi %xindex_5, %x2_18 : tensor<512xi32> loc(#loc33) + %tmp0 = arith.constant 128 : i32 loc(#loc34) + %tmp0_20 = arith.constant 128 : i32 loc(#loc34) + %tmp0_21 = arith.constant dense<128> : tensor<512xi32> loc(#loc34) + %tmp0_22 = arith.muli %tmp0_21, %x2_19 : tensor<512xi32> loc(#loc34) + %tmp0_23 = arith.addi %x0_9, %tmp0_22 : tensor<512xi32> loc(#loc35) + %tmp0_24 = arith.constant 4096 : i32 loc(#loc36) + %tmp0_25 = arith.constant 4096 : i32 loc(#loc36) + %tmp0_26 = arith.constant dense<4096> : tensor<512xi32> loc(#loc36) + %tmp0_27 = arith.muli %tmp0_26, %x1_16 : tensor<512xi32> loc(#loc36) + %tmp0_28 = arith.addi %tmp0_23, %tmp0_27 : tensor<512xi32> loc(#loc37) + %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc38) + %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc38) + %tmp0_31 = tt.load %tmp0_30 : tensor<512x!tt.ptr> loc(#loc39) + %tmp0_32 = arith.extf %tmp0_31 : tensor<512xbf16> to tensor<512xf32> loc(#loc40) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc18) + %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc18) + %2 = arith.truncf %tmp0_32 : tensor<512xf32> to tensor<512xbf16> loc(#loc19) + tt.store %1, %2 : tensor<512x!tt.ptr> loc(#loc19) + tt.return loc(#loc20) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":22:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":23:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":24:21) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":24:28) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":25:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:39) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:35) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:49) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:44) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:30) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:54) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:63) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:25) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:36) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:4) +#loc24 = loc("xnumel"(#loc1)) +#loc25 = loc("xoffset"(#loc2)) +#loc26 = loc("xoffset"(#loc3)) +#loc27 = loc("xindex"(#loc4)) +#loc28 = loc("xindex"(#loc5)) +#loc29 = loc("xmask"(#loc6)) +#loc30 = loc("x0"(#loc7)) +#loc31 = loc("x1"(#loc8)) +#loc32 = loc("x1"(#loc9)) +#loc33 = loc("x2"(#loc10)) +#loc34 = loc("tmp0"(#loc11)) +#loc35 = loc("tmp0"(#loc12)) +#loc36 = loc("tmp0"(#loc13)) +#loc37 = loc("tmp0"(#loc14)) +#loc38 = loc("tmp0"(#loc15)) +#loc39 = loc("tmp0"(#loc16)) +#loc40 = loc("tmp0"(#loc17)) diff --git a/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttgir b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..ebdddc82dda054c40857372ef4f61d3bdf535988 --- /dev/null +++ b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttgir @@ -0,0 +1,66 @@ +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":18:0) +#loc19 = loc("in_ptr0"(#loc)) +#loc20 = loc("out_ptr0"(#loc)) +#loc21 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<512xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<2304> : tensor<512xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<294912> : tensor<512xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<4096> : tensor<512xi32, #blocked> loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc22) + %xoffset_3 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc23) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc24) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32, #blocked> loc(#loc25) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32, #blocked> loc(#loc25) + %x0 = arith.remsi %xindex_5, %cst : tensor<512xi32, #blocked> loc(#loc26) + %x1 = arith.divsi %xindex_5, %cst : tensor<512xi32, #blocked> loc(#loc27) + %x1_6 = arith.remsi %x1, %cst_0 : tensor<512xi32, #blocked> loc(#loc28) + %x2 = arith.divsi %xindex_5, %cst_1 : tensor<512xi32, #blocked> loc(#loc29) + %tmp0 = arith.muli %x2, %cst : tensor<512xi32, #blocked> loc(#loc30) + %tmp0_7 = arith.addi %x0, %tmp0 : tensor<512xi32, #blocked> loc(#loc31) + %tmp0_8 = arith.muli %x1_6, %cst_2 : tensor<512xi32, #blocked> loc(#loc32) + %tmp0_9 = arith.addi %tmp0_7, %tmp0_8 : tensor<512xi32, #blocked> loc(#loc33) + %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc34) + %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc34) + %tmp0_12 = tt.load %tmp0_11 : tensor<512x!tt.ptr, #blocked> loc(#loc35) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc16) + %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc16) + tt.store %1, %tmp0_12 : tensor<512x!tt.ptr, #blocked> loc(#loc17) + tt.return loc(#loc18) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":23:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":24:21) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":24:28) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":25:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:39) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:49) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:44) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:54) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:25) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:36) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:4) +#loc22 = loc("xoffset"(#loc2)) +#loc23 = loc("xoffset"(#loc3)) +#loc24 = loc("xindex"(#loc4)) +#loc25 = loc("xindex"(#loc5)) +#loc26 = loc("x0"(#loc6)) +#loc27 = loc("x1"(#loc7)) +#loc28 = loc("x1"(#loc8)) +#loc29 = loc("x2"(#loc9)) +#loc30 = loc("tmp0"(#loc10)) +#loc31 = loc("tmp0"(#loc11)) +#loc32 = loc("tmp0"(#loc12)) +#loc33 = loc("tmp0"(#loc13)) +#loc34 = loc("tmp0"(#loc14)) +#loc35 = loc("tmp0"(#loc15)) diff --git a/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttir b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..400988e57e1e7ffec85dde64c8b81715c4cde48a --- /dev/null +++ b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttir @@ -0,0 +1,65 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":18:0) +#loc19 = loc("in_ptr0"(#loc)) +#loc20 = loc("out_ptr0"(#loc)) +#loc21 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %tmp0 = arith.constant dense<4096> : tensor<512xi32> loc(#loc22) + %x2 = arith.constant dense<294912> : tensor<512xi32> loc(#loc23) + %x1 = arith.constant dense<2304> : tensor<512xi32> loc(#loc24) + %cst = arith.constant dense<128> : tensor<512xi32> loc(#loc4) + %c512_i32 = arith.constant 512 : i32 loc(#loc4) + %xoffset = tt.get_program_id x : i32 loc(#loc25) + %xoffset_0 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc26) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc27) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<512xi32> loc(#loc28) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<512xi32> loc(#loc28) + %x0 = arith.remsi %xindex_2, %cst : tensor<512xi32> loc(#loc29) + %x1_3 = arith.divsi %xindex_2, %cst : tensor<512xi32> loc(#loc30) + %x1_4 = arith.remsi %x1_3, %x1 : tensor<512xi32> loc(#loc24) + %x2_5 = arith.divsi %xindex_2, %x2 : tensor<512xi32> loc(#loc23) + %tmp0_6 = arith.muli %x2_5, %cst : tensor<512xi32> loc(#loc31) + %tmp0_7 = arith.addi %x0, %tmp0_6 : tensor<512xi32> loc(#loc32) + %tmp0_8 = arith.muli %x1_4, %tmp0 : tensor<512xi32> loc(#loc22) + %tmp0_9 = arith.addi %tmp0_7, %tmp0_8 : tensor<512xi32> loc(#loc33) + %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc34) + %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc34) + %tmp0_12 = tt.load %tmp0_11 : tensor<512x!tt.ptr> loc(#loc35) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc16) + %1 = tt.addptr %0, %xindex_2 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc16) + tt.store %1, %tmp0_12 : tensor<512x!tt.ptr> loc(#loc17) + tt.return loc(#loc18) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:49) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":25:19) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":24:28) +#loc4 = loc(unknown) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":20:28) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":20:33) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":21:36) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":21:23) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":23:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":24:21) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:39) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:35) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:44) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:54) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:25) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:36) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:4) +#loc22 = loc("tmp0"(#loc1)) +#loc23 = loc("x2"(#loc2)) +#loc24 = loc("x1"(#loc3)) +#loc25 = loc("xoffset"(#loc5)) +#loc26 = loc("xoffset"(#loc6)) +#loc27 = loc("xindex"(#loc7)) +#loc28 = loc("xindex"(#loc8)) +#loc29 = loc("x0"(#loc9)) +#loc30 = loc("x1"(#loc10)) +#loc31 = loc("tmp0"(#loc11)) +#loc32 = loc("tmp0"(#loc12)) +#loc33 = loc("tmp0"(#loc13)) +#loc34 = loc("tmp0"(#loc14)) +#loc35 = loc("tmp0"(#loc15)) diff --git a/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ada7c60ba3110e1e644c1f3bf8d906d795caf3cf --- /dev/null +++ b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_add_mul_native_layer_norm_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.source", "triton_red_fused_add_mul_native_layer_norm_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.ttir", "triton_red_fused_add_mul_native_layer_norm_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir", "triton_red_fused_add_mul_native_layer_norm_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.llir", "triton_red_fused_add_mul_native_layer_norm_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.ptx", "triton_red_fused_add_mul_native_layer_norm_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.cubin", "triton_red_fused_add_mul_native_layer_norm_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.json"}} \ No newline at end of file diff --git a/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.cubin b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..1683dae7a0d86b1c7eb37340f3816361e332c0a6 Binary files /dev/null and b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.cubin differ diff --git a/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.json b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4ad91ec628268b4ef21b73c7aab02c148c72ad08 --- /dev/null +++ b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.json @@ -0,0 +1 @@ +{"hash": "fbdc4598492d1c558bd33291606df1a4f6cc22e374a61e71948b2104d91b5ee3", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 192, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_add_mul_native_layer_norm_0"} \ No newline at end of file diff --git a/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.llir b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..c91141e4af47dfed7c5b5ddda48f271c80b40f54 --- /dev/null +++ b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.llir @@ -0,0 +1,565 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_add_mul_native_layer_norm_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !5 { +__nv_rsqrtf.exit: + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %9 = icmp samesign ult i32 %8, 2048, !dbg !9 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %11 = shl nuw nsw i32 %10, 2, !dbg !10 + %12 = and i32 %11, 2044, !dbg !10 + %13 = shl i32 %8, 12, !dbg !11 + %14 = or disjoint i32 %12, %13 + %15 = sext i32 %14 to i64, !dbg !12 + %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !13 + %17 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %18 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %16, i64 %17, i1 %9) #6, !dbg !14 + %19 = extractvalue { i32, i32 } %18, 1, !dbg !14 + %20 = bitcast i32 %19 to <2 x bfloat>, !dbg !14 + %21 = extractelement <2 x bfloat> %20, i64 1, !dbg !14 + %22 = fpext bfloat %21 to float, !dbg !15 + %23 = extractelement <2 x bfloat> %20, i64 0, !dbg !14 + %24 = fpext bfloat %23 to float, !dbg !15 + %25 = extractvalue { i32, i32 } %18, 0, !dbg !14 + %26 = bitcast i32 %25 to <2 x bfloat>, !dbg !14 + %27 = extractelement <2 x bfloat> %26, i64 1, !dbg !14 + %28 = fpext bfloat %27 to float, !dbg !15 + %29 = extractelement <2 x bfloat> %26, i64 0, !dbg !14 + %30 = fpext bfloat %29 to float, !dbg !15 + %31 = select i1 %9, float %30, float 0.000000e+00, !dbg !16 + %32 = select i1 %9, float %28, float 0.000000e+00, !dbg !16 + %33 = select i1 %9, float %24, float 0.000000e+00, !dbg !16 + %34 = select i1 %9, float %22, float 0.000000e+00, !dbg !16 + %35 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !13 + %36 = getelementptr i8, ptr addrspace(1) %35, i64 4096, !dbg !13 + %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %38 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %36, i64 %37, i1 %9) #6, !dbg !14 + %39 = extractvalue { i32, i32 } %38, 0, !dbg !14 + %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !14 + %41 = extractelement <2 x bfloat> %40, i64 0, !dbg !14 + %42 = fpext bfloat %41 to float, !dbg !15 + %43 = fsub float %42, %31, !dbg !17 + %44 = select i1 %9, float 2.000000e+00, float 1.000000e+00, !dbg !22 + %45 = tail call float @llvm.nvvm.div.full(float %43, float %44), !dbg !23 + %46 = fadd float %31, %45, !dbg !24 + %47 = fsub float %42, %46, !dbg !25 + %48 = fmul float %43, %47, !dbg !26 + %49 = fadd float %48, 0.000000e+00, !dbg !27 + %50 = extractelement <2 x bfloat> %40, i64 1, !dbg !14 + %51 = fpext bfloat %50 to float, !dbg !15 + %52 = fsub float %51, %32, !dbg !17 + %53 = tail call float @llvm.nvvm.div.full(float %52, float %44), !dbg !23 + %54 = fadd float %32, %53, !dbg !24 + %55 = fsub float %51, %54, !dbg !25 + %56 = fmul float %52, %55, !dbg !26 + %57 = fadd float %56, 0.000000e+00, !dbg !27 + %58 = extractvalue { i32, i32 } %38, 1, !dbg !14 + %59 = bitcast i32 %58 to <2 x bfloat>, !dbg !14 + %60 = extractelement <2 x bfloat> %59, i64 0, !dbg !14 + %61 = fpext bfloat %60 to float, !dbg !15 + %62 = fsub float %61, %33, !dbg !17 + %63 = tail call float @llvm.nvvm.div.full(float %62, float %44), !dbg !23 + %64 = fadd float %33, %63, !dbg !24 + %65 = fsub float %61, %64, !dbg !25 + %66 = fmul float %62, %65, !dbg !26 + %67 = fadd float %66, 0.000000e+00, !dbg !27 + %68 = extractelement <2 x bfloat> %59, i64 1, !dbg !14 + %69 = fpext bfloat %68 to float, !dbg !15 + %70 = fsub float %69, %34, !dbg !17 + %71 = tail call float @llvm.nvvm.div.full(float %70, float %44), !dbg !23 + %72 = fadd float %34, %71, !dbg !24 + %73 = fsub float %69, %72, !dbg !25 + %74 = fmul float %70, %73, !dbg !26 + %75 = fadd float %74, 0.000000e+00, !dbg !27 + %76 = select i1 %9, float %46, float 0.000000e+00, !dbg !16 + %77 = select i1 %9, float %54, float 0.000000e+00, !dbg !16 + %78 = select i1 %9, float %64, float 0.000000e+00, !dbg !16 + %79 = select i1 %9, float %72, float 0.000000e+00, !dbg !16 + %80 = select i1 %9, float %67, float 0.000000e+00, !dbg !28 + %81 = select i1 %9, float %75, float 0.000000e+00, !dbg !28 + %82 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !22 + %83 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !22 + %84 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !22 + %85 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !22 + %86 = and i32 %10, 511, !dbg !10 + %87 = and i32 %10, 31, !dbg !10 + %88 = lshr i32 %86, 5, !dbg !10 + %89 = fsub float %77, %76, !dbg !29 + %90 = select i1 %9, float 4.000000e+00, float 0.000000e+00, !dbg !32 + %91 = fcmp oeq float %90, 0.000000e+00, !dbg !33 + %92 = tail call float @llvm.nvvm.div.full(float %83, float %90), !dbg !34 + %93 = select i1 %91, float 0.000000e+00, float %92, !dbg !35 + %94 = fmul float %89, %93, !dbg !36 + %95 = fadd float %76, %94, !dbg !37 + %96 = fadd float %49, %57, !dbg !38 + %97 = select i1 %9, float %96, float 0.000000e+00, !dbg !38 + %98 = fmul float %89, %89, !dbg !39 + %99 = fmul float %98, %82, !dbg !40 + %100 = fmul float %99, %93, !dbg !41 + %101 = fadd float %97, %100, !dbg !42 + %102 = fsub float %78, %95, !dbg !29 + %103 = select i1 %9, float 6.000000e+00, float 0.000000e+00, !dbg !32 + %104 = fcmp oeq float %103, 0.000000e+00, !dbg !33 + %105 = tail call float @llvm.nvvm.div.full(float %84, float %103), !dbg !34 + %106 = select i1 %104, float 0.000000e+00, float %105, !dbg !35 + %107 = fmul float %106, %102, !dbg !36 + %108 = fadd float %95, %107, !dbg !37 + %109 = fadd float %80, %101, !dbg !38 + %110 = fmul float %102, %102, !dbg !39 + %111 = fmul float %90, %110, !dbg !40 + %112 = fmul float %106, %111, !dbg !41 + %113 = fadd float %109, %112, !dbg !42 + %114 = fsub float %79, %108, !dbg !29 + %115 = select i1 %9, float 8.000000e+00, float 0.000000e+00, !dbg !32 + %116 = fcmp oeq float %115, 0.000000e+00, !dbg !33 + %117 = tail call float @llvm.nvvm.div.full(float %85, float %115), !dbg !34 + %118 = select i1 %116, float 0.000000e+00, float %117, !dbg !35 + %119 = fmul float %118, %114, !dbg !36 + %120 = fadd float %108, %119, !dbg !37 + %121 = fadd float %81, %113, !dbg !38 + %122 = fmul float %114, %114, !dbg !39 + %123 = fmul float %103, %122, !dbg !40 + %124 = fmul float %118, %123, !dbg !41 + %125 = fadd float %121, %124, !dbg !42 + %126 = bitcast float %120 to i32, !dbg !30 + %127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 16, i32 31), !dbg !30 + %128 = bitcast i32 %127 to float, !dbg !30 + %129 = bitcast float %125 to i32, !dbg !30 + %130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 16, i32 31), !dbg !30 + %131 = bitcast i32 %130 to float, !dbg !30 + %132 = bitcast float %115 to i32, !dbg !30 + %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 16, i32 31), !dbg !30 + %134 = bitcast i32 %133 to float, !dbg !30 + %135 = fsub float %128, %120, !dbg !29 + %136 = fadd float %115, %134, !dbg !32 + %137 = fcmp oeq float %136, 0.000000e+00, !dbg !33 + %138 = tail call float @llvm.nvvm.div.full(float %134, float %136), !dbg !34 + %139 = select i1 %137, float 0.000000e+00, float %138, !dbg !35 + %140 = fmul float %139, %135, !dbg !36 + %141 = fadd float %120, %140, !dbg !37 + %142 = fadd float %125, %131, !dbg !38 + %143 = fmul float %135, %135, !dbg !39 + %144 = fmul float %115, %143, !dbg !40 + %145 = fmul float %139, %144, !dbg !41 + %146 = fadd float %142, %145, !dbg !42 + %147 = bitcast float %141 to i32, !dbg !30 + %148 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %147, i32 8, i32 31), !dbg !30 + %149 = bitcast i32 %148 to float, !dbg !30 + %150 = bitcast float %146 to i32, !dbg !30 + %151 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %150, i32 8, i32 31), !dbg !30 + %152 = bitcast i32 %151 to float, !dbg !30 + %153 = bitcast float %136 to i32, !dbg !30 + %154 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 8, i32 31), !dbg !30 + %155 = bitcast i32 %154 to float, !dbg !30 + %156 = fsub float %149, %141, !dbg !29 + %157 = fadd float %136, %155, !dbg !32 + %158 = fcmp oeq float %157, 0.000000e+00, !dbg !33 + %159 = tail call float @llvm.nvvm.div.full(float %155, float %157), !dbg !34 + %160 = select i1 %158, float 0.000000e+00, float %159, !dbg !35 + %161 = fmul float %156, %160, !dbg !36 + %162 = fadd float %141, %161, !dbg !37 + %163 = fadd float %146, %152, !dbg !38 + %164 = fmul float %156, %156, !dbg !39 + %165 = fmul float %136, %164, !dbg !40 + %166 = fmul float %160, %165, !dbg !41 + %167 = fadd float %163, %166, !dbg !42 + %168 = bitcast float %162 to i32, !dbg !30 + %169 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %168, i32 4, i32 31), !dbg !30 + %170 = bitcast i32 %169 to float, !dbg !30 + %171 = bitcast float %167 to i32, !dbg !30 + %172 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %171, i32 4, i32 31), !dbg !30 + %173 = bitcast i32 %172 to float, !dbg !30 + %174 = bitcast float %157 to i32, !dbg !30 + %175 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %174, i32 4, i32 31), !dbg !30 + %176 = bitcast i32 %175 to float, !dbg !30 + %177 = fsub float %170, %162, !dbg !29 + %178 = fadd float %157, %176, !dbg !32 + %179 = fcmp oeq float %178, 0.000000e+00, !dbg !33 + %180 = tail call float @llvm.nvvm.div.full(float %176, float %178), !dbg !34 + %181 = select i1 %179, float 0.000000e+00, float %180, !dbg !35 + %182 = fmul float %177, %181, !dbg !36 + %183 = fadd float %162, %182, !dbg !37 + %184 = fadd float %167, %173, !dbg !38 + %185 = fmul float %177, %177, !dbg !39 + %186 = fmul float %157, %185, !dbg !40 + %187 = fmul float %181, %186, !dbg !41 + %188 = fadd float %184, %187, !dbg !42 + %189 = bitcast float %183 to i32, !dbg !30 + %190 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %189, i32 2, i32 31), !dbg !30 + %191 = bitcast i32 %190 to float, !dbg !30 + %192 = bitcast float %188 to i32, !dbg !30 + %193 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %192, i32 2, i32 31), !dbg !30 + %194 = bitcast i32 %193 to float, !dbg !30 + %195 = bitcast float %178 to i32, !dbg !30 + %196 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %195, i32 2, i32 31), !dbg !30 + %197 = bitcast i32 %196 to float, !dbg !30 + %198 = fsub float %191, %183, !dbg !29 + %199 = fadd float %178, %197, !dbg !32 + %200 = fcmp oeq float %199, 0.000000e+00, !dbg !33 + %201 = tail call float @llvm.nvvm.div.full(float %197, float %199), !dbg !34 + %202 = select i1 %200, float 0.000000e+00, float %201, !dbg !35 + %203 = fmul float %198, %202, !dbg !36 + %204 = fadd float %183, %203, !dbg !37 + %205 = fadd float %188, %194, !dbg !38 + %206 = fmul float %198, %198, !dbg !39 + %207 = fmul float %178, %206, !dbg !40 + %208 = fmul float %202, %207, !dbg !41 + %209 = fadd float %205, %208, !dbg !42 + %210 = bitcast float %204 to i32, !dbg !30 + %211 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %210, i32 1, i32 31), !dbg !30 + %212 = bitcast i32 %211 to float, !dbg !30 + %213 = bitcast float %209 to i32, !dbg !30 + %214 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %213, i32 1, i32 31), !dbg !30 + %215 = bitcast i32 %214 to float, !dbg !30 + %216 = bitcast float %199 to i32, !dbg !30 + %217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 1, i32 31), !dbg !30 + %218 = bitcast i32 %217 to float, !dbg !30 + %219 = fsub float %212, %204, !dbg !29 + %220 = fadd float %199, %218, !dbg !32 + %221 = fcmp oeq float %220, 0.000000e+00, !dbg !33 + %222 = tail call float @llvm.nvvm.div.full(float %218, float %220), !dbg !34 + %223 = select i1 %221, float 0.000000e+00, float %222, !dbg !35 + %224 = fmul float %219, %223, !dbg !36 + %225 = fadd float %204, %224, !dbg !37 + %226 = fadd float %209, %215, !dbg !38 + %227 = fmul float %219, %219, !dbg !39 + %228 = fmul float %199, %227, !dbg !40 + %229 = fmul float %223, %228, !dbg !41 + %230 = fadd float %226, %229, !dbg !42 + %231 = icmp eq i32 %87, 0, !dbg !30 + %232 = getelementptr float, ptr addrspace(3) @global_smem, i32 %88, !dbg !30 + %233 = bitcast float %225 to <1 x i32>, !dbg !30 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %232, <1 x i32> %233, i1 %231) #6, !dbg !30 + %234 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %88, !dbg !30 + %235 = bitcast float %230 to <1 x i32>, !dbg !30 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %234, <1 x i32> %235, i1 %231) #6, !dbg !30 + %236 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %88, !dbg !30 + %237 = bitcast float %220 to <1 x i32>, !dbg !30 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %236, <1 x i32> %237, i1 %231) #6, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !30 + %238 = icmp samesign ult i32 %86, 16, !dbg !30 + %239 = getelementptr float, ptr addrspace(3) @global_smem, i32 %86, !dbg !30 + %240 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %239, i1 %238) #6, !dbg !30 + %241 = bitcast i32 %240 to float, !dbg !30 + %242 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %86, !dbg !30 + %243 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %242, i1 %238) #6, !dbg !30 + %244 = bitcast i32 %243 to float, !dbg !30 + %245 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %86, !dbg !30 + %246 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %245, i1 %238) #6, !dbg !30 + %247 = bitcast i32 %246 to float, !dbg !30 + %248 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %240, i32 8, i32 31), !dbg !30 + %249 = bitcast i32 %248 to float, !dbg !30 + %250 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %243, i32 8, i32 31), !dbg !30 + %251 = bitcast i32 %250 to float, !dbg !30 + %252 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %246, i32 8, i32 31), !dbg !30 + %253 = bitcast i32 %252 to float, !dbg !30 + %254 = fsub float %249, %241, !dbg !29 + %255 = fadd float %247, %253, !dbg !32 + %256 = fcmp oeq float %255, 0.000000e+00, !dbg !33 + %257 = tail call float @llvm.nvvm.div.full(float %253, float %255), !dbg !34 + %258 = select i1 %256, float 0.000000e+00, float %257, !dbg !35 + %259 = fmul float %254, %258, !dbg !36 + %260 = fadd float %259, %241, !dbg !37 + %261 = fadd float %244, %251, !dbg !38 + %262 = fmul float %254, %254, !dbg !39 + %263 = fmul float %262, %247, !dbg !40 + %264 = fmul float %263, %258, !dbg !41 + %265 = fadd float %261, %264, !dbg !42 + %266 = bitcast float %260 to i32, !dbg !30 + %267 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %266, i32 4, i32 31), !dbg !30 + %268 = bitcast i32 %267 to float, !dbg !30 + %269 = bitcast float %265 to i32, !dbg !30 + %270 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %269, i32 4, i32 31), !dbg !30 + %271 = bitcast i32 %270 to float, !dbg !30 + %272 = bitcast float %255 to i32, !dbg !30 + %273 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 4, i32 31), !dbg !30 + %274 = bitcast i32 %273 to float, !dbg !30 + %275 = fsub float %268, %260, !dbg !29 + %276 = fadd float %255, %274, !dbg !32 + %277 = fcmp oeq float %276, 0.000000e+00, !dbg !33 + %278 = tail call float @llvm.nvvm.div.full(float %274, float %276), !dbg !34 + %279 = select i1 %277, float 0.000000e+00, float %278, !dbg !35 + %280 = fmul float %275, %279, !dbg !36 + %281 = fadd float %260, %280, !dbg !37 + %282 = fadd float %265, %271, !dbg !38 + %283 = fmul float %275, %275, !dbg !39 + %284 = fmul float %255, %283, !dbg !40 + %285 = fmul float %279, %284, !dbg !41 + %286 = fadd float %282, %285, !dbg !42 + %287 = bitcast float %281 to i32, !dbg !30 + %288 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %287, i32 2, i32 31), !dbg !30 + %289 = bitcast i32 %288 to float, !dbg !30 + %290 = bitcast float %286 to i32, !dbg !30 + %291 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %290, i32 2, i32 31), !dbg !30 + %292 = bitcast i32 %291 to float, !dbg !30 + %293 = bitcast float %276 to i32, !dbg !30 + %294 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %293, i32 2, i32 31), !dbg !30 + %295 = bitcast i32 %294 to float, !dbg !30 + %296 = fsub float %289, %281, !dbg !29 + %297 = fadd float %276, %295, !dbg !32 + %298 = fcmp oeq float %297, 0.000000e+00, !dbg !33 + %299 = tail call float @llvm.nvvm.div.full(float %295, float %297), !dbg !34 + %300 = select i1 %298, float 0.000000e+00, float %299, !dbg !35 + %301 = fmul float %296, %300, !dbg !36 + %302 = fadd float %281, %301, !dbg !37 + %303 = fadd float %286, %292, !dbg !38 + %304 = fmul float %296, %296, !dbg !39 + %305 = fmul float %276, %304, !dbg !40 + %306 = fmul float %300, %305, !dbg !41 + %307 = fadd float %303, %306, !dbg !42 + %308 = bitcast float %302 to i32, !dbg !30 + %309 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %308, i32 1, i32 31), !dbg !30 + %310 = bitcast i32 %309 to float, !dbg !30 + %311 = bitcast float %307 to i32, !dbg !30 + %312 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %311, i32 1, i32 31), !dbg !30 + %313 = bitcast i32 %312 to float, !dbg !30 + %314 = bitcast float %297 to i32, !dbg !30 + %315 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %314, i32 1, i32 31), !dbg !30 + %316 = bitcast i32 %315 to float, !dbg !30 + %317 = fsub float %310, %302, !dbg !29 + %318 = fadd float %297, %316, !dbg !32 + %319 = fcmp oeq float %318, 0.000000e+00, !dbg !33 + %320 = tail call float @llvm.nvvm.div.full(float %316, float %318), !dbg !34 + %321 = select i1 %319, float 0.000000e+00, float %320, !dbg !35 + %322 = fmul float %317, %321, !dbg !36 + %323 = fadd float %302, %322, !dbg !37 + %324 = fadd float %307, %313, !dbg !38 + %325 = fmul float %317, %317, !dbg !39 + %326 = fmul float %297, %325, !dbg !40 + %327 = fmul float %321, %326, !dbg !41 + %328 = fadd float %324, %327, !dbg !42 + %329 = and i32 %10, 15, !dbg !30 + %330 = icmp eq i32 %329, 0, !dbg !30 + %331 = and i1 %238, %330, !dbg !30 + %332 = bitcast float %323 to <1 x i32>, !dbg !30 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %239, <1 x i32> %332, i1 %331) #6, !dbg !30 + %333 = bitcast float %328 to <1 x i32>, !dbg !30 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %242, <1 x i32> %333, i1 %331) #6, !dbg !30 + %334 = bitcast float %318 to <1 x i32>, !dbg !30 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %245, <1 x i32> %334, i1 %331) #6, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !30 + %335 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !30 + %336 = load float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !30 + %337 = tail call float @llvm.nvvm.div.full(float %336, float 4.096000e+03), !dbg !43 + %338 = fadd float %337, 0x3EB0C6F7A0000000, !dbg !44 + %339 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %340 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %341 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %342 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i15 = icmp eq i32 %342, 0, !dbg !45 + br i1 %.not.i15, label %345, label %343, !dbg !45 + +343: ; preds = %__nv_rsqrtf.exit + %344 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %338), !dbg !45 + br label %__nv_rsqrtf.exit17, !dbg !45 + +345: ; preds = %__nv_rsqrtf.exit + %346 = tail call float @llvm.nvvm.rsqrt.approx.f(float %338), !dbg !45 + br label %__nv_rsqrtf.exit17, !dbg !45 + +__nv_rsqrtf.exit17: ; preds = %343, %345 + %.0.i16 = phi float [ %344, %343 ], [ %346, %345 ], !dbg !45 + %347 = zext nneg i32 %12 to i64, !dbg !46 + %348 = sext i32 %13 to i64, !dbg !46 + %349 = getelementptr bfloat, ptr addrspace(1) %1, i64 %347, !dbg !47 + %350 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !48 + %351 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %349, i64 %350, i1 true) #6, !dbg !48 + %352 = extractvalue { i32, i32 } %351, 0, !dbg !48 + %353 = bitcast i32 %352 to <2 x bfloat>, !dbg !48 + %354 = extractvalue { i32, i32 } %351, 1, !dbg !48 + %355 = bitcast i32 %354 to <2 x bfloat>, !dbg !48 + %356 = or disjoint i64 %347, %348, !dbg !49 + %357 = getelementptr bfloat, ptr addrspace(1) %0, i64 %356, !dbg !50 + %358 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !51 + %359 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %357, i64 %358, i1 %9) #6, !dbg !51 + %360 = extractvalue { i32, i32 } %359, 0, !dbg !51 + %361 = bitcast i32 %360 to <2 x bfloat>, !dbg !51 + %362 = extractvalue { i32, i32 } %359, 1, !dbg !51 + %363 = bitcast i32 %362 to <2 x bfloat>, !dbg !51 + %364 = getelementptr bfloat, ptr addrspace(1) %2, i64 %347, !dbg !52 + %365 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !53 + %366 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %364, i64 %365, i1 true) #6, !dbg !53 + %367 = extractvalue { i32, i32 } %366, 0, !dbg !53 + %368 = bitcast i32 %367 to <2 x bfloat>, !dbg !53 + %369 = extractvalue { i32, i32 } %366, 1, !dbg !53 + %370 = bitcast i32 %369 to <2 x bfloat>, !dbg !53 + %371 = getelementptr bfloat, ptr addrspace(1) %3, i64 %356, !dbg !54 + %372 = fpext <2 x bfloat> %353 to <2 x float>, !dbg !55 + %373 = fpext <2 x bfloat> %361 to <2 x float>, !dbg !56 + %374 = fpext <2 x bfloat> %368 to <2 x float>, !dbg !57 + %375 = fadd <2 x float> %372, splat (float 1.000000e+00), !dbg !58 + %376 = insertelement <2 x float> poison, float %335, i64 0, !dbg !59 + %377 = shufflevector <2 x float> %376, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !59 + %378 = fsub <2 x float> %373, %377, !dbg !59 + %379 = insertelement <2 x float> poison, float %.0.i16, i64 0, !dbg !60 + %380 = shufflevector <2 x float> %379, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !60 + %381 = fmul <2 x float> %380, %378, !dbg !60 + %382 = fmul <2 x float> %375, %381, !dbg !61 + %383 = fadd <2 x float> %382, %374, !dbg !62 + %384 = fptrunc <2 x float> %383 to <2 x bfloat>, !dbg !63 + %385 = fpext <2 x bfloat> %355 to <2 x float>, !dbg !55 + %386 = fpext <2 x bfloat> %363 to <2 x float>, !dbg !56 + %387 = fpext <2 x bfloat> %370 to <2 x float>, !dbg !57 + %388 = fadd <2 x float> %385, splat (float 1.000000e+00), !dbg !58 + %389 = fsub <2 x float> %386, %377, !dbg !59 + %390 = fmul <2 x float> %380, %389, !dbg !60 + %391 = fmul <2 x float> %388, %390, !dbg !61 + %392 = fadd <2 x float> %391, %387, !dbg !62 + %393 = fptrunc <2 x float> %392 to <2 x bfloat>, !dbg !63 + %394 = bitcast <2 x bfloat> %384 to i32, !dbg !63 + %395 = bitcast <2 x bfloat> %393 to i32, !dbg !63 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %394, i32 %395, ptr addrspace(1) %371, i1 %9) #6, !dbg !63 + %396 = or disjoint i64 %347, 2048, !dbg !64 + %397 = getelementptr bfloat, ptr addrspace(1) %1, i64 %396, !dbg !47 + %398 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !48 + %399 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %397, i64 %398, i1 true) #6, !dbg !48 + %400 = extractvalue { i32, i32 } %399, 0, !dbg !48 + %401 = bitcast i32 %400 to <2 x bfloat>, !dbg !48 + %402 = extractvalue { i32, i32 } %399, 1, !dbg !48 + %403 = bitcast i32 %402 to <2 x bfloat>, !dbg !48 + %404 = or disjoint i64 %396, %348, !dbg !49 + %405 = getelementptr bfloat, ptr addrspace(1) %0, i64 %404, !dbg !50 + %406 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !51 + %407 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %405, i64 %406, i1 %9) #6, !dbg !51 + %408 = extractvalue { i32, i32 } %407, 0, !dbg !51 + %409 = bitcast i32 %408 to <2 x bfloat>, !dbg !51 + %410 = extractvalue { i32, i32 } %407, 1, !dbg !51 + %411 = bitcast i32 %410 to <2 x bfloat>, !dbg !51 + %412 = getelementptr bfloat, ptr addrspace(1) %2, i64 %396, !dbg !52 + %413 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !53 + %414 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %412, i64 %413, i1 true) #6, !dbg !53 + %415 = extractvalue { i32, i32 } %414, 0, !dbg !53 + %416 = bitcast i32 %415 to <2 x bfloat>, !dbg !53 + %417 = extractvalue { i32, i32 } %414, 1, !dbg !53 + %418 = bitcast i32 %417 to <2 x bfloat>, !dbg !53 + %419 = getelementptr bfloat, ptr addrspace(1) %3, i64 %404, !dbg !54 + %420 = fpext <2 x bfloat> %401 to <2 x float>, !dbg !55 + %421 = fpext <2 x bfloat> %409 to <2 x float>, !dbg !56 + %422 = fpext <2 x bfloat> %416 to <2 x float>, !dbg !57 + %423 = fadd <2 x float> %420, splat (float 1.000000e+00), !dbg !58 + %424 = fsub <2 x float> %421, %377, !dbg !59 + %425 = fmul <2 x float> %380, %424, !dbg !60 + %426 = fmul <2 x float> %423, %425, !dbg !61 + %427 = fadd <2 x float> %426, %422, !dbg !62 + %428 = fptrunc <2 x float> %427 to <2 x bfloat>, !dbg !63 + %429 = fpext <2 x bfloat> %403 to <2 x float>, !dbg !55 + %430 = fpext <2 x bfloat> %411 to <2 x float>, !dbg !56 + %431 = fpext <2 x bfloat> %418 to <2 x float>, !dbg !57 + %432 = fadd <2 x float> %429, splat (float 1.000000e+00), !dbg !58 + %433 = fsub <2 x float> %430, %377, !dbg !59 + %434 = fmul <2 x float> %380, %433, !dbg !60 + %435 = fmul <2 x float> %432, %434, !dbg !61 + %436 = fadd <2 x float> %435, %431, !dbg !62 + %437 = fptrunc <2 x float> %436 to <2 x bfloat>, !dbg !63 + %438 = bitcast <2 x bfloat> %428 to i32, !dbg !63 + %439 = bitcast <2 x bfloat> %437 to i32, !dbg !63 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %438, i32 %439, ptr addrspace(1) %419, i1 %9) #6, !dbg !63 + ret void, !dbg !65 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused_add_mul_native_layer_norm_0", linkageName: "triton_red_fused_add_mul_native_layer_norm_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 25, column: 21, scope: !5) +!10 = !DILocation(line: 26, column: 37, scope: !5) +!11 = !DILocation(line: 38, column: 46, scope: !5) +!12 = !DILocation(line: 32, column: 43, scope: !5) +!13 = !DILocation(line: 38, column: 34, scope: !5) +!14 = !DILocation(line: 38, column: 51, scope: !5) +!15 = !DILocation(line: 38, column: 112, scope: !5) +!16 = !DILocation(line: 44, column: 62, scope: !5) +!17 = !DILocation(line: 222, column: 24, scope: !18, inlinedAt: !20) +!18 = distinct !DILexicalBlockFile(scope: !5, file: !19, discriminator: 0) +!19 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime") +!20 = !DILocation(line: 42, column: 51, scope: !21) +!21 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!22 = !DILocation(line: 46, column: 66, scope: !5) +!23 = !DILocation(line: 224, column: 34, scope: !18, inlinedAt: !20) +!24 = !DILocation(line: 224, column: 26, scope: !18, inlinedAt: !20) +!25 = !DILocation(line: 225, column: 39, scope: !18, inlinedAt: !20) +!26 = !DILocation(line: 225, column: 31, scope: !18, inlinedAt: !20) +!27 = !DILocation(line: 225, column: 22, scope: !18, inlinedAt: !20) +!28 = !DILocation(line: 45, column: 58, scope: !5) +!29 = !DILocation(line: 231, column: 21, scope: !18, inlinedAt: !30) +!30 = !DILocation(line: 243, column: 46, scope: !18, inlinedAt: !31) +!31 = !DILocation(line: 47, column: 79, scope: !21) +!32 = !DILocation(line: 232, column: 28, scope: !18, inlinedAt: !30) +!33 = !DILocation(line: 233, column: 39, scope: !18, inlinedAt: !30) +!34 = !DILocation(line: 233, column: 60, scope: !18, inlinedAt: !30) +!35 = !DILocation(line: 233, column: 49, scope: !18, inlinedAt: !30) +!36 = !DILocation(line: 235, column: 25, scope: !18, inlinedAt: !30) +!37 = !DILocation(line: 235, column: 17, scope: !18, inlinedAt: !30) +!38 = !DILocation(line: 236, column: 15, scope: !18, inlinedAt: !30) +!39 = !DILocation(line: 236, column: 30, scope: !18, inlinedAt: !30) +!40 = !DILocation(line: 236, column: 38, scope: !18, inlinedAt: !30) +!41 = !DILocation(line: 236, column: 49, scope: !18, inlinedAt: !30) +!42 = !DILocation(line: 236, column: 22, scope: !18, inlinedAt: !30) +!43 = !DILocation(line: 65, column: 24, scope: !5) +!44 = !DILocation(line: 67, column: 24, scope: !5) +!45 = !DILocation(line: 68, column: 32, scope: !5) +!46 = !DILocation(line: 51, column: 43, scope: !5) +!47 = !DILocation(line: 57, column: 34, scope: !5) +!48 = !DILocation(line: 57, column: 41, scope: !5) +!49 = !DILocation(line: 58, column: 42, scope: !5) +!50 = !DILocation(line: 58, column: 35, scope: !5) +!51 = !DILocation(line: 58, column: 52, scope: !5) +!52 = !DILocation(line: 59, column: 35, scope: !5) +!53 = !DILocation(line: 59, column: 42, scope: !5) +!54 = !DILocation(line: 73, column: 29, scope: !5) +!55 = !DILocation(line: 57, column: 94, scope: !5) +!56 = !DILocation(line: 58, column: 114, scope: !5) +!57 = !DILocation(line: 59, column: 95, scope: !5) +!58 = !DILocation(line: 61, column: 23, scope: !5) +!59 = !DILocation(line: 63, column: 24, scope: !5) +!60 = !DILocation(line: 69, column: 24, scope: !5) +!61 = !DILocation(line: 71, column: 24, scope: !5) +!62 = !DILocation(line: 72, column: 24, scope: !5) +!63 = !DILocation(line: 73, column: 53, scope: !5) +!64 = !DILocation(line: 52, column: 31, scope: !5) +!65 = !DILocation(line: 51, column: 4, scope: !5) diff --git a/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.ptx b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..882724f06d4da7b0ccd0a61943787cb11db32f98 --- /dev/null +++ b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.ptx @@ -0,0 +1,1089 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused_add_mul_native_layer_norm_0 // -- Begin function triton_red_fused_add_mul_native_layer_norm_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused_add_mul_native_layer_norm_0 +.visible .entry triton_red_fused_add_mul_native_layer_norm_0( + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_3, + .param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_4, + .param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_7 +) +.reqntid 512 +{ + .reg .pred %p<19>; + .reg .b16 %rs<33>; + .reg .b32 %r<282>; + .reg .b64 %rd<28>; + .loc 1 18 0 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:18:0 + +// %bb.0: // %__nv_rsqrtf.exit + ld.param.b64 %rd19, [triton_red_fused_add_mul_native_layer_norm_0_param_0]; + ld.param.b64 %rd20, [triton_red_fused_add_mul_native_layer_norm_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:23:28 + mov.u32 %r37, %ctaid.x; + .loc 1 25 21 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:25:21 + setp.lt.u32 %p1, %r37, 2048; + ld.param.b64 %rd21, [triton_red_fused_add_mul_native_layer_norm_0_param_2]; + ld.param.b64 %rd22, [triton_red_fused_add_mul_native_layer_norm_0_param_3]; + .loc 1 26 37 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:26:37 + mov.u32 %r38, %tid.x; + shl.b32 %r39, %r38, 2; + and.b32 %r40, %r39, 2044; + .loc 1 38 46 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:46 + shl.b32 %r41, %r37, 12; + or.b32 %r42, %r40, %r41; + .loc 1 38 34 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:34 + mad.wide.s32 %rd1, %r42, 2, %rd19; + .loc 1 38 51 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:51 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r3, 0; + // begin inline asm + mov.u32 %r1, %r3; + mov.u32 %r2, %r3; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2; + // end inline asm + mov.b32 {%rs1, %rs2}, %r2; + .loc 1 38 112 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:112 + cvt.f32.bf16 %r43, %rs2; + cvt.f32.bf16 %r44, %rs1; + .loc 1 38 51 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:51 + mov.b32 {%rs3, %rs4}, %r1; + .loc 1 38 112 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:112 + cvt.f32.bf16 %r45, %rs4; + cvt.f32.bf16 %r46, %rs3; + .loc 1 44 62 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:44:62 + selp.f32 %r47, %r46, 0f00000000, %p1; + selp.f32 %r48, %r45, 0f00000000, %p1; + selp.f32 %r49, %r44, 0f00000000, %p1; + selp.f32 %r50, %r43, 0f00000000, %p1; + .loc 1 38 34 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:34 + add.s64 %rd3, %rd1, 4096; + .loc 1 38 51 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:51 + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r4, %r3; + mov.u32 %r5, %r3; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r4, %r5 }, [ %rd3 + 0 ], %rd4; + // end inline asm + mov.b32 {%rs5, %rs6}, %r4; + .loc 1 38 112 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:112 + cvt.f32.bf16 %r51, %rs5; +$L__tmp1: + .loc 2 222 24 // triton_helpers.py:222:24 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ] + sub.f32 %r52, %r51, %r47; +$L__tmp2: + .loc 1 46 66 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:46:66 + selp.f32 %r53, 0f40000000, 0f3F800000, %p1; +$L__tmp3: + .loc 2 224 34 // triton_helpers.py:224:34 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ] + div.full.f32 %r54, %r52, %r53; + .loc 2 224 26 // triton_helpers.py:224:26 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ] + add.f32 %r55, %r47, %r54; + .loc 2 225 39 // triton_helpers.py:225:39 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ] + sub.f32 %r56, %r51, %r55; + .loc 2 225 22 // triton_helpers.py:225:22 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ] + fma.rn.f32 %r57, %r52, %r56, 0f00000000; +$L__tmp4: + .loc 1 38 112 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:112 + cvt.f32.bf16 %r58, %rs6; +$L__tmp5: + .loc 2 222 24 // triton_helpers.py:222:24 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ] + sub.f32 %r59, %r58, %r48; + .loc 2 224 34 // triton_helpers.py:224:34 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ] + div.full.f32 %r60, %r59, %r53; + .loc 2 224 26 // triton_helpers.py:224:26 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ] + add.f32 %r61, %r48, %r60; + .loc 2 225 39 // triton_helpers.py:225:39 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ] + sub.f32 %r62, %r58, %r61; + .loc 2 225 22 // triton_helpers.py:225:22 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ] + fma.rn.f32 %r63, %r59, %r62, 0f00000000; +$L__tmp6: + .loc 1 38 51 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:51 + mov.b32 {%rs7, %rs8}, %r5; + .loc 1 38 112 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:112 + cvt.f32.bf16 %r64, %rs7; +$L__tmp7: + .loc 2 222 24 // triton_helpers.py:222:24 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ] + sub.f32 %r65, %r64, %r49; + .loc 2 224 34 // triton_helpers.py:224:34 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ] + div.full.f32 %r66, %r65, %r53; + .loc 2 224 26 // triton_helpers.py:224:26 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ] + add.f32 %r67, %r49, %r66; + .loc 2 225 39 // triton_helpers.py:225:39 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ] + sub.f32 %r68, %r64, %r67; + .loc 2 225 22 // triton_helpers.py:225:22 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ] + fma.rn.f32 %r69, %r65, %r68, 0f00000000; +$L__tmp8: + .loc 1 38 112 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:112 + cvt.f32.bf16 %r70, %rs8; +$L__tmp9: + .loc 2 222 24 // triton_helpers.py:222:24 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ] + sub.f32 %r71, %r70, %r50; + .loc 2 224 34 // triton_helpers.py:224:34 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ] + div.full.f32 %r72, %r71, %r53; + .loc 2 224 26 // triton_helpers.py:224:26 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ] + add.f32 %r73, %r50, %r72; + .loc 2 225 39 // triton_helpers.py:225:39 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ] + sub.f32 %r74, %r70, %r73; + .loc 2 225 22 // triton_helpers.py:225:22 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ] + fma.rn.f32 %r75, %r71, %r74, 0f00000000; +$L__tmp10: + .loc 1 44 62 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:44:62 + selp.f32 %r76, %r55, 0f00000000, %p1; + selp.f32 %r77, %r61, 0f00000000, %p1; + selp.f32 %r78, %r67, 0f00000000, %p1; + selp.f32 %r79, %r73, 0f00000000, %p1; + .loc 1 45 58 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:45:58 + selp.f32 %r80, %r69, 0f00000000, %p1; + selp.f32 %r81, %r75, 0f00000000, %p1; + .loc 1 46 66 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:46:66 + selp.f32 %r82, 0f40000000, 0f00000000, %p1; + .loc 1 26 37 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:26:37 + and.b32 %r83, %r38, 511; + and.b32 %r84, %r38, 31; +$L__tmp11: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r85, %r77, %r76; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r86, 0f40800000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p6, %r86, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r87, %r82, %r86; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r88, 0f00000000, %r87, %p6; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r89, %r85, %r88, %r76; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r90, %r57, %r63; + selp.f32 %r91, %r90, 0f00000000, %p1; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r92, %r85, %r85; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r93, %r92, %r82; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r94, %r93, %r88, %r91; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r95, %r78, %r89; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r96, 0f40C00000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p7, %r96, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r97, %r82, %r96; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r98, 0f00000000, %r97, %p7; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r99, %r98, %r95, %r89; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r100, %r80, %r94; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r101, %r95, %r95; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r102, %r86, %r101; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r103, %r98, %r102, %r100; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r104, %r79, %r99; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r105, 0f41000000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p8, %r105, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r106, %r82, %r105; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r107, 0f00000000, %r106, %p8; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r108, %r107, %r104, %r99; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r109, %r81, %r103; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r110, %r104, %r104; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r111, %r96, %r110; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r112, %r107, %r111, %r109; +$L__tmp12: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] + shfl.sync.bfly.b32 %r113, %r108, 16, 31, -1; + shfl.sync.bfly.b32 %r114, %r112, 16, 31, -1; + shfl.sync.bfly.b32 %r115, %r105, 16, 31, -1; +$L__tmp13: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r116, %r113, %r108; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r117, %r105, %r115; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p9, %r117, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r118, %r115, %r117; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r119, 0f00000000, %r118, %p9; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r120, %r119, %r116, %r108; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r121, %r112, %r114; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r122, %r116, %r116; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r123, %r105, %r122; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r124, %r119, %r123, %r121; +$L__tmp14: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] + shfl.sync.bfly.b32 %r125, %r120, 8, 31, -1; + shfl.sync.bfly.b32 %r126, %r124, 8, 31, -1; + shfl.sync.bfly.b32 %r127, %r117, 8, 31, -1; +$L__tmp15: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r128, %r125, %r120; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r129, %r117, %r127; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p10, %r129, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r130, %r127, %r129; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r131, 0f00000000, %r130, %p10; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r132, %r128, %r131, %r120; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r133, %r124, %r126; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r134, %r128, %r128; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r135, %r117, %r134; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r136, %r131, %r135, %r133; +$L__tmp16: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] + shfl.sync.bfly.b32 %r137, %r132, 4, 31, -1; + shfl.sync.bfly.b32 %r138, %r136, 4, 31, -1; + shfl.sync.bfly.b32 %r139, %r129, 4, 31, -1; +$L__tmp17: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r140, %r137, %r132; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r141, %r129, %r139; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p11, %r141, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r142, %r139, %r141; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r143, 0f00000000, %r142, %p11; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r144, %r140, %r143, %r132; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r145, %r136, %r138; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r146, %r140, %r140; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r147, %r129, %r146; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r148, %r143, %r147, %r145; +$L__tmp18: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] + shfl.sync.bfly.b32 %r149, %r144, 2, 31, -1; + shfl.sync.bfly.b32 %r150, %r148, 2, 31, -1; + shfl.sync.bfly.b32 %r151, %r141, 2, 31, -1; +$L__tmp19: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r152, %r149, %r144; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r153, %r141, %r151; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p12, %r153, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r154, %r151, %r153; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r155, 0f00000000, %r154, %p12; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r156, %r152, %r155, %r144; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r157, %r148, %r150; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r158, %r152, %r152; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r159, %r141, %r158; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r160, %r155, %r159, %r157; +$L__tmp20: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] + shfl.sync.bfly.b32 %r161, %r156, 1, 31, -1; + shfl.sync.bfly.b32 %r162, %r160, 1, 31, -1; + shfl.sync.bfly.b32 %r163, %r153, 1, 31, -1; +$L__tmp21: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r164, %r161, %r156; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r11, %r153, %r163; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p13, %r11, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r165, %r163, %r11; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r166, 0f00000000, %r165, %p13; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r7, %r164, %r166, %r156; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r167, %r160, %r162; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r168, %r164, %r164; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r169, %r153, %r168; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r9, %r166, %r169, %r167; +$L__tmp22: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] + setp.eq.b32 %p2, %r84, 0; + shr.u32 %r170, %r38, 3; + and.b32 %r171, %r170, 60; + mov.b32 %r172, global_smem; + add.s32 %r6, %r172, %r171; + // begin inline asm + @%p2 st.shared.b32 [ %r6 + 0 ], %r7; + // end inline asm + add.s32 %r8, %r6, 64; + // begin inline asm + @%p2 st.shared.b32 [ %r8 + 0 ], %r9; + // end inline asm + add.s32 %r10, %r6, 128; + // begin inline asm + @%p2 st.shared.b32 [ %r10 + 0 ], %r11; + // end inline asm + bar.sync 0; + setp.lt.u32 %p3, %r83, 16; + shl.b32 %r173, %r83, 2; + add.s32 %r13, %r172, %r173; + // begin inline asm + @%p3 ld.shared.b32 %r12, [ %r13 + 0 ]; + // end inline asm + add.s32 %r15, %r13, 64; + // begin inline asm + @%p3 ld.shared.b32 %r14, [ %r15 + 0 ]; + // end inline asm + add.s32 %r17, %r13, 128; + // begin inline asm + @%p3 ld.shared.b32 %r16, [ %r17 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r174, %r12, 8, 31, -1; + shfl.sync.bfly.b32 %r175, %r14, 8, 31, -1; + shfl.sync.bfly.b32 %r176, %r16, 8, 31, -1; +$L__tmp23: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r177, %r174, %r12; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r178, %r16, %r176; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p14, %r178, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r179, %r176, %r178; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r180, 0f00000000, %r179, %p14; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r181, %r177, %r180, %r12; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r182, %r14, %r175; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r183, %r177, %r177; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r184, %r183, %r16; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r185, %r184, %r180, %r182; +$L__tmp24: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] + shfl.sync.bfly.b32 %r186, %r181, 4, 31, -1; + shfl.sync.bfly.b32 %r187, %r185, 4, 31, -1; + shfl.sync.bfly.b32 %r188, %r178, 4, 31, -1; +$L__tmp25: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r189, %r186, %r181; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r190, %r178, %r188; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p15, %r190, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r191, %r188, %r190; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r192, 0f00000000, %r191, %p15; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r193, %r189, %r192, %r181; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r194, %r185, %r187; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r195, %r189, %r189; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r196, %r178, %r195; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r197, %r192, %r196, %r194; +$L__tmp26: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] + shfl.sync.bfly.b32 %r198, %r193, 2, 31, -1; + shfl.sync.bfly.b32 %r199, %r197, 2, 31, -1; + shfl.sync.bfly.b32 %r200, %r190, 2, 31, -1; +$L__tmp27: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r201, %r198, %r193; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r202, %r190, %r200; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p16, %r202, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r203, %r200, %r202; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r204, 0f00000000, %r203, %p16; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r205, %r201, %r204, %r193; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r206, %r197, %r199; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r207, %r201, %r201; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r208, %r190, %r207; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r209, %r204, %r208, %r206; +$L__tmp28: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] + shfl.sync.bfly.b32 %r210, %r205, 1, 31, -1; + shfl.sync.bfly.b32 %r211, %r209, 1, 31, -1; + shfl.sync.bfly.b32 %r212, %r202, 1, 31, -1; +$L__tmp29: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r213, %r210, %r205; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r20, %r202, %r212; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p17, %r20, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r214, %r212, %r20; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r215, 0f00000000, %r214, %p17; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r18, %r213, %r215, %r205; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r216, %r209, %r211; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r217, %r213, %r213; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r218, %r202, %r217; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r19, %r215, %r218, %r216; +$L__tmp30: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] + and.b32 %r219, %r38, 15; + setp.eq.b32 %p18, %r219, 0; + and.pred %p4, %p3, %p18; + // begin inline asm + @%p4 st.shared.b32 [ %r13 + 0 ], %r18; + // end inline asm + // begin inline asm + @%p4 st.shared.b32 [ %r15 + 0 ], %r19; + // end inline asm + // begin inline asm + @%p4 st.shared.b32 [ %r17 + 0 ], %r20; + // end inline asm + bar.sync 0; + ld.shared.b32 %r220, [global_smem]; + ld.shared.b32 %r221, [global_smem+64]; + mov.b32 %r222, 0f45800000; +$L__tmp31: + .loc 1 65 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:65:24 + div.full.f32 %r223, %r221, %r222; + .loc 1 67 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:67:24 + add.f32 %r224, %r223, 0f358637BD; + .loc 1 68 32 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:68:32 + rsqrt.approx.ftz.f32 %r225, %r224; + .loc 1 51 43 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:51:43 + cvt.u64.u32 %rd23, %r40; + cvt.s64.s32 %rd24, %r41; + .loc 1 57 34 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:34 + mul.wide.u32 %rd25, %r40, 2; + add.s64 %rd5, %rd20, %rd25; + .loc 1 57 41 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:41 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0; + // end inline asm + mov.pred %p5, -1; + // begin inline asm + mov.u32 %r21, %r3; + mov.u32 %r22, %r3; + @%p5 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r21, %r22 }, [ %rd5 + 0 ], %rd6; + // end inline asm + .loc 1 58 42 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:42 + or.b64 %rd26, %rd23, %rd24; + .loc 1 58 35 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:35 + shl.b64 %rd27, %rd26, 1; + add.s64 %rd7, %rd19, %rd27; + .loc 1 58 52 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:52 + // begin inline asm + mov.u64 %rd8, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd8, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r23, %r3; + mov.u32 %r24, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r23, %r24 }, [ %rd7 + 0 ], %rd8; + // end inline asm + .loc 1 59 35 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:35 + add.s64 %rd9, %rd21, %rd25; + .loc 1 59 42 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:42 + // begin inline asm + mov.u64 %rd10, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd10, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r25, %r3; + mov.u32 %r26, %r3; + @%p5 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r25, %r26 }, [ %rd9 + 0 ], %rd10; + // end inline asm + .loc 1 73 29 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:73:29 + add.s64 %rd11, %rd22, %rd27; + .loc 1 57 94 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:94 + mov.b32 {%rs9, %rs10}, %r21; + cvt.f32.bf16 %r226, %rs9; + cvt.f32.bf16 %r227, %rs10; + .loc 1 58 114 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:114 + mov.b32 {%rs11, %rs12}, %r23; + cvt.f32.bf16 %r228, %rs12; + cvt.f32.bf16 %r229, %rs11; + .loc 1 59 95 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:95 + mov.b32 {%rs13, %rs14}, %r25; + cvt.f32.bf16 %r230, %rs14; + cvt.f32.bf16 %r231, %rs13; + .loc 1 61 23 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:61:23 + add.f32 %r232, %r227, 0f3F800000; + add.f32 %r233, %r226, 0f3F800000; + .loc 1 63 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:63:24 + sub.f32 %r234, %r229, %r220; + sub.f32 %r235, %r228, %r220; + .loc 1 69 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:69:24 + mul.f32 %r236, %r225, %r235; + mul.f32 %r237, %r225, %r234; + .loc 1 72 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:72:24 + fma.rn.f32 %r238, %r233, %r237, %r231; + fma.rn.f32 %r239, %r232, %r236, %r230; + .loc 1 73 53 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:73:53 + cvt.rn.bf16x2.f32 %r27, %r239, %r238; + .loc 1 57 94 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:94 + mov.b32 {%rs15, %rs16}, %r22; + cvt.f32.bf16 %r240, %rs15; + cvt.f32.bf16 %r241, %rs16; + .loc 1 58 114 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:114 + mov.b32 {%rs17, %rs18}, %r24; + cvt.f32.bf16 %r242, %rs18; + cvt.f32.bf16 %r243, %rs17; + .loc 1 59 95 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:95 + mov.b32 {%rs19, %rs20}, %r26; + cvt.f32.bf16 %r244, %rs20; + cvt.f32.bf16 %r245, %rs19; + .loc 1 61 23 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:61:23 + add.f32 %r246, %r241, 0f3F800000; + add.f32 %r247, %r240, 0f3F800000; + .loc 1 63 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:63:24 + sub.f32 %r248, %r243, %r220; + sub.f32 %r249, %r242, %r220; + .loc 1 69 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:69:24 + mul.f32 %r250, %r225, %r249; + mul.f32 %r251, %r225, %r248; + .loc 1 72 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:72:24 + fma.rn.f32 %r252, %r247, %r251, %r245; + fma.rn.f32 %r253, %r246, %r250, %r244; + .loc 1 73 53 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:73:53 + cvt.rn.bf16x2.f32 %r28, %r253, %r252; + // begin inline asm + @%p1 st.global.v2.b32 [ %rd11 + 0 ], { %r27, %r28 }; + // end inline asm + .loc 1 57 34 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:34 + add.s64 %rd12, %rd5, 4096; + .loc 1 57 41 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:41 + // begin inline asm + mov.u64 %rd13, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd13, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r29, %r3; + mov.u32 %r30, %r3; + @%p5 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r29, %r30 }, [ %rd12 + 0 ], %rd13; + // end inline asm + .loc 1 58 35 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:35 + add.s64 %rd14, %rd7, 4096; + .loc 1 58 52 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:52 + // begin inline asm + mov.u64 %rd15, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd15, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r31, %r3; + mov.u32 %r32, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r31, %r32 }, [ %rd14 + 0 ], %rd15; + // end inline asm + .loc 1 59 35 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:35 + add.s64 %rd16, %rd9, 4096; + .loc 1 59 42 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:42 + // begin inline asm + mov.u64 %rd17, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r33, %r3; + mov.u32 %r34, %r3; + @%p5 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r33, %r34 }, [ %rd16 + 0 ], %rd17; + // end inline asm + .loc 1 73 29 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:73:29 + add.s64 %rd18, %rd11, 4096; + .loc 1 57 94 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:94 + mov.b32 {%rs21, %rs22}, %r29; + cvt.f32.bf16 %r254, %rs21; + cvt.f32.bf16 %r255, %rs22; + .loc 1 58 114 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:114 + mov.b32 {%rs23, %rs24}, %r31; + cvt.f32.bf16 %r256, %rs24; + cvt.f32.bf16 %r257, %rs23; + .loc 1 59 95 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:95 + mov.b32 {%rs25, %rs26}, %r33; + cvt.f32.bf16 %r258, %rs26; + cvt.f32.bf16 %r259, %rs25; + .loc 1 61 23 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:61:23 + add.f32 %r260, %r255, 0f3F800000; + add.f32 %r261, %r254, 0f3F800000; + .loc 1 63 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:63:24 + sub.f32 %r262, %r257, %r220; + sub.f32 %r263, %r256, %r220; + .loc 1 69 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:69:24 + mul.f32 %r264, %r225, %r263; + mul.f32 %r265, %r225, %r262; + .loc 1 72 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:72:24 + fma.rn.f32 %r266, %r261, %r265, %r259; + fma.rn.f32 %r267, %r260, %r264, %r258; + .loc 1 73 53 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:73:53 + cvt.rn.bf16x2.f32 %r35, %r267, %r266; + .loc 1 57 94 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:94 + mov.b32 {%rs27, %rs28}, %r30; + cvt.f32.bf16 %r268, %rs27; + cvt.f32.bf16 %r269, %rs28; + .loc 1 58 114 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:114 + mov.b32 {%rs29, %rs30}, %r32; + cvt.f32.bf16 %r270, %rs30; + cvt.f32.bf16 %r271, %rs29; + .loc 1 59 95 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:95 + mov.b32 {%rs31, %rs32}, %r34; + cvt.f32.bf16 %r272, %rs32; + cvt.f32.bf16 %r273, %rs31; + .loc 1 61 23 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:61:23 + add.f32 %r274, %r269, 0f3F800000; + add.f32 %r275, %r268, 0f3F800000; + .loc 1 63 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:63:24 + sub.f32 %r276, %r271, %r220; + sub.f32 %r277, %r270, %r220; + .loc 1 69 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:69:24 + mul.f32 %r278, %r225, %r277; + mul.f32 %r279, %r225, %r276; + .loc 1 72 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:72:24 + fma.rn.f32 %r280, %r275, %r279, %r273; + fma.rn.f32 %r281, %r274, %r278, %r272; + .loc 1 73 53 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:73:53 + cvt.rn.bf16x2.f32 %r36, %r281, %r280; + // begin inline asm + @%p1 st.global.v2.b32 [ %rd18 + 0 ], { %r35, %r36 }; + // end inline asm + .loc 1 51 4 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:51:4 + ret; +$L__tmp32: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 367 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x168 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 119 +.b8 119 +.b8 105 +.b8 122 +.b8 122 +.b8 106 +.b8 119 +.b8 109 +.b8 100 +.b8 52 +.b8 97 +.b8 106 +.b8 108 +.b8 117 +.b8 98 +.b8 120 +.b8 112 +.b8 118 +.b8 120 +.b8 105 +.b8 100 +.b8 106 +.b8 105 +.b8 121 +.b8 51 +.b8 108 +.b8 100 +.b8 118 +.b8 53 +.b8 101 +.b8 102 +.b8 108 +.b8 119 +.b8 108 +.b8 117 +.b8 100 +.b8 103 +.b8 105 +.b8 122 +.b8 99 +.b8 97 +.b8 104 +.b8 118 +.b8 115 +.b8 112 +.b8 52 +.b8 105 +.b8 55 +.b8 53 +.b8 115 +.b8 50 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 119 +.b8 119 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x2f DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 100 +.b8 100 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 110 +.b8 97 +.b8 116 +.b8 105 +.b8 118 +.b8 101 +.b8 95 +.b8 108 +.b8 97 +.b8 121 +.b8 101 +.b8 114 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x113:0x5f DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x128:0x18 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp10 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 42 // DW_AT_call_line +.b8 51 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x140:0x31 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp11 // DW_AT_low_pc +.b64 $L__tmp31 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 47 // DW_AT_call_line +.b8 79 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x158:0x18 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp11 // DW_AT_low_pc +.b64 $L__tmp30 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 243 // DW_AT_call_line +.b8 46 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.source b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.source new file mode 100644 index 0000000000000000000000000000000000000000..83e17ff3f38d522bca1175f7911c73a4f62e97cf --- /dev/null +++ b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.source @@ -0,0 +1,420 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":18:0) +#loc72 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":216:0) +#loc85 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":133:0) +#loc89 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":242:0) +#loc91 = loc(unknown) +#loc94 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":230:0) +#loc109 = loc("in_ptr0"(#loc)) +#loc110 = loc("in_ptr1"(#loc)) +#loc111 = loc("in_ptr2"(#loc)) +#loc112 = loc("out_ptr2"(#loc)) +#loc113 = loc("xnumel"(#loc)) +#loc114 = loc("r0_numel"(#loc)) +#loc171 = loc("value"(#loc72)) +#loc172 = loc("mean"(#loc72)) +#loc173 = loc("m2"(#loc72)) +#loc174 = loc("weight"(#loc72)) +#loc175 = loc("first_iteration"(#loc72)) +#loc185 = loc("input"(#loc85)) +#loc186 = loc("mean"(#loc89)) +#loc187 = loc("m2"(#loc89)) +#loc188 = loc("weight"(#loc89)) +#loc189 = loc("mean_1"(#loc94)) +#loc190 = loc("m2_1"(#loc94)) +#loc191 = loc("weight_1"(#loc94)) +#loc192 = loc("mean_2"(#loc94)) +#loc193 = loc("m2_2"(#loc94)) +#loc194 = loc("weight_2"(#loc94)) +#loc201 = loc("new_mean"(#loc171)) +module { + tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 2048 : i32 loc(#loc115) + %r0_numel_1 = arith.constant 4096 : i32 loc(#loc116) + %xoffset = tt.get_program_id x : i32 loc(#loc117) + %xoffset_2 = arith.constant 1 : i32 loc(#loc118) + %xoffset_3 = arith.constant 1 : i32 loc(#loc118) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc118) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc119) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc120) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc121) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc121) + %xmask = arith.constant dense<2048> : tensor<1x1xi32> loc(#loc122) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc122) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc123) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc124) + %tmp3_mean = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc125) + %tmp3_m2 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc126) + %tmp3_weight = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc127) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %tmp3_weight_10:3 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%tmp3_mean_13 = %tmp3_mean, %tmp3_m2_14 = %tmp3_m2, %tmp3_weight_15 = %tmp3_weight) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc129) + %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc129) + %r0_mask = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc130) + %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x2048xi32> loc(#loc130) + %tmp0 = arith.constant 4096 : i32 loc(#loc131) + %tmp0_18 = arith.constant 4096 : i32 loc(#loc131) + %tmp0_19 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc131) + %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc131) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc132) + %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x2048xi32> loc(#loc132) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc133) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc133) + %tmp0_25 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc134) + %tmp0_26 = arith.andi %r0_mask_17, %tmp0_25 : tensor<1x2048xi1> loc(#loc134) + %tmp0_27 = arith.constant 0.000000e+00 : f32 loc(#loc135) + %tmp0_28 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc135) + %tmp0_29 = arith.truncf %tmp0_28 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc135) + %tmp0_30 = tt.load %tmp0_24, %tmp0_26, %tmp0_29 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc135) + %tmp0_31 = arith.extf %tmp0_30 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc136) + %c0_i32_32 = arith.constant 0 : i32 loc(#loc23) + %9 = arith.cmpi eq, %r0_offset, %c0_i32_32 : i32 loc(#loc23) + %10:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_u1__(%tmp0_31, %tmp3_mean_13, %tmp3_m2_14, %tmp3_weight_15, %9) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>, i1) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) loc(#loc24) + %tmp3_mean_33 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc137) + %tmp3_mean_34 = arith.andi %r0_mask_17, %tmp3_mean_33 : tensor<1x2048xi1> loc(#loc137) + %tmp3_mean_35 = arith.select %tmp3_mean_34, %10#0, %tmp3_mean_13 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc138) + %tmp3_m2_36 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc139) + %tmp3_m2_37 = arith.andi %r0_mask_17, %tmp3_m2_36 : tensor<1x2048xi1> loc(#loc139) + %tmp3_m2_38 = arith.select %tmp3_m2_37, %10#1, %tmp3_m2_14 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc140) + %tmp3_weight_39 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc141) + %tmp3_weight_40 = arith.andi %r0_mask_17, %tmp3_weight_39 : tensor<1x2048xi1> loc(#loc141) + %tmp3_weight_41 = arith.select %tmp3_weight_40, %10#2, %tmp3_weight_15 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc142) + scf.yield %tmp3_mean_35, %tmp3_m2_38, %tmp3_weight_41 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc31) + } loc(#loc207) + %4:3 = tt.call @"torch._inductor.runtime.triton_helpers.welford__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_1_"(%tmp3_weight_10#0, %tmp3_weight_10#1, %tmp3_weight_10#2) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc32) + %tmp3 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc143) + %tmp7 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc144) + %tmp8 = tt.expand_dims %4#2 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc145) + %c0_i32_11 = arith.constant 0 : i32 loc(#loc36) + %c2048_i32_12 = arith.constant 2048 : i32 loc(#loc36) + %5 = arith.bitcast %c0_i32_11 : i32 to i32 loc(#loc36) + %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc36) + %7 = arith.bitcast %c2048_i32_12 : i32 to i32 loc(#loc36) + %8 = ub.poison : i32 loc(#loc36) + scf.for %r0_offset = %5 to %6 step %7 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc146) + %r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc146) + %r0_mask = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc147) + %r0_mask_14 = arith.cmpi slt, %r0_index_13, %r0_mask : tensor<1x2048xi32> loc(#loc147) + %tmp9 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc148) + %tmp9_15 = tt.addptr %tmp9, %r0_index_13 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc148) + %tmp9_16 = arith.constant 0.000000e+00 : f32 loc(#loc149) + %tmp9_17 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc149) + %tmp9_18 = arith.truncf %tmp9_17 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc149) + %tmp9_19 = tt.load %tmp9_15, %r0_mask_14, %tmp9_18 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc149) + %tmp9_20 = arith.extf %tmp9_19 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc150) + %tmp12 = arith.constant 4096 : i32 loc(#loc151) + %tmp12_21 = arith.constant 4096 : i32 loc(#loc151) + %tmp12_22 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc151) + %tmp12_23 = arith.muli %tmp12_22, %xindex_7 : tensor<1x1xi32> loc(#loc151) + %tmp12_24 = tt.broadcast %tmp12_23 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc152) + %tmp12_25 = arith.addi %r0_index_13, %tmp12_24 : tensor<1x2048xi32> loc(#loc152) + %tmp12_26 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc153) + %tmp12_27 = tt.addptr %tmp12_26, %tmp12_25 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc153) + %tmp12_28 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc154) + %tmp12_29 = arith.andi %r0_mask_14, %tmp12_28 : tensor<1x2048xi1> loc(#loc154) + %tmp12_30 = arith.constant 0.000000e+00 : f32 loc(#loc155) + %tmp12_31 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc155) + %tmp12_32 = arith.truncf %tmp12_31 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc155) + %tmp12_33 = tt.load %tmp12_27, %tmp12_29, %tmp12_32 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc155) + %tmp12_34 = arith.extf %tmp12_33 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc156) + %tmp23 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc157) + %tmp23_35 = tt.addptr %tmp23, %r0_index_13 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc157) + %tmp23_36 = arith.constant 0.000000e+00 : f32 loc(#loc158) + %tmp23_37 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc158) + %tmp23_38 = arith.truncf %tmp23_37 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc158) + %tmp23_39 = tt.load %tmp23_35, %r0_mask_14, %tmp23_38 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc158) + %tmp23_40 = arith.extf %tmp23_39 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc159) + %tmp10 = arith.constant 1.000000e+00 : f32 loc(#loc160) + %tmp11 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc161) + %tmp11_41 = arith.addf %tmp9_20, %tmp11 : tensor<1x2048xf32> loc(#loc161) + %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc162) + %tmp14_42 = arith.subf %tmp12_34, %tmp14 : tensor<1x2048xf32> loc(#loc162) + %tmp15 = arith.constant 4.096000e+03 : f32 loc(#loc163) + %tmp16 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc164) + %tmp16_43 = arith.divf %tmp7, %tmp16 : tensor<1x1xf32> loc(#loc164) + %tmp17 = arith.constant 9.99999997E-7 : f32 loc(#loc165) + %tmp18 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc166) + %tmp18_44 = arith.addf %tmp16_43, %tmp18 : tensor<1x1xf32> loc(#loc166) + %tmp19 = tt.extern_elementwise %tmp18_44 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc167) + %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc168) + %tmp20_45 = arith.mulf %tmp14_42, %tmp20 : tensor<1x2048xf32> loc(#loc168) + %tmp22 = arith.mulf %tmp11_41, %tmp20_45 : tensor<1x2048xf32> loc(#loc169) + %tmp24 = arith.addf %tmp22, %tmp23_40 : tensor<1x2048xf32> loc(#loc170) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc62) + %c4096_i32_46 = arith.constant 4096 : i32 loc(#loc62) + %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc62) + %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc62) + %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc63) + %11 = arith.addi %r0_index_13, %10 : tensor<1x2048xi32> loc(#loc63) + %12 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc64) + %13 = tt.addptr %12, %11 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc64) + %14 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc65) + %15 = arith.andi %r0_mask_14, %14 : tensor<1x2048xi1> loc(#loc65) + %16 = arith.truncf %tmp24 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc66) + tt.store %13, %16, %15 : tensor<1x2048x!tt.ptr> loc(#loc66) + } loc(#loc36) + tt.return loc(#loc67) + } loc(#loc) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() -> tensor<1x2048xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc69) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc69) + tt.return %cst_0 : tensor<1x2048xf32> loc(#loc70) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x2048xf32> loc(#loc71) + tt.return %0 : tensor<1x2048xf32> loc(#loc71) + } loc(#loc68) + tt.func private @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_u1__(%new_mean: tensor<1x2048xf32> loc("new_mean"(#loc171)), %mean: tensor<1x2048xf32> loc("mean"(#loc72)), %m2: tensor<1x2048xf32> loc("m2"(#loc72)), %weight: tensor<1x2048xf32> loc("weight"(#loc72)), %first_iteration: i1 loc("first_iteration"(#loc72))) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) attributes {noinline = false} { + %0:3 = scf.if %first_iteration -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) { + %new_weight = arith.constant 1.000000e+00 : f32 loc(#loc176) + %new_weight_0 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc202) + %new_m2 = tt.call @triton.language.standard.zeros_like__fp32S1_2048S__(%m2) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc203) + scf.yield %new_m2, %new_mean, %new_weight_0 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc203) + } else { + %delta = arith.subf %new_mean, %mean : tensor<1x2048xf32> loc(#loc178) + %new_weight = arith.constant 1 : i32 loc(#loc179) + %new_weight_0 = arith.constant 1.000000e+00 : f32 loc(#loc179) + %new_weight_1 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc179) + %new_weight_2 = arith.addf %weight, %new_weight_1 : tensor<1x2048xf32> loc(#loc204) + %new_mean_3 = arith.divf %delta, %new_weight_2 : tensor<1x2048xf32> loc(#loc180) + %new_mean_4 = arith.addf %mean, %new_mean_3 : tensor<1x2048xf32> loc(#loc205) + %new_m2 = arith.subf %new_mean, %new_mean_4 : tensor<1x2048xf32> loc(#loc182) + %new_m2_5 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32> loc(#loc183) + %new_m2_6 = arith.addf %m2, %new_m2_5 : tensor<1x2048xf32> loc(#loc206) + scf.yield %new_m2_6, %new_mean_4, %new_weight_2 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc184) + } loc(#loc73) + tt.return %0#1, %0#0, %0#2 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc83) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x2048xf32> loc(#loc84) + %2 = ub.poison : tensor<1x2048xf32> loc(#loc84) + %3 = ub.poison : tensor<1x2048xf32> loc(#loc84) + tt.return %1, %2, %3 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc84) + } loc(#loc72) + tt.func private @triton.language.standard.zeros_like__fp32S1_2048S__(%input: tensor<1x2048xf32> loc("input"(#loc85))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc86) + tt.return %0 : tensor<1x2048xf32> loc(#loc87) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x2048xf32> loc(#loc88) + tt.return %1 : tensor<1x2048xf32> loc(#loc88) + } loc(#loc85) + tt.func private @"torch._inductor.runtime.triton_helpers.welford__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_1_"(%mean: tensor<1x2048xf32> loc("mean"(#loc89)), %m2: tensor<1x2048xf32> loc("m2"(#loc89)), %weight: tensor<1x2048xf32> loc("weight"(#loc89))) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} { + %0:3 = "tt.reduce"(%mean, %m2, %weight) <{axis = 1 : i32}> ({ + ^bb0(%arg3: f32 loc(unknown), %arg4: f32 loc(unknown), %arg5: f32 loc(unknown), %arg6: f32 loc(unknown), %arg7: f32 loc(unknown), %arg8: f32 loc(unknown)): + %4:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (f32, f32, f32, f32, f32, f32) -> (f32, f32, f32) loc(#loc90) + tt.reduce.return %4#0, %4#1, %4#2 : f32, f32, f32 loc(#loc90) + }) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc90) + tt.return %0#0, %0#1, %0#2 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc92) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc93) + %2 = ub.poison : tensor<1xf32> loc(#loc93) + %3 = ub.poison : tensor<1xf32> loc(#loc93) + tt.return %1, %2, %3 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc93) + } loc(#loc89) + tt.func private @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%mean_1: f32 loc("mean_1"(#loc94)), %m2_1: f32 loc("m2_1"(#loc94)), %weight_1: f32 loc("weight_1"(#loc94)), %mean_2: f32 loc("mean_2"(#loc94)), %m2_2: f32 loc("m2_2"(#loc94)), %weight_2: f32 loc("weight_2"(#loc94))) -> (f32, f32, f32) attributes {noinline = false} { + %delta = arith.subf %mean_2, %mean_1 : f32 loc(#loc195) + %new_weight = arith.addf %weight_1, %weight_2 : f32 loc(#loc196) + %w2_over_w = arith.constant 0.000000e+00 : f32 loc(#loc197) + %w2_over_w_0 = arith.cmpf oeq, %new_weight, %w2_over_w : f32 loc(#loc197) + %w2_over_w_1 = arith.divf %weight_2, %new_weight : f32 loc(#loc198) + %w2_over_w_2 = arith.constant 0.000000e+00 : f32 loc(#loc199) + %w2_over_w_3 = arith.constant 0.000000e+00 : f32 loc(#loc199) + %w2_over_w_4 = arith.select %w2_over_w_0, %w2_over_w_3, %w2_over_w_1 : f32 loc(#loc199) + %0 = arith.mulf %delta, %w2_over_w_4 : f32 loc(#loc100) + %1 = arith.addf %mean_1, %0 : f32 loc(#loc101) + %2 = arith.addf %m2_1, %m2_2 : f32 loc(#loc102) + %3 = arith.mulf %delta, %delta : f32 loc(#loc103) + %4 = arith.mulf %3, %weight_1 : f32 loc(#loc104) + %5 = arith.mulf %4, %w2_over_w_4 : f32 loc(#loc105) + %6 = arith.addf %2, %5 : f32 loc(#loc106) + tt.return %1, %6, %new_weight : f32, f32, f32 loc(#loc107) + ^bb1: // no predecessors + %7 = ub.poison : f32 loc(#loc108) + %8 = ub.poison : f32 loc(#loc108) + %9 = ub.poison : f32 loc(#loc108) + tt.return %7, %8, %9 : f32, f32, f32 loc(#loc108) + } loc(#loc94) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":25:21) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":29:45) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":30:43) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":31:47) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:46) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:34) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:61) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:51) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:112) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":42:62) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":42:51) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":44:39) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":44:62) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":45:37) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":45:58) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":46:41) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":46:66) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":46:8) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":47:79) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":48:16) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":49:16) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":50:16) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":51:43) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":52:31) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":53:29) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:34) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:41) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:94) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:47) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:42) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:35) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:62) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:52) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:114) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:35) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:42) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:95) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":60:16) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":61:23) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":63:24) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":64:16) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":65:24) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":66:16) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":67:24) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":68:32) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":69:24) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":71:24) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":72:24) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:41) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:36) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:29) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:63) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:53) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":51:4) +#loc68 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":120:0) +#loc69 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:31) +#loc70 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:11) +#loc71 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:4) +#loc73 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7) +#loc74 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":218:46) +#loc75 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31) +#loc76 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24) +#loc77 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30) +#loc78 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34) +#loc79 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26) +#loc80 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39) +#loc81 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31) +#loc82 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22) +#loc83 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:11) +#loc84 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:4) +#loc86 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:30) +#loc87 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:11) +#loc88 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:4) +#loc90 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc92 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:11) +#loc93 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:4) +#loc95 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc96 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc97 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc98 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc99 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc100 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc101 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc102 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc103 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc104 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc105 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc106 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc107 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:11) +#loc108 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:4) +#loc115 = loc("xnumel"(#loc1)) +#loc116 = loc("r0_numel"(#loc2)) +#loc117 = loc("xoffset"(#loc3)) +#loc118 = loc("xoffset"(#loc4)) +#loc119 = loc("xindex"(#loc5)) +#loc120 = loc("xindex"(#loc6)) +#loc121 = loc("xindex"(#loc7)) +#loc122 = loc("xmask"(#loc8)) +#loc123 = loc("r0_base"(#loc9)) +#loc124 = loc("r0_base"(#loc10)) +#loc125 = loc("tmp3_mean"(#loc11)) +#loc126 = loc("tmp3_m2"(#loc12)) +#loc127 = loc("tmp3_weight"(#loc13)) +#loc128 = loc("tmp3_mean"(#loc14)) +#loc129 = loc("r0_index"(#loc15)) +#loc130 = loc("r0_mask"(#loc16)) +#loc131 = loc("tmp0"(#loc17)) +#loc132 = loc("tmp0"(#loc18)) +#loc133 = loc("tmp0"(#loc19)) +#loc134 = loc("tmp0"(#loc20)) +#loc135 = loc("tmp0"(#loc21)) +#loc136 = loc("tmp0"(#loc22)) +#loc137 = loc("tmp3_mean"(#loc25)) +#loc138 = loc("tmp3_mean"(#loc26)) +#loc139 = loc("tmp3_m2"(#loc27)) +#loc140 = loc("tmp3_m2"(#loc28)) +#loc141 = loc("tmp3_weight"(#loc29)) +#loc142 = loc("tmp3_weight"(#loc30)) +#loc143 = loc("tmp3"(#loc33)) +#loc144 = loc("tmp7"(#loc34)) +#loc145 = loc("tmp8"(#loc35)) +#loc146 = loc("r0_index"(#loc37)) +#loc147 = loc("r0_mask"(#loc38)) +#loc148 = loc("tmp9"(#loc39)) +#loc149 = loc("tmp9"(#loc40)) +#loc150 = loc("tmp9"(#loc41)) +#loc151 = loc("tmp12"(#loc42)) +#loc152 = loc("tmp12"(#loc43)) +#loc153 = loc("tmp12"(#loc44)) +#loc154 = loc("tmp12"(#loc45)) +#loc155 = loc("tmp12"(#loc46)) +#loc156 = loc("tmp12"(#loc47)) +#loc157 = loc("tmp23"(#loc48)) +#loc158 = loc("tmp23"(#loc49)) +#loc159 = loc("tmp23"(#loc50)) +#loc160 = loc("tmp10"(#loc51)) +#loc161 = loc("tmp11"(#loc52)) +#loc162 = loc("tmp14"(#loc53)) +#loc163 = loc("tmp15"(#loc54)) +#loc164 = loc("tmp16"(#loc55)) +#loc165 = loc("tmp17"(#loc56)) +#loc166 = loc("tmp18"(#loc57)) +#loc167 = loc("tmp19"(#loc58)) +#loc168 = loc("tmp20"(#loc59)) +#loc169 = loc("tmp22"(#loc60)) +#loc170 = loc("tmp24"(#loc61)) +#loc176 = loc("new_weight"(#loc74)) +#loc177 = loc("new_m2"(#loc75)) +#loc178 = loc("delta"(#loc76)) +#loc179 = loc("new_weight"(#loc77)) +#loc180 = loc("new_mean"(#loc78)) +#loc181 = loc("new_mean"(#loc79)) +#loc182 = loc("new_m2"(#loc80)) +#loc183 = loc("new_m2"(#loc81)) +#loc184 = loc("new_m2"(#loc82)) +#loc195 = loc("delta"(#loc95)) +#loc196 = loc("new_weight"(#loc96)) +#loc197 = loc("w2_over_w"(#loc97)) +#loc198 = loc("w2_over_w"(#loc98)) +#loc199 = loc("w2_over_w"(#loc99)) +#loc200 = loc("tmp3_m2"(#loc128)) +#loc202 = loc("new_weight"(#loc176)) +#loc203 = loc("new_m2"(#loc177)) +#loc204 = loc("new_weight"(#loc179)) +#loc205 = loc("new_mean"(#loc181)) +#loc206 = loc("new_m2"(#loc184)) +#loc207 = loc("tmp3_weight"(#loc200)) diff --git a/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..872c8277b54b268d64b927dc36da234a1604e509 --- /dev/null +++ b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir @@ -0,0 +1,260 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":18:0) +#loc1 = loc(unknown) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":47:79) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("in_ptr1"(#loc)) +#loc72 = loc("in_ptr2"(#loc)) +#loc73 = loc("out_ptr2"(#loc)) +#loc74 = loc("xnumel"(#loc)) +#loc75 = loc("r0_numel"(#loc)) +#loc101 = loc(callsite(#loc1 at #loc30)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4096> : tensor<1x2048xi32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked> loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %cst_1 = arith.constant 0.000000e+00 : f32 loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<4.096000e+03> : tensor<1x1xf32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc76) + %xmask = arith.cmpi slt, %xoffset, %c2048_i32 : i32 loc(#loc77) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc78) + %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc78) + %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc79) + %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc130) + %tmp0_8 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc81) + %tmp0_9 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc131) + %tmp3_weight:3 = scf.for %tmp3_weight_10 = %c0_i32 to %c4096_i32 step %c2048_i32 iter_args(%arg7 = %cst_2, %arg8 = %cst_2, %arg9 = %cst_2) -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) : i32 { + %r0_index = tt.splat %tmp3_weight_10 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc84) + %r0_index_11 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32, #blocked> loc(#loc84) + %r0_mask = arith.cmpi slt, %r0_index_11, %cst : tensor<1x2048xi32, #blocked> loc(#loc85) + %tmp0_12 = arith.addi %r0_index_11, %tmp0_7 : tensor<1x2048xi32, #blocked> loc(#loc80) + %tmp0_13 = tt.addptr %tmp0_8, %tmp0_12 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc81) + %tmp0_14 = arith.andi %r0_mask, %tmp0_9 : tensor<1x2048xi1, #blocked> loc(#loc82) + %tmp0_15 = tt.load %tmp0_13, %tmp0_14, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc86) + %tmp0_16 = arith.extf %tmp0_15 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc87) + %2 = arith.cmpi eq, %tmp3_weight_10, %c0_i32 : i32 loc(#loc14) + %3:3 = scf.if %2 -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) { + scf.yield %cst_2, %tmp0_16, %cst_5 : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc155) + } else { + %delta = arith.subf %tmp0_16, %arg7 : tensor<1x2048xf32, #blocked> loc(#loc134) + %new_weight = arith.addf %arg9, %cst_5 : tensor<1x2048xf32, #blocked> loc(#loc156) + %new_mean = arith.divf %delta, %new_weight : tensor<1x2048xf32, #blocked> loc(#loc136) + %new_mean_18 = arith.addf %arg7, %new_mean : tensor<1x2048xf32, #blocked> loc(#loc157) + %new_m2 = arith.subf %tmp0_16, %new_mean_18 : tensor<1x2048xf32, #blocked> loc(#loc138) + %new_m2_19 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32, #blocked> loc(#loc139) + %new_m2_20 = arith.addf %arg8, %new_m2_19 : tensor<1x2048xf32, #blocked> loc(#loc158) + scf.yield %new_m2_20, %new_mean_18, %new_weight : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc141) + } loc(#loc88) + %tmp3_mean = arith.select %tmp0_14, %3#1, %arg7 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc97) + %tmp3_m2 = arith.select %tmp0_14, %3#0, %arg8 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc98) + %tmp3_weight_17 = arith.select %tmp0_14, %3#2, %arg9 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc99) + scf.yield %tmp3_mean, %tmp3_m2, %tmp3_weight_17 : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc28) + } loc(#loc154) + %0:3 = "tt.reduce"(%tmp3_weight#0, %tmp3_weight#1, %tmp3_weight#2) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc30)), %arg7: f32 loc(callsite(#loc1 at #loc30)), %arg8: f32 loc(callsite(#loc1 at #loc30)), %arg9: f32 loc(callsite(#loc1 at #loc30)), %arg10: f32 loc(callsite(#loc1 at #loc30)), %arg11: f32 loc(callsite(#loc1 at #loc30))): + %delta = arith.subf %arg9, %arg6 : f32 loc(#loc142) + %new_weight = arith.addf %arg8, %arg11 : f32 loc(#loc143) + %w2_over_w = arith.cmpf oeq, %new_weight, %cst_1 : f32 loc(#loc144) + %w2_over_w_10 = arith.divf %arg11, %new_weight : f32 loc(#loc145) + %w2_over_w_11 = arith.select %w2_over_w, %cst_1, %w2_over_w_10 : f32 loc(#loc146) + %2 = arith.mulf %delta, %w2_over_w_11 : f32 loc(#loc147) + %3 = arith.addf %arg6, %2 : f32 loc(#loc148) + %4 = arith.addf %arg7, %arg10 : f32 loc(#loc149) + %5 = arith.mulf %delta, %delta : f32 loc(#loc150) + %6 = arith.mulf %5, %arg8 : f32 loc(#loc151) + %7 = arith.mulf %6, %w2_over_w_11 : f32 loc(#loc152) + %8 = arith.addf %4, %7 : f32 loc(#loc153) + tt.reduce.return %3, %8, %new_weight : f32, f32, f32 loc(#loc100) + }) : (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc100) + %tmp3 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc107) + %tmp7 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc108) + %tmp9 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc109) + %tmp23 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc110) + %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc111) + %tmp16 = arith.divf %tmp7, %cst_4 : tensor<1x1xf32, #blocked> loc(#loc112) + %tmp18 = arith.addf %tmp16, %cst_3 : tensor<1x1xf32, #blocked> loc(#loc113) + %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked>) -> tensor<1x1xf32, #blocked> loc(#loc114) + %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc115) + %1 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc52) + scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32, #blocked> loc(#loc116) + %r0_index_10 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32, #blocked> loc(#loc116) + %r0_mask = arith.cmpi slt, %r0_index_10, %cst : tensor<1x2048xi32, #blocked> loc(#loc117) + %tmp9_11 = tt.addptr %tmp9, %r0_index_10 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc109) + %tmp9_12 = tt.load %tmp9_11, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc118) + %tmp9_13 = arith.extf %tmp9_12 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc119) + %tmp12 = arith.addi %r0_index_10, %tmp0_7 : tensor<1x2048xi32, #blocked> loc(#loc120) + %tmp12_14 = tt.addptr %tmp0_8, %tmp12 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc121) + %tmp12_15 = arith.andi %r0_mask, %tmp0_9 : tensor<1x2048xi1, #blocked> loc(#loc122) + %tmp12_16 = tt.load %tmp12_14, %tmp12_15, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc123) + %tmp12_17 = arith.extf %tmp12_16 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc124) + %tmp23_18 = tt.addptr %tmp23, %r0_index_10 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc110) + %tmp23_19 = tt.load %tmp23_18, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc125) + %tmp23_20 = arith.extf %tmp23_19 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc126) + %tmp11 = arith.addf %tmp9_13, %cst_5 : tensor<1x2048xf32, #blocked> loc(#loc127) + %tmp14_21 = arith.subf %tmp12_17, %tmp14 : tensor<1x2048xf32, #blocked> loc(#loc111) + %tmp20_22 = arith.mulf %tmp14_21, %tmp20 : tensor<1x2048xf32, #blocked> loc(#loc115) + %tmp22 = arith.mulf %tmp11, %tmp20_22 : tensor<1x2048xf32, #blocked> loc(#loc128) + %tmp24 = arith.addf %tmp22, %tmp23_20 : tensor<1x2048xf32, #blocked> loc(#loc129) + %2 = tt.addptr %1, %tmp12 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc52) + %3 = arith.truncf %tmp24 : tensor<1x2048xf32, #blocked> to tensor<1x2048xbf16, #blocked> loc(#loc68) + tt.store %2, %3, %tmp12_15 : tensor<1x2048x!tt.ptr, #blocked> loc(#loc68) + } loc(#loc53) + tt.return loc(#loc69) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":25:21) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":26:37) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:46) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:41) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:34) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:61) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":32:43) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":33:31) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":34:29) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:51) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:112) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":42:62) +#loc15 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":42:51) +#loc17 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31) +#loc18 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24) +#loc19 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34) +#loc21 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39) +#loc23 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":44:62) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":45:58) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":46:66) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":46:8) +#loc29 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc35 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc37 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc38 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc39 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc40 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc41 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc42 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":48:16) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":49:16) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:34) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:35) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":63:24) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":65:24) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":67:24) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":68:32) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":69:24) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:29) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":51:43) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":52:31) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":53:29) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:41) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:94) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:42) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:35) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:62) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:52) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:114) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:42) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:95) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":61:23) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":71:24) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":72:24) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:53) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":51:4) +#loc76 = loc("xoffset"(#loc2)) +#loc77 = loc("xmask"(#loc3)) +#loc78 = loc("r0_base"(#loc4)) +#loc79 = loc("tmp0"(#loc5)) +#loc80 = loc("tmp0"(#loc6)) +#loc81 = loc("tmp0"(#loc7)) +#loc82 = loc("tmp0"(#loc8)) +#loc83 = loc("tmp3_mean"(#loc9)) +#loc84 = loc("r0_index"(#loc10)) +#loc85 = loc("r0_mask"(#loc11)) +#loc86 = loc("tmp0"(#loc12)) +#loc87 = loc("tmp0"(#loc13)) +#loc88 = loc(callsite(#loc15 at #loc16)) +#loc89 = loc("new_m2"(#loc17)) +#loc90 = loc("delta"(#loc18)) +#loc91 = loc("new_weight"(#loc19)) +#loc92 = loc("new_mean"(#loc20)) +#loc93 = loc("new_mean"(#loc21)) +#loc94 = loc("new_m2"(#loc22)) +#loc95 = loc("new_m2"(#loc23)) +#loc96 = loc("new_m2"(#loc24)) +#loc97 = loc("tmp3_mean"(#loc25)) +#loc98 = loc("tmp3_m2"(#loc26)) +#loc99 = loc("tmp3_weight"(#loc27)) +#loc100 = loc(callsite(#loc29 at #loc30)) +#loc102 = loc("delta"(#loc31)) +#loc103 = loc("new_weight"(#loc32)) +#loc104 = loc("w2_over_w"(#loc33)) +#loc105 = loc("w2_over_w"(#loc34)) +#loc106 = loc("w2_over_w"(#loc35)) +#loc107 = loc("tmp3"(#loc43)) +#loc108 = loc("tmp7"(#loc44)) +#loc109 = loc("tmp9"(#loc45)) +#loc110 = loc("tmp23"(#loc46)) +#loc111 = loc("tmp14"(#loc47)) +#loc112 = loc("tmp16"(#loc48)) +#loc113 = loc("tmp18"(#loc49)) +#loc114 = loc("tmp19"(#loc50)) +#loc115 = loc("tmp20"(#loc51)) +#loc116 = loc("r0_index"(#loc54)) +#loc117 = loc("r0_mask"(#loc55)) +#loc118 = loc("tmp9"(#loc56)) +#loc119 = loc("tmp9"(#loc57)) +#loc120 = loc("tmp12"(#loc58)) +#loc121 = loc("tmp12"(#loc59)) +#loc122 = loc("tmp12"(#loc60)) +#loc123 = loc("tmp12"(#loc61)) +#loc124 = loc("tmp12"(#loc62)) +#loc125 = loc("tmp23"(#loc63)) +#loc126 = loc("tmp23"(#loc64)) +#loc127 = loc("tmp11"(#loc65)) +#loc128 = loc("tmp22"(#loc66)) +#loc129 = loc("tmp24"(#loc67)) +#loc130 = loc(fused[#loc80, #loc79]) +#loc131 = loc(fused[#loc82, #loc77]) +#loc132 = loc("tmp3_m2"(#loc83)) +#loc133 = loc("new_m2"(#loc89)) +#loc134 = loc(callsite(#loc90 at #loc16)) +#loc135 = loc("new_weight"(#loc91)) +#loc136 = loc(callsite(#loc92 at #loc16)) +#loc137 = loc("new_mean"(#loc93)) +#loc138 = loc(callsite(#loc94 at #loc16)) +#loc139 = loc(callsite(#loc95 at #loc16)) +#loc140 = loc("new_m2"(#loc96)) +#loc141 = loc(callsite(#loc96 at #loc16)) +#loc142 = loc(callsite(#loc102 at #loc100)) +#loc143 = loc(callsite(#loc103 at #loc100)) +#loc144 = loc(callsite(#loc104 at #loc100)) +#loc145 = loc(callsite(#loc105 at #loc100)) +#loc146 = loc(callsite(#loc106 at #loc100)) +#loc147 = loc(callsite(#loc36 at #loc100)) +#loc148 = loc(callsite(#loc37 at #loc100)) +#loc149 = loc(callsite(#loc38 at #loc100)) +#loc150 = loc(callsite(#loc39 at #loc100)) +#loc151 = loc(callsite(#loc40 at #loc100)) +#loc152 = loc(callsite(#loc41 at #loc100)) +#loc153 = loc(callsite(#loc42 at #loc100)) +#loc154 = loc("tmp3_weight"(#loc132)) +#loc155 = loc(callsite(#loc133 at #loc16)) +#loc156 = loc(callsite(#loc135 at #loc16)) +#loc157 = loc(callsite(#loc137 at #loc16)) +#loc158 = loc(callsite(#loc140 at #loc16)) diff --git a/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.ttir b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..6fc41cb13aa57de73d01922dd12878396e166719 --- /dev/null +++ b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.ttir @@ -0,0 +1,269 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":47:79) +#loc72 = loc("in_ptr0"(#loc)) +#loc73 = loc("in_ptr1"(#loc)) +#loc74 = loc("in_ptr2"(#loc)) +#loc75 = loc("out_ptr2"(#loc)) +#loc76 = loc("xnumel"(#loc)) +#loc77 = loc("r0_numel"(#loc)) +#loc78 = loc(callsite(#loc1 at #loc2)) +module { + tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc78) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16> loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %cst_2 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc1) + %cst_3 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc1) + %cst_4 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc1) + %cst_5 = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc79) + %xmask = arith.cmpi slt, %xoffset, %c2048_i32 : i32 loc(#loc80) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc81) + %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc82) + %tmp3_weight:3 = scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32 iter_args(%tmp3_mean = %cst_0, %tmp3_m2 = %cst_0, %tmp3_weight_7 = %cst_0) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc84) + %r0_index_8 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32> loc(#loc84) + %r0_mask = arith.cmpi slt, %r0_index_8, %cst_5 : tensor<1x2048xi32> loc(#loc85) + %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc86) + %tmp0_9 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32> loc(#loc135) + %tmp0_10 = arith.addi %r0_index_8, %tmp0_9 : tensor<1x2048xi32> loc(#loc87) + %tmp0_11 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc88) + %tmp0_12 = tt.addptr %tmp0_11, %tmp0_10 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc88) + %tmp0_13 = tt.splat %xmask : i1 -> tensor<1x2048xi1> loc(#loc136) + %tmp0_14 = arith.andi %r0_mask, %tmp0_13 : tensor<1x2048xi1> loc(#loc89) + %tmp0_15 = tt.load %tmp0_12, %tmp0_14, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc90) + %tmp0_16 = arith.extf %tmp0_15 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc91) + %1 = arith.cmpi eq, %r0_offset, %c0_i32 : i32 loc(#loc16) + %2:3 = scf.if %1 -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) { + scf.yield %cst_0, %tmp0_16, %cst_4 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc161) + } else { + %delta = arith.subf %tmp0_16, %tmp3_mean : tensor<1x2048xf32> loc(#loc138) + %new_weight = arith.addf %tmp3_weight_7, %cst_4 : tensor<1x2048xf32> loc(#loc162) + %new_mean = arith.divf %delta, %new_weight : tensor<1x2048xf32> loc(#loc140) + %new_mean_20 = arith.addf %tmp3_mean, %new_mean : tensor<1x2048xf32> loc(#loc163) + %new_m2 = arith.subf %tmp0_16, %new_mean_20 : tensor<1x2048xf32> loc(#loc142) + %new_m2_21 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32> loc(#loc143) + %new_m2_22 = arith.addf %tmp3_m2, %new_m2_21 : tensor<1x2048xf32> loc(#loc164) + scf.yield %new_m2_22, %new_mean_20, %new_weight : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc145) + } loc(#loc92) + %tmp3_mean_17 = arith.select %tmp0_14, %2#1, %tmp3_mean : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc101) + %tmp3_m2_18 = arith.select %tmp0_14, %2#0, %tmp3_m2 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc102) + %tmp3_weight_19 = arith.select %tmp0_14, %2#2, %tmp3_weight_7 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc103) + scf.yield %tmp3_mean_17, %tmp3_m2_18, %tmp3_weight_19 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc30) + } loc(#loc160) + %0:3 = "tt.reduce"(%tmp3_weight#0, %tmp3_weight#1, %tmp3_weight#2) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc2)), %arg7: f32 loc(callsite(#loc1 at #loc2)), %arg8: f32 loc(callsite(#loc1 at #loc2)), %arg9: f32 loc(callsite(#loc1 at #loc2)), %arg10: f32 loc(callsite(#loc1 at #loc2)), %arg11: f32 loc(callsite(#loc1 at #loc2))): + %delta = arith.subf %arg9, %arg6 : f32 loc(#loc146) + %new_weight = arith.addf %arg8, %arg11 : f32 loc(#loc147) + %w2_over_w = arith.cmpf oeq, %new_weight, %cst : f32 loc(#loc148) + %w2_over_w_7 = arith.divf %arg11, %new_weight : f32 loc(#loc149) + %w2_over_w_8 = arith.select %w2_over_w, %cst, %w2_over_w_7 : f32 loc(#loc150) + %1 = arith.mulf %delta, %w2_over_w_8 : f32 loc(#loc151) + %2 = arith.addf %arg6, %1 : f32 loc(#loc152) + %3 = arith.addf %arg7, %arg10 : f32 loc(#loc153) + %4 = arith.mulf %delta, %delta : f32 loc(#loc154) + %5 = arith.mulf %4, %arg8 : f32 loc(#loc155) + %6 = arith.mulf %5, %w2_over_w_8 : f32 loc(#loc156) + %7 = arith.addf %3, %6 : f32 loc(#loc157) + tt.reduce.return %2, %7, %new_weight : f32, f32, f32 loc(#loc104) + }) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc104) + %tmp3 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc110) + %tmp7 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc111) + scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc112) + %r0_index_7 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32> loc(#loc112) + %r0_mask = arith.cmpi slt, %r0_index_7, %cst_5 : tensor<1x2048xi32> loc(#loc113) + %tmp9 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc114) + %tmp9_8 = tt.addptr %tmp9, %r0_index_7 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc114) + %tmp9_9 = tt.load %tmp9_8, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc115) + %tmp9_10 = arith.extf %tmp9_9 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc116) + %tmp12 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc117) + %tmp12_11 = tt.splat %tmp12 : i32 -> tensor<1x2048xi32> loc(#loc158) + %tmp12_12 = arith.addi %r0_index_7, %tmp12_11 : tensor<1x2048xi32> loc(#loc118) + %tmp12_13 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc119) + %tmp12_14 = tt.addptr %tmp12_13, %tmp12_12 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc119) + %tmp12_15 = tt.splat %xmask : i1 -> tensor<1x2048xi1> loc(#loc159) + %tmp12_16 = arith.andi %r0_mask, %tmp12_15 : tensor<1x2048xi1> loc(#loc120) + %tmp12_17 = tt.load %tmp12_14, %tmp12_16, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc121) + %tmp12_18 = arith.extf %tmp12_17 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc122) + %tmp23 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc123) + %tmp23_19 = tt.addptr %tmp23, %r0_index_7 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc123) + %tmp23_20 = tt.load %tmp23_19, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc124) + %tmp23_21 = arith.extf %tmp23_20 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc125) + %tmp11 = arith.addf %tmp9_10, %cst_4 : tensor<1x2048xf32> loc(#loc126) + %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc127) + %tmp14_22 = arith.subf %tmp12_18, %tmp14 : tensor<1x2048xf32> loc(#loc127) + %tmp16 = arith.divf %tmp7, %cst_3 : tensor<1x1xf32> loc(#loc128) + %tmp18 = arith.addf %tmp16, %cst_2 : tensor<1x1xf32> loc(#loc129) + %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc130) + %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc131) + %tmp20_23 = arith.mulf %tmp14_22, %tmp20 : tensor<1x2048xf32> loc(#loc131) + %tmp22 = arith.mulf %tmp11, %tmp20_23 : tensor<1x2048xf32> loc(#loc132) + %tmp24 = arith.addf %tmp22, %tmp23_21 : tensor<1x2048xf32> loc(#loc133) + %1 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc69) + %2 = tt.addptr %1, %tmp12_12 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc69) + %3 = arith.truncf %tmp24 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc70) + tt.store %2, %3, %tmp12_16 : tensor<1x2048x!tt.ptr> loc(#loc70) + } loc(#loc46) + tt.return loc(#loc71) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":25:21) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":26:27) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":32:43) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":33:31) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":34:29) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:46) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:41) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:34) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:61) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:51) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:112) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":42:62) +#loc17 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":42:51) +#loc19 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24) +#loc21 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34) +#loc23 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31) +#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":44:62) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":45:58) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":46:66) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":46:8) +#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc35 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc37 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc38 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc39 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc40 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc41 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc42 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc43 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":48:16) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":49:16) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":51:43) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":52:31) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":53:29) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:34) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:41) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:94) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:47) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:42) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:35) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:62) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:52) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:114) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:35) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:42) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:95) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":61:23) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":63:24) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":65:24) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":67:24) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":68:32) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":69:24) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":71:24) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":72:24) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:29) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:53) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":51:4) +#loc79 = loc("xoffset"(#loc3)) +#loc80 = loc("xmask"(#loc4)) +#loc81 = loc("r0_base"(#loc5)) +#loc82 = loc("r0_base"(#loc6)) +#loc83 = loc("tmp3_mean"(#loc7)) +#loc84 = loc("r0_index"(#loc8)) +#loc85 = loc("r0_mask"(#loc9)) +#loc86 = loc("tmp0"(#loc10)) +#loc87 = loc("tmp0"(#loc11)) +#loc88 = loc("tmp0"(#loc12)) +#loc89 = loc("tmp0"(#loc13)) +#loc90 = loc("tmp0"(#loc14)) +#loc91 = loc("tmp0"(#loc15)) +#loc92 = loc(callsite(#loc17 at #loc18)) +#loc93 = loc("new_m2"(#loc19)) +#loc94 = loc("delta"(#loc20)) +#loc95 = loc("new_weight"(#loc21)) +#loc96 = loc("new_mean"(#loc22)) +#loc97 = loc("new_mean"(#loc23)) +#loc98 = loc("new_m2"(#loc24)) +#loc99 = loc("new_m2"(#loc25)) +#loc100 = loc("new_m2"(#loc26)) +#loc101 = loc("tmp3_mean"(#loc27)) +#loc102 = loc("tmp3_m2"(#loc28)) +#loc103 = loc("tmp3_weight"(#loc29)) +#loc104 = loc(callsite(#loc31 at #loc2)) +#loc105 = loc("delta"(#loc32)) +#loc106 = loc("new_weight"(#loc33)) +#loc107 = loc("w2_over_w"(#loc34)) +#loc108 = loc("w2_over_w"(#loc35)) +#loc109 = loc("w2_over_w"(#loc36)) +#loc110 = loc("tmp3"(#loc44)) +#loc111 = loc("tmp7"(#loc45)) +#loc112 = loc("r0_index"(#loc47)) +#loc113 = loc("r0_mask"(#loc48)) +#loc114 = loc("tmp9"(#loc49)) +#loc115 = loc("tmp9"(#loc50)) +#loc116 = loc("tmp9"(#loc51)) +#loc117 = loc("tmp12"(#loc52)) +#loc118 = loc("tmp12"(#loc53)) +#loc119 = loc("tmp12"(#loc54)) +#loc120 = loc("tmp12"(#loc55)) +#loc121 = loc("tmp12"(#loc56)) +#loc122 = loc("tmp12"(#loc57)) +#loc123 = loc("tmp23"(#loc58)) +#loc124 = loc("tmp23"(#loc59)) +#loc125 = loc("tmp23"(#loc60)) +#loc126 = loc("tmp11"(#loc61)) +#loc127 = loc("tmp14"(#loc62)) +#loc128 = loc("tmp16"(#loc63)) +#loc129 = loc("tmp18"(#loc64)) +#loc130 = loc("tmp19"(#loc65)) +#loc131 = loc("tmp20"(#loc66)) +#loc132 = loc("tmp22"(#loc67)) +#loc133 = loc("tmp24"(#loc68)) +#loc134 = loc("tmp3_m2"(#loc83)) +#loc135 = loc(fused[#loc87, #loc86]) +#loc136 = loc(fused[#loc89, #loc80]) +#loc137 = loc("new_m2"(#loc93)) +#loc138 = loc(callsite(#loc94 at #loc18)) +#loc139 = loc("new_weight"(#loc95)) +#loc140 = loc(callsite(#loc96 at #loc18)) +#loc141 = loc("new_mean"(#loc97)) +#loc142 = loc(callsite(#loc98 at #loc18)) +#loc143 = loc(callsite(#loc99 at #loc18)) +#loc144 = loc("new_m2"(#loc100)) +#loc145 = loc(callsite(#loc100 at #loc18)) +#loc146 = loc(callsite(#loc105 at #loc104)) +#loc147 = loc(callsite(#loc106 at #loc104)) +#loc148 = loc(callsite(#loc107 at #loc104)) +#loc149 = loc(callsite(#loc108 at #loc104)) +#loc150 = loc(callsite(#loc109 at #loc104)) +#loc151 = loc(callsite(#loc37 at #loc104)) +#loc152 = loc(callsite(#loc38 at #loc104)) +#loc153 = loc(callsite(#loc39 at #loc104)) +#loc154 = loc(callsite(#loc40 at #loc104)) +#loc155 = loc(callsite(#loc41 at #loc104)) +#loc156 = loc(callsite(#loc42 at #loc104)) +#loc157 = loc(callsite(#loc43 at #loc104)) +#loc158 = loc(fused[#loc118, #loc117]) +#loc159 = loc(fused[#loc120, #loc80]) +#loc160 = loc("tmp3_weight"(#loc134)) +#loc161 = loc(callsite(#loc137 at #loc18)) +#loc162 = loc(callsite(#loc139 at #loc18)) +#loc163 = loc(callsite(#loc141 at #loc18)) +#loc164 = loc(callsite(#loc144 at #loc18)) diff --git a/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/__grp__triton_red_fused__fused_rms_norm_view_1.json b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/__grp__triton_red_fused__fused_rms_norm_view_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a31d4092dcbb71d34fd65e75e337fab7cfefb010 --- /dev/null +++ b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/__grp__triton_red_fused__fused_rms_norm_view_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__fused_rms_norm_view_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.source", "triton_red_fused__fused_rms_norm_view_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.ttir", "triton_red_fused__fused_rms_norm_view_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.ttgir", "triton_red_fused__fused_rms_norm_view_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.llir", "triton_red_fused__fused_rms_norm_view_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.ptx", "triton_red_fused__fused_rms_norm_view_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.cubin", "triton_red_fused__fused_rms_norm_view_1.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.json"}} \ No newline at end of file diff --git a/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.cubin b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..655a79dd32b3f8867b16f43ee14dfa4de28af9d4 Binary files /dev/null and b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.cubin differ diff --git a/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.json b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fc6977d8f2b5c9294ec3a7cbfd3a0141532ce619 --- /dev/null +++ b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.json @@ -0,0 +1 @@ +{"hash": "fd632643c71597c2e2b202e5443f3086a23da537fcb2398fd43a49dcb5fa652a", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm_view_1"} \ No newline at end of file diff --git a/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.llir b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..e3ea5cfdd31f0e666e32da2aec1289ee2040aea8 --- /dev/null +++ b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.llir @@ -0,0 +1,161 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__fused_rms_norm_view_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = shl i32 %7, 6, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = and i32 %9, 504, !dbg !9 + %11 = lshr exact i32 %10, 3, !dbg !9 + %12 = or disjoint i32 %11, %8, !dbg !10 + %13 = shl nuw nsw i32 %9, 3, !dbg !11 + %14 = and i32 %13, 56, !dbg !11 + %15 = sdiv i32 %12, 32, !dbg !12 + %16 = mul i32 %15, 32, !dbg !13 + %.decomposed = sub i32 %12, %16, !dbg !13 + %17 = shl nsw i32 %.decomposed, 7, !dbg !14 + %18 = mul i32 %15, 12288, !dbg !15 + %19 = or disjoint i32 %17, %14 + %20 = add i32 %19, %18 + %21 = sext i32 %20 to i64, !dbg !16 + %22 = getelementptr bfloat, ptr addrspace(1) %0, i64 %21, !dbg !16 + %23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %24 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %22, i64 %23, i1 true) #4, !dbg !17 + %25 = extractvalue { i32, i32, i32, i32 } %24, 0, !dbg !17 + %26 = bitcast i32 %25 to <2 x bfloat>, !dbg !17 + %27 = extractvalue { i32, i32, i32, i32 } %24, 1, !dbg !17 + %28 = bitcast i32 %27 to <2 x bfloat>, !dbg !17 + %29 = extractvalue { i32, i32, i32, i32 } %24, 2, !dbg !17 + %30 = bitcast i32 %29 to <2 x bfloat>, !dbg !17 + %31 = extractvalue { i32, i32, i32, i32 } %24, 3, !dbg !17 + %32 = bitcast i32 %31 to <2 x bfloat>, !dbg !17 + %33 = sext i32 %20 to i64, !dbg !16 + %34 = getelementptr bfloat, ptr addrspace(1) %0, i64 %33, !dbg !16 + %35 = getelementptr i8, ptr addrspace(1) %34, i64 128, !dbg !16 + %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %37 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %35, i64 %36, i1 true) #4, !dbg !17 + %38 = extractvalue { i32, i32, i32, i32 } %37, 0, !dbg !17 + %39 = bitcast i32 %38 to <2 x bfloat>, !dbg !17 + %40 = extractvalue { i32, i32, i32, i32 } %37, 1, !dbg !17 + %41 = bitcast i32 %40 to <2 x bfloat>, !dbg !17 + %42 = extractvalue { i32, i32, i32, i32 } %37, 2, !dbg !17 + %43 = bitcast i32 %42 to <2 x bfloat>, !dbg !17 + %44 = extractvalue { i32, i32, i32, i32 } %37, 3, !dbg !17 + %45 = bitcast i32 %44 to <2 x bfloat>, !dbg !17 + %46 = fpext <2 x bfloat> %26 to <2 x float>, !dbg !18 + %47 = fmul <2 x float> %46, %46, !dbg !19 + %48 = fpext <2 x bfloat> %39 to <2 x float>, !dbg !18 + %49 = fmul <2 x float> %48, %48, !dbg !19 + %50 = fadd <2 x float> %47, %49, !dbg !20 + %51 = fpext <2 x bfloat> %28 to <2 x float>, !dbg !18 + %52 = fmul <2 x float> %51, %51, !dbg !19 + %53 = fpext <2 x bfloat> %41 to <2 x float>, !dbg !18 + %54 = fmul <2 x float> %53, %53, !dbg !19 + %55 = fadd <2 x float> %52, %54, !dbg !20 + %56 = fpext <2 x bfloat> %30 to <2 x float>, !dbg !18 + %57 = fmul <2 x float> %56, %56, !dbg !19 + %58 = fpext <2 x bfloat> %43 to <2 x float>, !dbg !18 + %59 = fmul <2 x float> %58, %58, !dbg !19 + %60 = fadd <2 x float> %57, %59, !dbg !20 + %61 = fpext <2 x bfloat> %32 to <2 x float>, !dbg !18 + %62 = fmul <2 x float> %61, %61, !dbg !19 + %63 = fpext <2 x bfloat> %45 to <2 x float>, !dbg !18 + %64 = fmul <2 x float> %63, %63, !dbg !19 + %65 = fadd <2 x float> %62, %64, !dbg !20 + %66 = and i32 %9, 63, !dbg !9 + %67 = or disjoint i32 %8, %66, !dbg !10 + %shift = shufflevector <2 x float> %50, <2 x float> poison, <2 x i32> , !dbg !21 + %foldExtExtBinop = fadd <2 x float> %50, %shift, !dbg !21 + %foldExtExtBinop9 = fadd <2 x float> %55, %foldExtExtBinop, !dbg !21 + %shift11 = shufflevector <2 x float> %55, <2 x float> poison, <2 x i32> , !dbg !21 + %foldExtExtBinop12 = fadd <2 x float> %shift11, %foldExtExtBinop9, !dbg !21 + %foldExtExtBinop14 = fadd <2 x float> %60, %foldExtExtBinop12, !dbg !21 + %shift16 = shufflevector <2 x float> %60, <2 x float> poison, <2 x i32> , !dbg !21 + %foldExtExtBinop17 = fadd <2 x float> %shift16, %foldExtExtBinop14, !dbg !21 + %foldExtExtBinop19 = fadd <2 x float> %65, %foldExtExtBinop17, !dbg !21 + %shift21 = shufflevector <2 x float> %65, <2 x float> poison, <2 x i32> , !dbg !21 + %foldExtExtBinop22 = fadd <2 x float> %shift21, %foldExtExtBinop19, !dbg !21 + %68 = extractelement <2 x float> %foldExtExtBinop22, i64 0, !dbg !21 + %69 = bitcast float %68 to i32, !dbg !24 + %70 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %69, i32 4, i32 31), !dbg !24 + %71 = bitcast i32 %70 to float, !dbg !24 + %72 = fadd float %68, %71, !dbg !21 + %73 = bitcast float %72 to i32, !dbg !24 + %74 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %73, i32 2, i32 31), !dbg !24 + %75 = bitcast i32 %74 to float, !dbg !24 + %76 = fadd float %72, %75, !dbg !21 + %77 = bitcast float %76 to i32, !dbg !24 + %78 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %77, i32 1, i32 31), !dbg !24 + %79 = bitcast i32 %78 to float, !dbg !24 + %80 = fadd float %76, %79, !dbg !21 + %81 = lshr exact i32 %10, 1, !dbg !27 + %82 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %81, !dbg !27 + store float %80, ptr addrspace(3) %82, align 4, !dbg !27 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !27 + %83 = shl nuw nsw i32 %66, 2, !dbg !27 + %84 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %83, !dbg !27 + %85 = load i32, ptr addrspace(3) %84, align 4, !dbg !27 + %86 = sext i32 %67 to i64, !dbg !28 + %87 = getelementptr float, ptr addrspace(1) %1, i64 %86, !dbg !28 + %88 = and i32 %9, 448, !dbg !29 + %89 = icmp eq i32 %88, 0, !dbg !29 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %85, ptr addrspace(1) %87, i1 %89) #4, !dbg !29 + ret void, !dbg !30 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm_view_1", linkageName: "triton_red_fused__fused_rms_norm_view_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 19, scope: !4) +!13 = !DILocation(line: 28, column: 19, scope: !4) +!14 = !DILocation(line: 38, column: 45, scope: !4) +!15 = !DILocation(line: 38, column: 56, scope: !4) +!16 = !DILocation(line: 38, column: 34, scope: !4) +!17 = !DILocation(line: 38, column: 61, scope: !4) +!18 = !DILocation(line: 38, column: 115, scope: !4) +!19 = !DILocation(line: 40, column: 22, scope: !4) +!20 = !DILocation(line: 42, column: 23, scope: !4) +!21 = !DILocation(line: 263, column: 15, scope: !22, inlinedAt: !24) +!22 = distinct !DILexicalBlockFile(scope: !4, file: !23, discriminator: 0) +!23 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!24 = !DILocation(line: 293, column: 36, scope: !22, inlinedAt: !25) +!25 = !DILocation(line: 44, column: 25, scope: !26) +!26 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0) +!27 = !DILocation(line: 44, column: 28, scope: !4) +!28 = !DILocation(line: 45, column: 25, scope: !4) +!29 = !DILocation(line: 45, column: 36, scope: !4) +!30 = !DILocation(line: 45, column: 4, scope: !4) diff --git a/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.ptx b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..fd61ffb64cd6bdd9aba3f2c2cce15adcc77f5dc3 --- /dev/null +++ b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.ptx @@ -0,0 +1,557 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused__fused_rms_norm_view_1 // -- Begin function triton_red_fused__fused_rms_norm_view_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__fused_rms_norm_view_1 +.visible .entry triton_red_fused__fused_rms_norm_view_1( + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_1, + .param .u32 triton_red_fused__fused_rms_norm_view_1_param_2, + .param .u32 triton_red_fused__fused_rms_norm_view_1_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_5 +) +.reqntid 512 +{ + .reg .pred %p<3>; + .reg .b16 %rs<17>; + .reg .b32 %r<81>; + .reg .b64 %rd<8>; + .loc 1 18 0 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:18:0 + +// %bb.0: + ld.param.b64 %rd6, [triton_red_fused__fused_rms_norm_view_1_param_0]; + ld.param.b64 %rd7, [triton_red_fused__fused_rms_norm_view_1_param_1]; +$L__tmp0: + .loc 1 23 28 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:23:28 + mov.u32 %r11, %ctaid.x; + .loc 1 23 33 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:23:33 + shl.b32 %r12, %r11, 6; + .loc 1 24 44 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:44 + mov.u32 %r13, %tid.x; + and.b32 %r14, %r13, 504; + bfe.u32 %r15, %r13, 3, 6; + .loc 1 24 23 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:23 + or.b32 %r16, %r15, %r12; + .loc 1 26 37 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:26:37 + shl.b32 %r17, %r13, 3; + and.b32 %r18, %r17, 56; + .loc 1 29 19 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:29:19 + bfe.s32 %r19, %r11, 25, 1; + shr.u32 %r20, %r19, 27; + add.s32 %r21, %r16, %r20; + shr.u32 %r22, %r21, 5; + .loc 1 28 19 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:28:19 + and.b32 %r23, %r21, 33554400; + sub.s32 %r24, %r16, %r23; + .loc 1 38 45 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:45 + shl.b32 %r25, %r24, 7; + or.b32 %r26, %r25, %r18; + mad.lo.s32 %r27, %r22, 12288, %r26; + .loc 1 38 34 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:34 + mad.wide.s32 %rd1, %r27, 2, %rd6; + .loc 1 38 61 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:61 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r5, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r1, %r5; + mov.u32 %r2, %r5; + mov.u32 %r3, %r5; + mov.u32 %r4, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2; + // end inline asm + .loc 1 38 34 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:34 + add.s64 %rd3, %rd1, 128; + .loc 1 38 61 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:61 + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd4, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r6, %r5; + mov.u32 %r7, %r5; + mov.u32 %r8, %r5; + mov.u32 %r9, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd3 + 0 ], %rd4; + // end inline asm + .loc 1 38 115 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:115 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r28, %rs1; + cvt.f32.bf16 %r29, %rs2; + mov.b32 {%rs3, %rs4}, %r6; + cvt.f32.bf16 %r30, %rs4; + cvt.f32.bf16 %r31, %rs3; + .loc 1 40 22 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:40:22 + mul.f32 %r32, %r31, %r31; + mul.f32 %r33, %r30, %r30; + .loc 1 42 23 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:42:23 + fma.rn.f32 %r34, %r29, %r29, %r33; + fma.rn.f32 %r35, %r28, %r28, %r32; + .loc 1 38 115 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:115 + mov.b32 {%rs5, %rs6}, %r2; + cvt.f32.bf16 %r36, %rs5; + cvt.f32.bf16 %r37, %rs6; + mov.b32 {%rs7, %rs8}, %r7; + cvt.f32.bf16 %r38, %rs8; + cvt.f32.bf16 %r39, %rs7; + .loc 1 40 22 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:40:22 + mul.f32 %r40, %r39, %r39; + mul.f32 %r41, %r38, %r38; + .loc 1 42 23 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:42:23 + fma.rn.f32 %r42, %r37, %r37, %r41; + fma.rn.f32 %r43, %r36, %r36, %r40; + .loc 1 38 115 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:115 + mov.b32 {%rs9, %rs10}, %r3; + cvt.f32.bf16 %r44, %rs9; + cvt.f32.bf16 %r45, %rs10; + mov.b32 {%rs11, %rs12}, %r8; + cvt.f32.bf16 %r46, %rs12; + cvt.f32.bf16 %r47, %rs11; + .loc 1 40 22 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:40:22 + mul.f32 %r48, %r47, %r47; + mul.f32 %r49, %r46, %r46; + .loc 1 42 23 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:42:23 + fma.rn.f32 %r50, %r45, %r45, %r49; + fma.rn.f32 %r51, %r44, %r44, %r48; + .loc 1 38 115 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:115 + mov.b32 {%rs13, %rs14}, %r4; + cvt.f32.bf16 %r52, %rs13; + cvt.f32.bf16 %r53, %rs14; + mov.b32 {%rs15, %rs16}, %r9; + cvt.f32.bf16 %r54, %rs16; + cvt.f32.bf16 %r55, %rs15; + .loc 1 40 22 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:40:22 + mul.f32 %r56, %r55, %r55; + mul.f32 %r57, %r54, %r54; + .loc 1 42 23 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:42:23 + fma.rn.f32 %r58, %r53, %r53, %r57; + fma.rn.f32 %r59, %r52, %r52, %r56; + .loc 1 24 44 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:44 + and.b32 %r60, %r13, 63; + .loc 1 24 23 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:23 + or.b32 %r61, %r12, %r60; +$L__tmp1: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + add.f32 %r62, %r35, %r34; + add.f32 %r63, %r43, %r62; + add.f32 %r64, %r42, %r63; + add.f32 %r65, %r51, %r64; + add.f32 %r66, %r50, %r65; + add.f32 %r67, %r59, %r66; + add.f32 %r68, %r58, %r67; +$L__tmp2: + .loc 2 293 36 // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] + shfl.sync.bfly.b32 %r69, %r68, 4, 31, -1; +$L__tmp3: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + add.f32 %r70, %r68, %r69; +$L__tmp4: + .loc 2 293 36 // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] + shfl.sync.bfly.b32 %r71, %r70, 2, 31, -1; +$L__tmp5: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + add.f32 %r72, %r70, %r71; +$L__tmp6: + .loc 2 293 36 // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] + shfl.sync.bfly.b32 %r73, %r72, 1, 31, -1; +$L__tmp7: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + add.f32 %r74, %r72, %r73; +$L__tmp8: + .loc 1 44 28 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:28 + shr.u32 %r75, %r14, 1; + mov.b32 %r76, global_smem; + add.s32 %r77, %r76, %r75; + st.shared.b32 [%r77], %r74; + bar.sync 0; + shl.b32 %r78, %r60, 2; + add.s32 %r79, %r76, %r78; + ld.shared.b32 %r10, [%r79]; + .loc 1 45 25 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:25 + mad.wide.s32 %rd5, %r61, 4, %rd7; + .loc 1 45 36 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:36 + and.b32 %r80, %r13, 448; + setp.eq.b32 %p2, %r80, 0; + // begin inline asm + @%p2 st.global.b32 [ %rd5 + 0 ], { %r10 }; + // end inline asm + .loc 1 45 4 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:4 + ret; +$L__tmp9: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 339 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x14c DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 113 +.b8 105 +.b8 116 +.b8 120 +.b8 53 +.b8 104 +.b8 119 +.b8 117 +.b8 112 +.b8 107 +.b8 98 +.b8 106 +.b8 109 +.b8 99 +.b8 115 +.b8 111 +.b8 121 +.b8 107 +.b8 113 +.b8 101 +.b8 112 +.b8 122 +.b8 113 +.b8 99 +.b8 55 +.b8 122 +.b8 99 +.b8 120 +.b8 106 +.b8 99 +.b8 98 +.b8 53 +.b8 97 +.b8 99 +.b8 113 +.b8 107 +.b8 105 +.b8 55 +.b8 122 +.b8 99 +.b8 115 +.b8 106 +.b8 105 +.b8 102 +.b8 114 +.b8 110 +.b8 114 +.b8 122 +.b8 99 +.b8 114 +.b8 114 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 113 +.b8 105 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x2a DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 114 +.b8 109 +.b8 115 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x10e:0x48 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x123:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 44 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x13b:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.source b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.source new file mode 100644 index 0000000000000000000000000000000000000000..529a4488127bba20b5fc66767666a84ea578ac2b --- /dev/null +++ b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.source @@ -0,0 +1,167 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0) +#loc35 = loc(unknown) +#loc38 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0) +#loc42 = loc("in_ptr0"(#loc)) +#loc43 = loc("out_ptr0"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc74 = loc("input"(#loc33)) +#loc75 = loc("a"(#loc38)) +#loc76 = loc("b"(#loc38)) +module { + tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 65536 : i32 loc(#loc46) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc47) + %xoffset = tt.get_program_id x : i32 loc(#loc48) + %xoffset_2 = arith.constant 64 : i32 loc(#loc49) + %xoffset_3 = arith.constant 64 : i32 loc(#loc49) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc49) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc50) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc51) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc52) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc52) + %xmask = arith.constant true loc(#loc53) + %xmask_8 = arith.constant dense : tensor<64x64xi1> loc(#loc53) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc54) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc55) + %x0 = arith.constant 32 : i32 loc(#loc56) + %x0_10 = arith.constant 32 : i32 loc(#loc56) + %x0_11 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc56) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc56) + %x1 = arith.constant 32 : i32 loc(#loc57) + %x1_13 = arith.constant 32 : i32 loc(#loc57) + %x1_14 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc57) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc57) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc58) + %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc58) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c64_i32 = arith.constant 64 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c64_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_19 = %_tmp4_16) -> (tensor<64x64xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc60) + %r0_index_20 = arith.addi %r0_index, %r0_base_9 : tensor<1x64xi32> loc(#loc60) + %r0_mask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc61) + %r0_mask_21 = arith.cmpi slt, %r0_index_20, %r0_mask : tensor<1x64xi32> loc(#loc61) + %tmp0 = arith.constant 128 : i32 loc(#loc62) + %tmp0_22 = arith.constant 128 : i32 loc(#loc62) + %tmp0_23 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc62) + %tmp0_24 = arith.muli %tmp0_23, %x0_12 : tensor<64x1xi32> loc(#loc62) + %tmp0_25 = tt.broadcast %r0_index_20 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc63) + %tmp0_26 = tt.broadcast %tmp0_24 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc63) + %tmp0_27 = arith.addi %tmp0_25, %tmp0_26 : tensor<64x64xi32> loc(#loc63) + %tmp0_28 = arith.constant 12288 : i32 loc(#loc64) + %tmp0_29 = arith.constant 12288 : i32 loc(#loc64) + %tmp0_30 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc64) + %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<64x1xi32> loc(#loc64) + %tmp0_32 = tt.broadcast %tmp0_31 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc65) + %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<64x64xi32> loc(#loc65) + %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc66) + %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc66) + %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc67) + %tmp0_37 = tt.broadcast %r0_mask_21 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc67) + %tmp0_38 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc67) + %tmp0_39 = arith.truncf %tmp0_38 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc67) + %tmp0_40 = tt.load %tmp0_35, %tmp0_37, %tmp0_39 evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc67) + %tmp0_41 = arith.extf %tmp0_40 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc68) + %tmp2 = arith.mulf %tmp0_41, %tmp0_41 : tensor<64x64xf32> loc(#loc69) + %tmp5 = arith.addf %_tmp4_19, %tmp2 : tensor<64x64xf32> loc(#loc70) + %_tmp4_42 = tt.broadcast %r0_mask_21 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc71) + %_tmp4_43 = arith.select %_tmp4_42, %tmp5, %_tmp4_19 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc71) + scf.yield %_tmp4_43 : tensor<64x64xf32> loc(#loc27) + } loc(#loc59) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc72) + %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc73) + %4 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc30) + %5 = tt.addptr %4, %xindex_7 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc30) + tt.store %5, %tmp4_18 : tensor<64x1x!tt.ptr> loc(#loc31) + tt.return loc(#loc32) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S64_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x64xf32> loc("input"(#loc33))) -> tensor<64xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc34) + tt.reduce.return %2 : f32 loc(#loc34) + }) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc34) + tt.return %0 : tensor<64xf32> loc(#loc36) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64xf32> loc(#loc37) + tt.return %1 : tensor<64xf32> loc(#loc37) + } loc(#loc33) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc38)), %b: f32 loc("b"(#loc38))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc39) + tt.return %0 : f32 loc(#loc40) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc41) + tt.return %1 : f32 loc(#loc41) + } loc(#loc38) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":25:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":30:43) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:8) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11) +#loc37 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4) +#loc39 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc40 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11) +#loc41 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4) +#loc46 = loc("xnumel"(#loc1)) +#loc47 = loc("r0_numel"(#loc2)) +#loc48 = loc("xoffset"(#loc3)) +#loc49 = loc("xoffset"(#loc4)) +#loc50 = loc("xindex"(#loc5)) +#loc51 = loc("xindex"(#loc6)) +#loc52 = loc("xindex"(#loc7)) +#loc53 = loc("xmask"(#loc8)) +#loc54 = loc("r0_base"(#loc9)) +#loc55 = loc("r0_base"(#loc10)) +#loc56 = loc("x0"(#loc11)) +#loc57 = loc("x1"(#loc12)) +#loc58 = loc("_tmp4"(#loc13)) +#loc59 = loc("_tmp4"(#loc14)) +#loc60 = loc("r0_index"(#loc15)) +#loc61 = loc("r0_mask"(#loc16)) +#loc62 = loc("tmp0"(#loc17)) +#loc63 = loc("tmp0"(#loc18)) +#loc64 = loc("tmp0"(#loc19)) +#loc65 = loc("tmp0"(#loc20)) +#loc66 = loc("tmp0"(#loc21)) +#loc67 = loc("tmp0"(#loc22)) +#loc68 = loc("tmp0"(#loc23)) +#loc69 = loc("tmp2"(#loc24)) +#loc70 = loc("tmp5"(#loc25)) +#loc71 = loc("_tmp4"(#loc26)) +#loc72 = loc("tmp4"(#loc28)) +#loc73 = loc("tmp4"(#loc29)) diff --git a/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.ttgir b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..36102b4bece7ffab66468907d370b1e032cf03c9 --- /dev/null +++ b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.ttgir @@ -0,0 +1,120 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [16, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 8], order = [0, 1]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0) +#loc1 = loc(unknown) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25) +#loc30 = loc("in_ptr0"(#loc)) +#loc31 = loc("out_ptr0"(#loc)) +#loc32 = loc("xnumel"(#loc)) +#loc33 = loc("r0_numel"(#loc)) +#loc54 = loc("tmp4"(#loc24)) +#loc57 = loc(callsite(#loc1 at #loc54)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<12288> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x64xbf16, #blocked> loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc34) + %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc35) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc36) + %xindex_6 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc36) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc36) + %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc36) + %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked> loc(#loc37) + %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc37) + %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<64x1xi32, #blocked> loc(#loc37) + %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<64x1xi32, #blocked1> loc(#loc37) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc38) + %r0_base_13 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> loc(#loc38) + %x0 = arith.remsi %xindex_11, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc39) + %x1 = arith.divsi %xindex_11, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc40) + %tmp0 = arith.muli %x0, %cst_0 : tensor<64x1xi32, #blocked> loc(#loc41) + %tmp0_14 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc42) + %tmp0_15 = arith.muli %x1, %cst_1 : tensor<64x1xi32, #blocked> loc(#loc43) + %tmp0_16 = tt.broadcast %tmp0_15 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc44) + %tmp0_17 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked> loc(#loc45) + %_tmp4 = scf.for %_tmp4_20 = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%arg5 = %cst_4) -> (tensor<64x64xf32, #blocked>) : i32 { + %r0_index = tt.splat %_tmp4_20 : i32 -> tensor<1x64xi32, #blocked> loc(#loc47) + %r0_index_21 = arith.addi %r0_index, %r0_base_13 : tensor<1x64xi32, #blocked> loc(#loc47) + %r0_mask = arith.cmpi slt, %r0_index_21, %cst : tensor<1x64xi32, #blocked> loc(#loc48) + %tmp0_22 = tt.broadcast %r0_index_21 : tensor<1x64xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc42) + %tmp0_23 = arith.addi %tmp0_22, %tmp0_14 : tensor<64x64xi32, #blocked> loc(#loc42) + %tmp0_24 = arith.addi %tmp0_23, %tmp0_16 : tensor<64x64xi32, #blocked> loc(#loc44) + %tmp0_25 = tt.addptr %tmp0_17, %tmp0_24 : tensor<64x64x!tt.ptr, #blocked>, tensor<64x64xi32, #blocked> loc(#loc45) + %tmp0_26 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc49) + %tmp0_27 = tt.load %tmp0_25, %tmp0_26, %cst_3 evictionPolicy = evict_first : tensor<64x64x!tt.ptr, #blocked> loc(#loc49) + %tmp0_28 = arith.extf %tmp0_27 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc50) + %tmp2 = arith.mulf %tmp0_28, %tmp0_28 : tensor<64x64xf32, #blocked> loc(#loc51) + %tmp5 = arith.addf %arg5, %tmp2 : tensor<64x64xf32, #blocked> loc(#loc52) + %_tmp4_29 = arith.select %tmp0_26, %tmp5, %arg5 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked> loc(#loc53) + scf.yield %_tmp4_29 : tensor<64x64xf32, #blocked> loc(#loc22) + } loc(#loc46) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_20: f32 loc(callsite(#loc1 at #loc54)), %tmp4_21: f32 loc(callsite(#loc1 at #loc54))): + %tmp4_22 = arith.addf %tmp4_20, %tmp4_21 : f32 loc(#loc58) + tt.reduce.return %tmp4_22 : f32 loc(#loc56) + }) : (tensor<64x64xf32, #blocked>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc56) + %tmp4_18 = ttg.convert_layout %tmp4 : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc55) + %tmp4_19 = tt.expand_dims %tmp4_18 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc55) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked1> loc(#loc27) + %1 = tt.addptr %0, %xindex_12 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc27) + tt.store %1, %tmp4_19 : tensor<64x1x!tt.ptr, #blocked1> loc(#loc28) + tt.return loc(#loc29) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:8) +#loc23 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4) +#loc34 = loc("xoffset"(#loc2)) +#loc35 = loc("xoffset"(#loc3)) +#loc36 = loc("xindex"(#loc4)) +#loc37 = loc("xindex"(#loc5)) +#loc38 = loc("r0_base"(#loc6)) +#loc39 = loc("x0"(#loc7)) +#loc40 = loc("x1"(#loc8)) +#loc41 = loc("tmp0"(#loc9)) +#loc42 = loc("tmp0"(#loc10)) +#loc43 = loc("tmp0"(#loc11)) +#loc44 = loc("tmp0"(#loc12)) +#loc45 = loc("tmp0"(#loc13)) +#loc46 = loc("_tmp4"(#loc14)) +#loc47 = loc("r0_index"(#loc15)) +#loc48 = loc("r0_mask"(#loc16)) +#loc49 = loc("tmp0"(#loc17)) +#loc50 = loc("tmp0"(#loc18)) +#loc51 = loc("tmp2"(#loc19)) +#loc52 = loc("tmp5"(#loc20)) +#loc53 = loc("_tmp4"(#loc21)) +#loc55 = loc("tmp4"(#loc26)) +#loc56 = loc(callsite(#loc23 at #loc54)) +#loc58 = loc(callsite(#loc25 at #loc56)) diff --git a/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.ttir b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..d64e870d3c119a1a0afa92abf6aa75b33eb07d99 --- /dev/null +++ b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.ttir @@ -0,0 +1,114 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0) +#loc1 = loc(unknown) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25) +#loc31 = loc("in_ptr0"(#loc)) +#loc32 = loc("out_ptr0"(#loc)) +#loc33 = loc("xnumel"(#loc)) +#loc34 = loc("r0_numel"(#loc)) +#loc56 = loc("tmp4"(#loc25)) +#loc59 = loc(callsite(#loc1 at #loc56)) +module { + tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<64x64xbf16> loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %cst_0 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc1) + %cst_1 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<1x64xi32> loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc1) + %cst_4 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc35) + %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc36) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc37) + %xindex_6 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc38) + %xindex_7 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32> loc(#loc39) + %xindex_8 = arith.addi %xindex_7, %xindex_6 : tensor<64x1xi32> loc(#loc39) + %r0_base = tt.expand_dims %xindex {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc40) + %x0 = arith.remsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc41) + %x1 = arith.divsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc42) + %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%_tmp4_10 = %cst_3) -> (tensor<64x64xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc44) + %r0_index_11 = arith.addi %r0_index, %r0_base : tensor<1x64xi32> loc(#loc44) + %r0_mask = arith.cmpi slt, %r0_index_11, %cst_2 : tensor<1x64xi32> loc(#loc45) + %tmp0 = arith.muli %x0, %cst_1 : tensor<64x1xi32> loc(#loc46) + %tmp0_12 = tt.broadcast %r0_index_11 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc47) + %tmp0_13 = tt.broadcast %tmp0 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc47) + %tmp0_14 = arith.addi %tmp0_12, %tmp0_13 : tensor<64x64xi32> loc(#loc47) + %tmp0_15 = arith.muli %x1, %cst_0 : tensor<64x1xi32> loc(#loc48) + %tmp0_16 = tt.broadcast %tmp0_15 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc49) + %tmp0_17 = arith.addi %tmp0_14, %tmp0_16 : tensor<64x64xi32> loc(#loc49) + %tmp0_18 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc50) + %tmp0_19 = tt.addptr %tmp0_18, %tmp0_17 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc50) + %tmp0_20 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc51) + %tmp0_21 = tt.load %tmp0_19, %tmp0_20, %cst evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc51) + %tmp0_22 = arith.extf %tmp0_21 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc52) + %tmp2 = arith.mulf %tmp0_22, %tmp0_22 : tensor<64x64xf32> loc(#loc53) + %tmp5 = arith.addf %_tmp4_10, %tmp2 : tensor<64x64xf32> loc(#loc54) + %_tmp4_23 = arith.select %tmp0_20, %tmp5, %_tmp4_10 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc55) + scf.yield %_tmp4_23 : tensor<64x64xf32> loc(#loc23) + } loc(#loc43) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_10: f32 loc(callsite(#loc1 at #loc56)), %tmp4_11: f32 loc(callsite(#loc1 at #loc56))): + %tmp4_12 = arith.addf %tmp4_10, %tmp4_11 : f32 loc(#loc60) + tt.reduce.return %tmp4_12 : f32 loc(#loc58) + }) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc58) + %tmp4_9 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc57) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc28) + %1 = tt.addptr %0, %xindex_8 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc28) + tt.store %1, %tmp4_9 : tensor<64x1x!tt.ptr> loc(#loc29) + tt.return loc(#loc30) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":32:43) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":33:31) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:8) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc26 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4) +#loc35 = loc("xoffset"(#loc3)) +#loc36 = loc("xoffset"(#loc4)) +#loc37 = loc("xindex"(#loc5)) +#loc38 = loc("xindex"(#loc6)) +#loc39 = loc("xindex"(#loc7)) +#loc40 = loc("r0_base"(#loc8)) +#loc41 = loc("x0"(#loc9)) +#loc42 = loc("x1"(#loc10)) +#loc43 = loc("_tmp4"(#loc2)) +#loc44 = loc("r0_index"(#loc11)) +#loc45 = loc("r0_mask"(#loc12)) +#loc46 = loc("tmp0"(#loc13)) +#loc47 = loc("tmp0"(#loc14)) +#loc48 = loc("tmp0"(#loc15)) +#loc49 = loc("tmp0"(#loc16)) +#loc50 = loc("tmp0"(#loc17)) +#loc51 = loc("tmp0"(#loc18)) +#loc52 = loc("tmp0"(#loc19)) +#loc53 = loc("tmp2"(#loc20)) +#loc54 = loc("tmp5"(#loc21)) +#loc55 = loc("_tmp4"(#loc22)) +#loc57 = loc("tmp4"(#loc27)) +#loc58 = loc(callsite(#loc24 at #loc56)) +#loc60 = loc(callsite(#loc26 at #loc58)) diff --git a/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/__grp__triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/__grp__triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0c0622ad43d9b5c2f62e6ae3993c7449a33e3595 --- /dev/null +++ b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/__grp__triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.source", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttir", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttgir", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.llir", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ptx", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.cubin", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json"}} \ No newline at end of file diff --git a/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.cubin b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..da34a1a49cdf21cf19a41dff638a12c20a832309 Binary files /dev/null and b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.cubin differ diff --git a/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0238f8479df5f7e880d3fdcc2fa1d012ee7880bd --- /dev/null +++ b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json @@ -0,0 +1 @@ +{"hash": "fe4407d29e040f0e8efc23458aee0cbcf1535e9850d07966b0e7421f96d30e0d", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0"} \ No newline at end of file diff --git a/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.llir b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..8ef87e7332aadae0020d3ec548f4433f63c315ee --- /dev/null +++ b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.llir @@ -0,0 +1,71 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %7 = shl i32 %6, 10, !dbg !8 + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %9 = shl nuw nsw i32 %8, 3, !dbg !9 + %10 = and i32 %9, 1016, !dbg !9 + %11 = or disjoint i32 %10, %7, !dbg !10 + %12 = sdiv i32 %11, 128, !dbg !11 + %13 = mul i32 %12, 128, !dbg !12 + %.decomposed = sub i32 %11, %13, !dbg !12 + %14 = srem i32 %12, 2304, !dbg !13 + %15 = sdiv i32 %11, 294912, !dbg !14 + %16 = shl nsw i32 %15, 7, !dbg !15 + %17 = add nsw i32 %16, %.decomposed, !dbg !16 + %18 = shl nsw i32 %14, 12, !dbg !17 + %19 = add nsw i32 %17, %18, !dbg !18 + %20 = sext i32 %19 to i64, !dbg !19 + %21 = getelementptr bfloat, ptr addrspace(1) %0, i64 %20, !dbg !19 + %22 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %21) #2, !dbg !20 + %23 = extractvalue { i32, i32, i32, i32 } %22, 0, !dbg !20 + %24 = extractvalue { i32, i32, i32, i32 } %22, 1, !dbg !20 + %25 = extractvalue { i32, i32, i32, i32 } %22, 2, !dbg !20 + %26 = extractvalue { i32, i32, i32, i32 } %22, 3, !dbg !20 + %27 = sext i32 %11 to i64, !dbg !21 + %28 = getelementptr bfloat, ptr addrspace(1) %1, i64 %27, !dbg !21 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %23, i32 %24, i32 %25, i32 %26, ptr addrspace(1) %28) #2, !dbg !22 + ret void, !dbg !23 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0", linkageName: "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 21, scope: !4) +!12 = !DILocation(line: 23, column: 19, scope: !4) +!13 = !DILocation(line: 24, column: 28, scope: !4) +!14 = !DILocation(line: 25, column: 19, scope: !4) +!15 = !DILocation(line: 27, column: 39, scope: !4) +!16 = !DILocation(line: 27, column: 35, scope: !4) +!17 = !DILocation(line: 27, column: 49, scope: !4) +!18 = !DILocation(line: 27, column: 44, scope: !4) +!19 = !DILocation(line: 27, column: 30, scope: !4) +!20 = !DILocation(line: 27, column: 54, scope: !4) +!21 = !DILocation(line: 28, column: 25, scope: !4) +!22 = !DILocation(line: 28, column: 36, scope: !4) +!23 = !DILocation(line: 28, column: 4, scope: !4) diff --git a/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ptx b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..2d5ab26549fe9d8870cfeed0aedfd9d6465e2dc3 --- /dev/null +++ b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ptx @@ -0,0 +1,332 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0 // -- Begin function triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0 + // @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0 +.visible .entry triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0( + .param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_1, + .param .u32 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_4 +) +.reqntid 128 +{ + .reg .b32 %r<31>; + .reg .b64 %rd<5>; + .loc 1 18 0 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:18:0 +$L__func_begin0: + .loc 1 18 0 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:18:0 + +// %bb.0: + ld.param.b64 %rd3, [triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_0]; + ld.param.b64 %rd4, [triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_1]; +$L__tmp0: + .loc 1 20 28 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:20:28 + mov.u32 %r5, %ctaid.x; + .loc 1 20 33 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:20:33 + shl.b32 %r6, %r5, 10; + .loc 1 21 36 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:21:36 + mov.u32 %r7, %tid.x; + shl.b32 %r8, %r7, 3; + and.b32 %r9, %r8, 1016; + .loc 1 21 23 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:21:23 + or.b32 %r10, %r9, %r6; + .loc 1 24 21 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:24:21 + bfe.s32 %r11, %r5, 21, 1; + shr.u32 %r12, %r11, 25; + add.s32 %r13, %r10, %r12; + shr.s32 %r14, %r13, 7; + .loc 1 23 19 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:23:19 + and.b32 %r15, %r13, -128; + sub.s32 %r16, %r10, %r15; + .loc 1 24 28 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:24:28 + mul.hi.s32 %r17, %r14, 954437177; + shr.u32 %r18, %r17, 31; + shr.u32 %r19, %r17, 9; + add.s32 %r20, %r19, %r18; + mul.lo.s32 %r21, %r20, 2304; + sub.s32 %r22, %r14, %r21; + .loc 1 25 19 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:25:19 + mul.hi.s32 %r23, %r10, 954437177; + shr.u32 %r24, %r23, 31; + shr.s32 %r25, %r23, 16; + add.s32 %r26, %r25, %r24; + .loc 1 27 39 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:27:39 + shl.b32 %r27, %r26, 7; + .loc 1 27 35 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:27:35 + add.s32 %r28, %r27, %r16; + .loc 1 27 49 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:27:49 + shl.b32 %r29, %r22, 12; + .loc 1 27 44 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:27:44 + add.s32 %r30, %r28, %r29; + .loc 1 27 30 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:27:30 + mad.wide.s32 %rd1, %r30, 2, %rd3; + .loc 1 27 54 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:27:54 + // begin inline asm + mov.u32 %r1, 0x0; + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + ld.global.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ]; + // end inline asm + .loc 1 28 25 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:28:25 + mad.wide.s32 %rd2, %r10, 2, %rd4; + .loc 1 28 36 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:28:36 + // begin inline asm + st.global.v4.b32 [ %rd2 + 0 ], { %r1, %r2, %r3, %r4 }; + // end inline asm + .loc 1 28 4 // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:28:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 101 +.b8 105 +.b8 112 +.b8 106 +.b8 120 +.b8 97 +.b8 117 +.b8 115 +.b8 97 +.b8 117 +.b8 122 +.b8 108 +.b8 52 +.b8 109 +.b8 99 +.b8 99 +.b8 50 +.b8 51 +.b8 51 +.b8 102 +.b8 112 +.b8 101 +.b8 117 +.b8 98 +.b8 102 +.b8 115 +.b8 51 +.b8 117 +.b8 107 +.b8 53 +.b8 110 +.b8 105 +.b8 53 +.b8 98 +.b8 106 +.b8 113 +.b8 98 +.b8 108 +.b8 50 +.b8 113 +.b8 119 +.b8 116 +.b8 111 +.b8 119 +.b8 106 +.b8 119 +.b8 114 +.b8 108 +.b8 55 +.b8 99 +.b8 100 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 101 +.b8 105 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.source b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.source new file mode 100644 index 0000000000000000000000000000000000000000..1f405f12ca51bc6e06093fd082ab80276f47b60b --- /dev/null +++ b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.source @@ -0,0 +1,90 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":18:0) +#loc21 = loc("in_ptr0"(#loc)) +#loc22 = loc("out_ptr0"(#loc)) +#loc23 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 9437184 : i32 loc(#loc24) + %xoffset = tt.get_program_id x : i32 loc(#loc25) + %xoffset_1 = arith.constant 1024 : i32 loc(#loc26) + %xoffset_2 = arith.constant 1024 : i32 loc(#loc26) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc26) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc27) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc28) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc28) + %xmask = arith.constant true loc(#loc29) + %xmask_6 = arith.constant dense : tensor<1024xi1> loc(#loc29) + %x0 = arith.constant 128 : i32 loc(#loc30) + %x0_7 = arith.constant 128 : i32 loc(#loc30) + %x0_8 = arith.constant dense<128> : tensor<1024xi32> loc(#loc30) + %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<1024xi32> loc(#loc30) + %x1 = arith.constant 128 : i32 loc(#loc31) + %x1_10 = arith.constant 128 : i32 loc(#loc31) + %x1_11 = arith.constant dense<128> : tensor<1024xi32> loc(#loc31) + %x1_12 = arith.divsi %xindex_5, %x1_11 : tensor<1024xi32> loc(#loc31) + %x1_13 = arith.constant 2304 : i32 loc(#loc32) + %x1_14 = arith.constant 2304 : i32 loc(#loc32) + %x1_15 = arith.constant dense<2304> : tensor<1024xi32> loc(#loc32) + %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<1024xi32> loc(#loc32) + %x2 = arith.constant 294912 : i32 loc(#loc33) + %x2_17 = arith.constant 294912 : i32 loc(#loc33) + %x2_18 = arith.constant dense<294912> : tensor<1024xi32> loc(#loc33) + %x2_19 = arith.divsi %xindex_5, %x2_18 : tensor<1024xi32> loc(#loc33) + %tmp0 = arith.constant 128 : i32 loc(#loc34) + %tmp0_20 = arith.constant 128 : i32 loc(#loc34) + %tmp0_21 = arith.constant dense<128> : tensor<1024xi32> loc(#loc34) + %tmp0_22 = arith.muli %tmp0_21, %x2_19 : tensor<1024xi32> loc(#loc34) + %tmp0_23 = arith.addi %x0_9, %tmp0_22 : tensor<1024xi32> loc(#loc35) + %tmp0_24 = arith.constant 4096 : i32 loc(#loc36) + %tmp0_25 = arith.constant 4096 : i32 loc(#loc36) + %tmp0_26 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc36) + %tmp0_27 = arith.muli %tmp0_26, %x1_16 : tensor<1024xi32> loc(#loc36) + %tmp0_28 = arith.addi %tmp0_23, %tmp0_27 : tensor<1024xi32> loc(#loc37) + %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc38) + %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc38) + %tmp0_31 = tt.load %tmp0_30 : tensor<1024x!tt.ptr> loc(#loc39) + %tmp0_32 = arith.extf %tmp0_31 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc40) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc18) + %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc18) + %2 = arith.truncf %tmp0_32 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc19) + tt.store %1, %2 : tensor<1024x!tt.ptr> loc(#loc19) + tt.return loc(#loc20) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":22:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":23:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":24:21) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":24:28) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":25:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:39) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:35) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:49) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:44) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:30) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:54) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:63) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:25) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:36) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:4) +#loc24 = loc("xnumel"(#loc1)) +#loc25 = loc("xoffset"(#loc2)) +#loc26 = loc("xoffset"(#loc3)) +#loc27 = loc("xindex"(#loc4)) +#loc28 = loc("xindex"(#loc5)) +#loc29 = loc("xmask"(#loc6)) +#loc30 = loc("x0"(#loc7)) +#loc31 = loc("x1"(#loc8)) +#loc32 = loc("x1"(#loc9)) +#loc33 = loc("x2"(#loc10)) +#loc34 = loc("tmp0"(#loc11)) +#loc35 = loc("tmp0"(#loc12)) +#loc36 = loc("tmp0"(#loc13)) +#loc37 = loc("tmp0"(#loc14)) +#loc38 = loc("tmp0"(#loc15)) +#loc39 = loc("tmp0"(#loc16)) +#loc40 = loc("tmp0"(#loc17)) diff --git a/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttgir b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..7031b76872929bb56852e7f1e9c3b24ec9ea06a7 --- /dev/null +++ b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttgir @@ -0,0 +1,66 @@ +#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":18:0) +#loc19 = loc("in_ptr0"(#loc)) +#loc20 = loc("out_ptr0"(#loc)) +#loc21 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1024xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<2304> : tensor<1024xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<294912> : tensor<1024xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<4096> : tensor<1024xi32, #blocked> loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc22) + %xoffset_3 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc23) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc24) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32, #blocked> loc(#loc25) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32, #blocked> loc(#loc25) + %x0 = arith.remsi %xindex_5, %cst : tensor<1024xi32, #blocked> loc(#loc26) + %x1 = arith.divsi %xindex_5, %cst : tensor<1024xi32, #blocked> loc(#loc27) + %x1_6 = arith.remsi %x1, %cst_0 : tensor<1024xi32, #blocked> loc(#loc28) + %x2 = arith.divsi %xindex_5, %cst_1 : tensor<1024xi32, #blocked> loc(#loc29) + %tmp0 = arith.muli %x2, %cst : tensor<1024xi32, #blocked> loc(#loc30) + %tmp0_7 = arith.addi %x0, %tmp0 : tensor<1024xi32, #blocked> loc(#loc31) + %tmp0_8 = arith.muli %x1_6, %cst_2 : tensor<1024xi32, #blocked> loc(#loc32) + %tmp0_9 = arith.addi %tmp0_7, %tmp0_8 : tensor<1024xi32, #blocked> loc(#loc33) + %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc34) + %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc34) + %tmp0_12 = tt.load %tmp0_11 : tensor<1024x!tt.ptr, #blocked> loc(#loc35) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc16) + %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc16) + tt.store %1, %tmp0_12 : tensor<1024x!tt.ptr, #blocked> loc(#loc17) + tt.return loc(#loc18) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":23:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":24:21) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":24:28) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":25:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:39) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:49) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:44) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:54) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:25) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:36) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:4) +#loc22 = loc("xoffset"(#loc2)) +#loc23 = loc("xoffset"(#loc3)) +#loc24 = loc("xindex"(#loc4)) +#loc25 = loc("xindex"(#loc5)) +#loc26 = loc("x0"(#loc6)) +#loc27 = loc("x1"(#loc7)) +#loc28 = loc("x1"(#loc8)) +#loc29 = loc("x2"(#loc9)) +#loc30 = loc("tmp0"(#loc10)) +#loc31 = loc("tmp0"(#loc11)) +#loc32 = loc("tmp0"(#loc12)) +#loc33 = loc("tmp0"(#loc13)) +#loc34 = loc("tmp0"(#loc14)) +#loc35 = loc("tmp0"(#loc15)) diff --git a/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttir b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..4fe97caa03918e88f85e6f7e93683d6486703a3b --- /dev/null +++ b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttir @@ -0,0 +1,65 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":18:0) +#loc19 = loc("in_ptr0"(#loc)) +#loc20 = loc("out_ptr0"(#loc)) +#loc21 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %tmp0 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc22) + %x2 = arith.constant dense<294912> : tensor<1024xi32> loc(#loc23) + %x1 = arith.constant dense<2304> : tensor<1024xi32> loc(#loc24) + %cst = arith.constant dense<128> : tensor<1024xi32> loc(#loc4) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc4) + %xoffset = tt.get_program_id x : i32 loc(#loc25) + %xoffset_0 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc26) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc27) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<1024xi32> loc(#loc28) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<1024xi32> loc(#loc28) + %x0 = arith.remsi %xindex_2, %cst : tensor<1024xi32> loc(#loc29) + %x1_3 = arith.divsi %xindex_2, %cst : tensor<1024xi32> loc(#loc30) + %x1_4 = arith.remsi %x1_3, %x1 : tensor<1024xi32> loc(#loc24) + %x2_5 = arith.divsi %xindex_2, %x2 : tensor<1024xi32> loc(#loc23) + %tmp0_6 = arith.muli %x2_5, %cst : tensor<1024xi32> loc(#loc31) + %tmp0_7 = arith.addi %x0, %tmp0_6 : tensor<1024xi32> loc(#loc32) + %tmp0_8 = arith.muli %x1_4, %tmp0 : tensor<1024xi32> loc(#loc22) + %tmp0_9 = arith.addi %tmp0_7, %tmp0_8 : tensor<1024xi32> loc(#loc33) + %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc34) + %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc34) + %tmp0_12 = tt.load %tmp0_11 : tensor<1024x!tt.ptr> loc(#loc35) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc16) + %1 = tt.addptr %0, %xindex_2 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc16) + tt.store %1, %tmp0_12 : tensor<1024x!tt.ptr> loc(#loc17) + tt.return loc(#loc18) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:49) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":25:19) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":24:28) +#loc4 = loc(unknown) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":20:28) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":20:33) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":21:36) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":21:23) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":23:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":24:21) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:39) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:35) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:44) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:54) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:25) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:36) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:4) +#loc22 = loc("tmp0"(#loc1)) +#loc23 = loc("x2"(#loc2)) +#loc24 = loc("x1"(#loc3)) +#loc25 = loc("xoffset"(#loc5)) +#loc26 = loc("xoffset"(#loc6)) +#loc27 = loc("xindex"(#loc7)) +#loc28 = loc("xindex"(#loc8)) +#loc29 = loc("x0"(#loc9)) +#loc30 = loc("x1"(#loc10)) +#loc31 = loc("tmp0"(#loc11)) +#loc32 = loc("tmp0"(#loc12)) +#loc33 = loc("tmp0"(#loc13)) +#loc34 = loc("tmp0"(#loc14)) +#loc35 = loc("tmp0"(#loc15)) diff --git a/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/__grp__triton_red_fused__fused_rms_norm_view_0.json b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/__grp__triton_red_fused__fused_rms_norm_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cd60d5946f236eb87845180c714237103291ecda --- /dev/null +++ b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/__grp__triton_red_fused__fused_rms_norm_view_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__fused_rms_norm_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.source", "triton_red_fused__fused_rms_norm_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.ttir", "triton_red_fused__fused_rms_norm_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.ttgir", "triton_red_fused__fused_rms_norm_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.llir", "triton_red_fused__fused_rms_norm_view_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.ptx", "triton_red_fused__fused_rms_norm_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.cubin", "triton_red_fused__fused_rms_norm_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.json"}} \ No newline at end of file diff --git a/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.cubin b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..ad221bc8ff2da7aebe027994e2a35284823235d5 Binary files /dev/null and b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.cubin differ diff --git a/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.json b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c751eac3b3d6fab69ba84ce0a09e5c5cc800886b --- /dev/null +++ b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.json @@ -0,0 +1 @@ +{"hash": "032c1ed5f78de1dcb11539cb878c06ddf1b03d2db1b1296b1d30afe556154989", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm_view_0"} \ No newline at end of file diff --git a/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.llir b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..5c07cf1b238e0f4f0a0639559007395130f6c1b6 --- /dev/null +++ b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.llir @@ -0,0 +1,161 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__fused_rms_norm_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = shl i32 %7, 6, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = and i32 %9, 504, !dbg !9 + %11 = lshr exact i32 %10, 3, !dbg !9 + %12 = or disjoint i32 %11, %8, !dbg !10 + %13 = shl nuw nsw i32 %9, 3, !dbg !11 + %14 = and i32 %13, 56, !dbg !11 + %15 = sdiv i32 %12, 32, !dbg !12 + %16 = mul i32 %15, 32, !dbg !13 + %.decomposed = sub i32 %12, %16, !dbg !13 + %17 = shl nsw i32 %.decomposed, 7, !dbg !14 + %18 = mul i32 %15, 12288, !dbg !15 + %19 = or disjoint i32 %17, %14 + %20 = add i32 %19, %18 + %21 = sext i32 %20 to i64, !dbg !16 + %22 = getelementptr bfloat, ptr addrspace(1) %0, i64 %21, !dbg !16 + %23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %24 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %22, i64 %23, i1 true) #4, !dbg !17 + %25 = extractvalue { i32, i32, i32, i32 } %24, 0, !dbg !17 + %26 = bitcast i32 %25 to <2 x bfloat>, !dbg !17 + %27 = extractvalue { i32, i32, i32, i32 } %24, 1, !dbg !17 + %28 = bitcast i32 %27 to <2 x bfloat>, !dbg !17 + %29 = extractvalue { i32, i32, i32, i32 } %24, 2, !dbg !17 + %30 = bitcast i32 %29 to <2 x bfloat>, !dbg !17 + %31 = extractvalue { i32, i32, i32, i32 } %24, 3, !dbg !17 + %32 = bitcast i32 %31 to <2 x bfloat>, !dbg !17 + %33 = sext i32 %20 to i64, !dbg !16 + %34 = getelementptr bfloat, ptr addrspace(1) %0, i64 %33, !dbg !16 + %35 = getelementptr i8, ptr addrspace(1) %34, i64 128, !dbg !16 + %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %37 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %35, i64 %36, i1 true) #4, !dbg !17 + %38 = extractvalue { i32, i32, i32, i32 } %37, 0, !dbg !17 + %39 = bitcast i32 %38 to <2 x bfloat>, !dbg !17 + %40 = extractvalue { i32, i32, i32, i32 } %37, 1, !dbg !17 + %41 = bitcast i32 %40 to <2 x bfloat>, !dbg !17 + %42 = extractvalue { i32, i32, i32, i32 } %37, 2, !dbg !17 + %43 = bitcast i32 %42 to <2 x bfloat>, !dbg !17 + %44 = extractvalue { i32, i32, i32, i32 } %37, 3, !dbg !17 + %45 = bitcast i32 %44 to <2 x bfloat>, !dbg !17 + %46 = fpext <2 x bfloat> %26 to <2 x float>, !dbg !18 + %47 = fmul <2 x float> %46, %46, !dbg !19 + %48 = fpext <2 x bfloat> %39 to <2 x float>, !dbg !18 + %49 = fmul <2 x float> %48, %48, !dbg !19 + %50 = fadd <2 x float> %47, %49, !dbg !20 + %51 = fpext <2 x bfloat> %28 to <2 x float>, !dbg !18 + %52 = fmul <2 x float> %51, %51, !dbg !19 + %53 = fpext <2 x bfloat> %41 to <2 x float>, !dbg !18 + %54 = fmul <2 x float> %53, %53, !dbg !19 + %55 = fadd <2 x float> %52, %54, !dbg !20 + %56 = fpext <2 x bfloat> %30 to <2 x float>, !dbg !18 + %57 = fmul <2 x float> %56, %56, !dbg !19 + %58 = fpext <2 x bfloat> %43 to <2 x float>, !dbg !18 + %59 = fmul <2 x float> %58, %58, !dbg !19 + %60 = fadd <2 x float> %57, %59, !dbg !20 + %61 = fpext <2 x bfloat> %32 to <2 x float>, !dbg !18 + %62 = fmul <2 x float> %61, %61, !dbg !19 + %63 = fpext <2 x bfloat> %45 to <2 x float>, !dbg !18 + %64 = fmul <2 x float> %63, %63, !dbg !19 + %65 = fadd <2 x float> %62, %64, !dbg !20 + %66 = and i32 %9, 63, !dbg !9 + %67 = or disjoint i32 %8, %66, !dbg !10 + %shift = shufflevector <2 x float> %50, <2 x float> poison, <2 x i32> , !dbg !21 + %foldExtExtBinop = fadd <2 x float> %50, %shift, !dbg !21 + %foldExtExtBinop9 = fadd <2 x float> %55, %foldExtExtBinop, !dbg !21 + %shift11 = shufflevector <2 x float> %55, <2 x float> poison, <2 x i32> , !dbg !21 + %foldExtExtBinop12 = fadd <2 x float> %shift11, %foldExtExtBinop9, !dbg !21 + %foldExtExtBinop14 = fadd <2 x float> %60, %foldExtExtBinop12, !dbg !21 + %shift16 = shufflevector <2 x float> %60, <2 x float> poison, <2 x i32> , !dbg !21 + %foldExtExtBinop17 = fadd <2 x float> %shift16, %foldExtExtBinop14, !dbg !21 + %foldExtExtBinop19 = fadd <2 x float> %65, %foldExtExtBinop17, !dbg !21 + %shift21 = shufflevector <2 x float> %65, <2 x float> poison, <2 x i32> , !dbg !21 + %foldExtExtBinop22 = fadd <2 x float> %shift21, %foldExtExtBinop19, !dbg !21 + %68 = extractelement <2 x float> %foldExtExtBinop22, i64 0, !dbg !21 + %69 = bitcast float %68 to i32, !dbg !24 + %70 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %69, i32 4, i32 31), !dbg !24 + %71 = bitcast i32 %70 to float, !dbg !24 + %72 = fadd float %68, %71, !dbg !21 + %73 = bitcast float %72 to i32, !dbg !24 + %74 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %73, i32 2, i32 31), !dbg !24 + %75 = bitcast i32 %74 to float, !dbg !24 + %76 = fadd float %72, %75, !dbg !21 + %77 = bitcast float %76 to i32, !dbg !24 + %78 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %77, i32 1, i32 31), !dbg !24 + %79 = bitcast i32 %78 to float, !dbg !24 + %80 = fadd float %76, %79, !dbg !21 + %81 = lshr exact i32 %10, 1, !dbg !27 + %82 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %81, !dbg !27 + store float %80, ptr addrspace(3) %82, align 4, !dbg !27 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !27 + %83 = shl nuw nsw i32 %66, 2, !dbg !27 + %84 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %83, !dbg !27 + %85 = load i32, ptr addrspace(3) %84, align 4, !dbg !27 + %86 = sext i32 %67 to i64, !dbg !28 + %87 = getelementptr float, ptr addrspace(1) %1, i64 %86, !dbg !28 + %88 = and i32 %9, 448, !dbg !29 + %89 = icmp eq i32 %88, 0, !dbg !29 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %85, ptr addrspace(1) %87, i1 %89) #4, !dbg !29 + ret void, !dbg !30 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm_view_0", linkageName: "triton_red_fused__fused_rms_norm_view_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 19, scope: !4) +!13 = !DILocation(line: 28, column: 19, scope: !4) +!14 = !DILocation(line: 38, column: 45, scope: !4) +!15 = !DILocation(line: 38, column: 56, scope: !4) +!16 = !DILocation(line: 38, column: 34, scope: !4) +!17 = !DILocation(line: 38, column: 61, scope: !4) +!18 = !DILocation(line: 38, column: 115, scope: !4) +!19 = !DILocation(line: 40, column: 22, scope: !4) +!20 = !DILocation(line: 42, column: 23, scope: !4) +!21 = !DILocation(line: 263, column: 15, scope: !22, inlinedAt: !24) +!22 = distinct !DILexicalBlockFile(scope: !4, file: !23, discriminator: 0) +!23 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!24 = !DILocation(line: 293, column: 36, scope: !22, inlinedAt: !25) +!25 = !DILocation(line: 44, column: 25, scope: !26) +!26 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0) +!27 = !DILocation(line: 44, column: 28, scope: !4) +!28 = !DILocation(line: 45, column: 25, scope: !4) +!29 = !DILocation(line: 45, column: 36, scope: !4) +!30 = !DILocation(line: 45, column: 4, scope: !4) diff --git a/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.ptx b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..c03d0e81c89d7deb942021b08244255d56f89b37 --- /dev/null +++ b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.ptx @@ -0,0 +1,557 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused__fused_rms_norm_view_0 // -- Begin function triton_red_fused__fused_rms_norm_view_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__fused_rms_norm_view_0 +.visible .entry triton_red_fused__fused_rms_norm_view_0( + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_1, + .param .u32 triton_red_fused__fused_rms_norm_view_0_param_2, + .param .u32 triton_red_fused__fused_rms_norm_view_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_5 +) +.reqntid 512 +{ + .reg .pred %p<3>; + .reg .b16 %rs<17>; + .reg .b32 %r<81>; + .reg .b64 %rd<8>; + .loc 1 18 0 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:18:0 + +// %bb.0: + ld.param.b64 %rd6, [triton_red_fused__fused_rms_norm_view_0_param_0]; + ld.param.b64 %rd7, [triton_red_fused__fused_rms_norm_view_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:23:28 + mov.u32 %r11, %ctaid.x; + .loc 1 23 33 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:23:33 + shl.b32 %r12, %r11, 6; + .loc 1 24 44 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:44 + mov.u32 %r13, %tid.x; + and.b32 %r14, %r13, 504; + bfe.u32 %r15, %r13, 3, 6; + .loc 1 24 23 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:23 + or.b32 %r16, %r15, %r12; + .loc 1 26 37 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:26:37 + shl.b32 %r17, %r13, 3; + and.b32 %r18, %r17, 56; + .loc 1 29 19 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:29:19 + bfe.s32 %r19, %r11, 25, 1; + shr.u32 %r20, %r19, 27; + add.s32 %r21, %r16, %r20; + shr.u32 %r22, %r21, 5; + .loc 1 28 19 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:28:19 + and.b32 %r23, %r21, 33554400; + sub.s32 %r24, %r16, %r23; + .loc 1 38 45 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:45 + shl.b32 %r25, %r24, 7; + or.b32 %r26, %r25, %r18; + mad.lo.s32 %r27, %r22, 12288, %r26; + .loc 1 38 34 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:34 + mad.wide.s32 %rd1, %r27, 2, %rd6; + .loc 1 38 61 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:61 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r5, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r1, %r5; + mov.u32 %r2, %r5; + mov.u32 %r3, %r5; + mov.u32 %r4, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2; + // end inline asm + .loc 1 38 34 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:34 + add.s64 %rd3, %rd1, 128; + .loc 1 38 61 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:61 + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd4, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r6, %r5; + mov.u32 %r7, %r5; + mov.u32 %r8, %r5; + mov.u32 %r9, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd3 + 0 ], %rd4; + // end inline asm + .loc 1 38 115 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r28, %rs1; + cvt.f32.bf16 %r29, %rs2; + mov.b32 {%rs3, %rs4}, %r6; + cvt.f32.bf16 %r30, %rs4; + cvt.f32.bf16 %r31, %rs3; + .loc 1 40 22 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:40:22 + mul.f32 %r32, %r31, %r31; + mul.f32 %r33, %r30, %r30; + .loc 1 42 23 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:42:23 + fma.rn.f32 %r34, %r29, %r29, %r33; + fma.rn.f32 %r35, %r28, %r28, %r32; + .loc 1 38 115 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115 + mov.b32 {%rs5, %rs6}, %r2; + cvt.f32.bf16 %r36, %rs5; + cvt.f32.bf16 %r37, %rs6; + mov.b32 {%rs7, %rs8}, %r7; + cvt.f32.bf16 %r38, %rs8; + cvt.f32.bf16 %r39, %rs7; + .loc 1 40 22 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:40:22 + mul.f32 %r40, %r39, %r39; + mul.f32 %r41, %r38, %r38; + .loc 1 42 23 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:42:23 + fma.rn.f32 %r42, %r37, %r37, %r41; + fma.rn.f32 %r43, %r36, %r36, %r40; + .loc 1 38 115 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115 + mov.b32 {%rs9, %rs10}, %r3; + cvt.f32.bf16 %r44, %rs9; + cvt.f32.bf16 %r45, %rs10; + mov.b32 {%rs11, %rs12}, %r8; + cvt.f32.bf16 %r46, %rs12; + cvt.f32.bf16 %r47, %rs11; + .loc 1 40 22 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:40:22 + mul.f32 %r48, %r47, %r47; + mul.f32 %r49, %r46, %r46; + .loc 1 42 23 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:42:23 + fma.rn.f32 %r50, %r45, %r45, %r49; + fma.rn.f32 %r51, %r44, %r44, %r48; + .loc 1 38 115 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115 + mov.b32 {%rs13, %rs14}, %r4; + cvt.f32.bf16 %r52, %rs13; + cvt.f32.bf16 %r53, %rs14; + mov.b32 {%rs15, %rs16}, %r9; + cvt.f32.bf16 %r54, %rs16; + cvt.f32.bf16 %r55, %rs15; + .loc 1 40 22 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:40:22 + mul.f32 %r56, %r55, %r55; + mul.f32 %r57, %r54, %r54; + .loc 1 42 23 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:42:23 + fma.rn.f32 %r58, %r53, %r53, %r57; + fma.rn.f32 %r59, %r52, %r52, %r56; + .loc 1 24 44 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:44 + and.b32 %r60, %r13, 63; + .loc 1 24 23 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:23 + or.b32 %r61, %r12, %r60; +$L__tmp1: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + add.f32 %r62, %r35, %r34; + add.f32 %r63, %r43, %r62; + add.f32 %r64, %r42, %r63; + add.f32 %r65, %r51, %r64; + add.f32 %r66, %r50, %r65; + add.f32 %r67, %r59, %r66; + add.f32 %r68, %r58, %r67; +$L__tmp2: + .loc 2 293 36 // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] + shfl.sync.bfly.b32 %r69, %r68, 4, 31, -1; +$L__tmp3: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + add.f32 %r70, %r68, %r69; +$L__tmp4: + .loc 2 293 36 // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] + shfl.sync.bfly.b32 %r71, %r70, 2, 31, -1; +$L__tmp5: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + add.f32 %r72, %r70, %r71; +$L__tmp6: + .loc 2 293 36 // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] + shfl.sync.bfly.b32 %r73, %r72, 1, 31, -1; +$L__tmp7: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + add.f32 %r74, %r72, %r73; +$L__tmp8: + .loc 1 44 28 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:28 + shr.u32 %r75, %r14, 1; + mov.b32 %r76, global_smem; + add.s32 %r77, %r76, %r75; + st.shared.b32 [%r77], %r74; + bar.sync 0; + shl.b32 %r78, %r60, 2; + add.s32 %r79, %r76, %r78; + ld.shared.b32 %r10, [%r79]; + .loc 1 45 25 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:25 + mad.wide.s32 %rd5, %r61, 4, %rd7; + .loc 1 45 36 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:36 + and.b32 %r80, %r13, 448; + setp.eq.b32 %p2, %r80, 0; + // begin inline asm + @%p2 st.global.b32 [ %rd5 + 0 ], { %r10 }; + // end inline asm + .loc 1 45 4 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:4 + ret; +$L__tmp9: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 339 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x14c DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 119 +.b8 118 +.b8 121 +.b8 116 +.b8 52 +.b8 50 +.b8 55 +.b8 51 +.b8 105 +.b8 117 +.b8 51 +.b8 51 +.b8 109 +.b8 112 +.b8 101 +.b8 101 +.b8 55 +.b8 104 +.b8 98 +.b8 101 +.b8 116 +.b8 53 +.b8 106 +.b8 53 +.b8 101 +.b8 113 +.b8 52 +.b8 52 +.b8 100 +.b8 54 +.b8 102 +.b8 115 +.b8 104 +.b8 103 +.b8 119 +.b8 107 +.b8 121 +.b8 120 +.b8 107 +.b8 110 +.b8 53 +.b8 50 +.b8 103 +.b8 103 +.b8 103 +.b8 107 +.b8 105 +.b8 113 +.b8 104 +.b8 106 +.b8 53 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 119 +.b8 118 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x2a DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 114 +.b8 109 +.b8 115 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x10e:0x48 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x123:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 44 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x13b:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.source b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.source new file mode 100644 index 0000000000000000000000000000000000000000..2a7953b6cf9c33c70bd81db34013741eeb793d5f --- /dev/null +++ b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.source @@ -0,0 +1,167 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0) +#loc35 = loc(unknown) +#loc38 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0) +#loc42 = loc("in_ptr0"(#loc)) +#loc43 = loc("out_ptr0"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc74 = loc("input"(#loc33)) +#loc75 = loc("a"(#loc38)) +#loc76 = loc("b"(#loc38)) +module { + tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 8192 : i32 loc(#loc46) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc47) + %xoffset = tt.get_program_id x : i32 loc(#loc48) + %xoffset_2 = arith.constant 64 : i32 loc(#loc49) + %xoffset_3 = arith.constant 64 : i32 loc(#loc49) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc49) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc50) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc51) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc52) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc52) + %xmask = arith.constant true loc(#loc53) + %xmask_8 = arith.constant dense : tensor<64x64xi1> loc(#loc53) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc54) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc55) + %x0 = arith.constant 32 : i32 loc(#loc56) + %x0_10 = arith.constant 32 : i32 loc(#loc56) + %x0_11 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc56) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc56) + %x1 = arith.constant 32 : i32 loc(#loc57) + %x1_13 = arith.constant 32 : i32 loc(#loc57) + %x1_14 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc57) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc57) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc58) + %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc58) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c64_i32 = arith.constant 64 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c64_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_19 = %_tmp4_16) -> (tensor<64x64xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc60) + %r0_index_20 = arith.addi %r0_index, %r0_base_9 : tensor<1x64xi32> loc(#loc60) + %r0_mask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc61) + %r0_mask_21 = arith.cmpi slt, %r0_index_20, %r0_mask : tensor<1x64xi32> loc(#loc61) + %tmp0 = arith.constant 128 : i32 loc(#loc62) + %tmp0_22 = arith.constant 128 : i32 loc(#loc62) + %tmp0_23 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc62) + %tmp0_24 = arith.muli %tmp0_23, %x0_12 : tensor<64x1xi32> loc(#loc62) + %tmp0_25 = tt.broadcast %r0_index_20 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc63) + %tmp0_26 = tt.broadcast %tmp0_24 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc63) + %tmp0_27 = arith.addi %tmp0_25, %tmp0_26 : tensor<64x64xi32> loc(#loc63) + %tmp0_28 = arith.constant 12288 : i32 loc(#loc64) + %tmp0_29 = arith.constant 12288 : i32 loc(#loc64) + %tmp0_30 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc64) + %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<64x1xi32> loc(#loc64) + %tmp0_32 = tt.broadcast %tmp0_31 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc65) + %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<64x64xi32> loc(#loc65) + %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc66) + %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc66) + %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc67) + %tmp0_37 = tt.broadcast %r0_mask_21 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc67) + %tmp0_38 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc67) + %tmp0_39 = arith.truncf %tmp0_38 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc67) + %tmp0_40 = tt.load %tmp0_35, %tmp0_37, %tmp0_39 evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc67) + %tmp0_41 = arith.extf %tmp0_40 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc68) + %tmp2 = arith.mulf %tmp0_41, %tmp0_41 : tensor<64x64xf32> loc(#loc69) + %tmp5 = arith.addf %_tmp4_19, %tmp2 : tensor<64x64xf32> loc(#loc70) + %_tmp4_42 = tt.broadcast %r0_mask_21 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc71) + %_tmp4_43 = arith.select %_tmp4_42, %tmp5, %_tmp4_19 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc71) + scf.yield %_tmp4_43 : tensor<64x64xf32> loc(#loc27) + } loc(#loc59) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc72) + %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc73) + %4 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc30) + %5 = tt.addptr %4, %xindex_7 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc30) + tt.store %5, %tmp4_18 : tensor<64x1x!tt.ptr> loc(#loc31) + tt.return loc(#loc32) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S64_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x64xf32> loc("input"(#loc33))) -> tensor<64xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc34) + tt.reduce.return %2 : f32 loc(#loc34) + }) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc34) + tt.return %0 : tensor<64xf32> loc(#loc36) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64xf32> loc(#loc37) + tt.return %1 : tensor<64xf32> loc(#loc37) + } loc(#loc33) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc38)), %b: f32 loc("b"(#loc38))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc39) + tt.return %0 : f32 loc(#loc40) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc41) + tt.return %1 : f32 loc(#loc41) + } loc(#loc38) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":25:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":30:43) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:8) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11) +#loc37 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4) +#loc39 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc40 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11) +#loc41 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4) +#loc46 = loc("xnumel"(#loc1)) +#loc47 = loc("r0_numel"(#loc2)) +#loc48 = loc("xoffset"(#loc3)) +#loc49 = loc("xoffset"(#loc4)) +#loc50 = loc("xindex"(#loc5)) +#loc51 = loc("xindex"(#loc6)) +#loc52 = loc("xindex"(#loc7)) +#loc53 = loc("xmask"(#loc8)) +#loc54 = loc("r0_base"(#loc9)) +#loc55 = loc("r0_base"(#loc10)) +#loc56 = loc("x0"(#loc11)) +#loc57 = loc("x1"(#loc12)) +#loc58 = loc("_tmp4"(#loc13)) +#loc59 = loc("_tmp4"(#loc14)) +#loc60 = loc("r0_index"(#loc15)) +#loc61 = loc("r0_mask"(#loc16)) +#loc62 = loc("tmp0"(#loc17)) +#loc63 = loc("tmp0"(#loc18)) +#loc64 = loc("tmp0"(#loc19)) +#loc65 = loc("tmp0"(#loc20)) +#loc66 = loc("tmp0"(#loc21)) +#loc67 = loc("tmp0"(#loc22)) +#loc68 = loc("tmp0"(#loc23)) +#loc69 = loc("tmp2"(#loc24)) +#loc70 = loc("tmp5"(#loc25)) +#loc71 = loc("_tmp4"(#loc26)) +#loc72 = loc("tmp4"(#loc28)) +#loc73 = loc("tmp4"(#loc29)) diff --git a/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.ttgir b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..255ea11b76b6462c29eaa54e156207acf664e4c7 --- /dev/null +++ b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.ttgir @@ -0,0 +1,120 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [16, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 8], order = [0, 1]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0) +#loc1 = loc(unknown) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25) +#loc30 = loc("in_ptr0"(#loc)) +#loc31 = loc("out_ptr0"(#loc)) +#loc32 = loc("xnumel"(#loc)) +#loc33 = loc("r0_numel"(#loc)) +#loc54 = loc("tmp4"(#loc24)) +#loc57 = loc(callsite(#loc1 at #loc54)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<12288> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x64xbf16, #blocked> loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc34) + %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc35) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc36) + %xindex_6 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc36) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc36) + %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc36) + %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked> loc(#loc37) + %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc37) + %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<64x1xi32, #blocked> loc(#loc37) + %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<64x1xi32, #blocked1> loc(#loc37) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc38) + %r0_base_13 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> loc(#loc38) + %x0 = arith.remsi %xindex_11, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc39) + %x1 = arith.divsi %xindex_11, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc40) + %tmp0 = arith.muli %x0, %cst_0 : tensor<64x1xi32, #blocked> loc(#loc41) + %tmp0_14 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc42) + %tmp0_15 = arith.muli %x1, %cst_1 : tensor<64x1xi32, #blocked> loc(#loc43) + %tmp0_16 = tt.broadcast %tmp0_15 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc44) + %tmp0_17 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked> loc(#loc45) + %_tmp4 = scf.for %_tmp4_20 = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%arg5 = %cst_4) -> (tensor<64x64xf32, #blocked>) : i32 { + %r0_index = tt.splat %_tmp4_20 : i32 -> tensor<1x64xi32, #blocked> loc(#loc47) + %r0_index_21 = arith.addi %r0_index, %r0_base_13 : tensor<1x64xi32, #blocked> loc(#loc47) + %r0_mask = arith.cmpi slt, %r0_index_21, %cst : tensor<1x64xi32, #blocked> loc(#loc48) + %tmp0_22 = tt.broadcast %r0_index_21 : tensor<1x64xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc42) + %tmp0_23 = arith.addi %tmp0_22, %tmp0_14 : tensor<64x64xi32, #blocked> loc(#loc42) + %tmp0_24 = arith.addi %tmp0_23, %tmp0_16 : tensor<64x64xi32, #blocked> loc(#loc44) + %tmp0_25 = tt.addptr %tmp0_17, %tmp0_24 : tensor<64x64x!tt.ptr, #blocked>, tensor<64x64xi32, #blocked> loc(#loc45) + %tmp0_26 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc49) + %tmp0_27 = tt.load %tmp0_25, %tmp0_26, %cst_3 evictionPolicy = evict_first : tensor<64x64x!tt.ptr, #blocked> loc(#loc49) + %tmp0_28 = arith.extf %tmp0_27 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc50) + %tmp2 = arith.mulf %tmp0_28, %tmp0_28 : tensor<64x64xf32, #blocked> loc(#loc51) + %tmp5 = arith.addf %arg5, %tmp2 : tensor<64x64xf32, #blocked> loc(#loc52) + %_tmp4_29 = arith.select %tmp0_26, %tmp5, %arg5 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked> loc(#loc53) + scf.yield %_tmp4_29 : tensor<64x64xf32, #blocked> loc(#loc22) + } loc(#loc46) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_20: f32 loc(callsite(#loc1 at #loc54)), %tmp4_21: f32 loc(callsite(#loc1 at #loc54))): + %tmp4_22 = arith.addf %tmp4_20, %tmp4_21 : f32 loc(#loc58) + tt.reduce.return %tmp4_22 : f32 loc(#loc56) + }) : (tensor<64x64xf32, #blocked>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc56) + %tmp4_18 = ttg.convert_layout %tmp4 : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc55) + %tmp4_19 = tt.expand_dims %tmp4_18 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc55) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked1> loc(#loc27) + %1 = tt.addptr %0, %xindex_12 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc27) + tt.store %1, %tmp4_19 : tensor<64x1x!tt.ptr, #blocked1> loc(#loc28) + tt.return loc(#loc29) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:8) +#loc23 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4) +#loc34 = loc("xoffset"(#loc2)) +#loc35 = loc("xoffset"(#loc3)) +#loc36 = loc("xindex"(#loc4)) +#loc37 = loc("xindex"(#loc5)) +#loc38 = loc("r0_base"(#loc6)) +#loc39 = loc("x0"(#loc7)) +#loc40 = loc("x1"(#loc8)) +#loc41 = loc("tmp0"(#loc9)) +#loc42 = loc("tmp0"(#loc10)) +#loc43 = loc("tmp0"(#loc11)) +#loc44 = loc("tmp0"(#loc12)) +#loc45 = loc("tmp0"(#loc13)) +#loc46 = loc("_tmp4"(#loc14)) +#loc47 = loc("r0_index"(#loc15)) +#loc48 = loc("r0_mask"(#loc16)) +#loc49 = loc("tmp0"(#loc17)) +#loc50 = loc("tmp0"(#loc18)) +#loc51 = loc("tmp2"(#loc19)) +#loc52 = loc("tmp5"(#loc20)) +#loc53 = loc("_tmp4"(#loc21)) +#loc55 = loc("tmp4"(#loc26)) +#loc56 = loc(callsite(#loc23 at #loc54)) +#loc58 = loc(callsite(#loc25 at #loc56)) diff --git a/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.ttir b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..909ab5dbd9c02d30c2f1beef4288872b0157a567 --- /dev/null +++ b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.ttir @@ -0,0 +1,114 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0) +#loc1 = loc(unknown) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25) +#loc31 = loc("in_ptr0"(#loc)) +#loc32 = loc("out_ptr0"(#loc)) +#loc33 = loc("xnumel"(#loc)) +#loc34 = loc("r0_numel"(#loc)) +#loc56 = loc("tmp4"(#loc25)) +#loc59 = loc(callsite(#loc1 at #loc56)) +module { + tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<64x64xbf16> loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %cst_0 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc1) + %cst_1 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<1x64xi32> loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc1) + %cst_4 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc35) + %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc36) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc37) + %xindex_6 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc38) + %xindex_7 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32> loc(#loc39) + %xindex_8 = arith.addi %xindex_7, %xindex_6 : tensor<64x1xi32> loc(#loc39) + %r0_base = tt.expand_dims %xindex {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc40) + %x0 = arith.remsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc41) + %x1 = arith.divsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc42) + %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%_tmp4_10 = %cst_3) -> (tensor<64x64xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc44) + %r0_index_11 = arith.addi %r0_index, %r0_base : tensor<1x64xi32> loc(#loc44) + %r0_mask = arith.cmpi slt, %r0_index_11, %cst_2 : tensor<1x64xi32> loc(#loc45) + %tmp0 = arith.muli %x0, %cst_1 : tensor<64x1xi32> loc(#loc46) + %tmp0_12 = tt.broadcast %r0_index_11 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc47) + %tmp0_13 = tt.broadcast %tmp0 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc47) + %tmp0_14 = arith.addi %tmp0_12, %tmp0_13 : tensor<64x64xi32> loc(#loc47) + %tmp0_15 = arith.muli %x1, %cst_0 : tensor<64x1xi32> loc(#loc48) + %tmp0_16 = tt.broadcast %tmp0_15 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc49) + %tmp0_17 = arith.addi %tmp0_14, %tmp0_16 : tensor<64x64xi32> loc(#loc49) + %tmp0_18 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc50) + %tmp0_19 = tt.addptr %tmp0_18, %tmp0_17 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc50) + %tmp0_20 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc51) + %tmp0_21 = tt.load %tmp0_19, %tmp0_20, %cst evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc51) + %tmp0_22 = arith.extf %tmp0_21 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc52) + %tmp2 = arith.mulf %tmp0_22, %tmp0_22 : tensor<64x64xf32> loc(#loc53) + %tmp5 = arith.addf %_tmp4_10, %tmp2 : tensor<64x64xf32> loc(#loc54) + %_tmp4_23 = arith.select %tmp0_20, %tmp5, %_tmp4_10 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc55) + scf.yield %_tmp4_23 : tensor<64x64xf32> loc(#loc23) + } loc(#loc43) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_10: f32 loc(callsite(#loc1 at #loc56)), %tmp4_11: f32 loc(callsite(#loc1 at #loc56))): + %tmp4_12 = arith.addf %tmp4_10, %tmp4_11 : f32 loc(#loc60) + tt.reduce.return %tmp4_12 : f32 loc(#loc58) + }) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc58) + %tmp4_9 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc57) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc28) + %1 = tt.addptr %0, %xindex_8 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc28) + tt.store %1, %tmp4_9 : tensor<64x1x!tt.ptr> loc(#loc29) + tt.return loc(#loc30) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":32:43) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":33:31) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:8) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc26 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4) +#loc35 = loc("xoffset"(#loc3)) +#loc36 = loc("xoffset"(#loc4)) +#loc37 = loc("xindex"(#loc5)) +#loc38 = loc("xindex"(#loc6)) +#loc39 = loc("xindex"(#loc7)) +#loc40 = loc("r0_base"(#loc8)) +#loc41 = loc("x0"(#loc9)) +#loc42 = loc("x1"(#loc10)) +#loc43 = loc("_tmp4"(#loc2)) +#loc44 = loc("r0_index"(#loc11)) +#loc45 = loc("r0_mask"(#loc12)) +#loc46 = loc("tmp0"(#loc13)) +#loc47 = loc("tmp0"(#loc14)) +#loc48 = loc("tmp0"(#loc15)) +#loc49 = loc("tmp0"(#loc16)) +#loc50 = loc("tmp0"(#loc17)) +#loc51 = loc("tmp0"(#loc18)) +#loc52 = loc("tmp0"(#loc19)) +#loc53 = loc("tmp2"(#loc20)) +#loc54 = loc("tmp5"(#loc21)) +#loc55 = loc("_tmp4"(#loc22)) +#loc57 = loc("tmp4"(#loc27)) +#loc58 = loc(callsite(#loc24 at #loc56)) +#loc60 = loc(callsite(#loc26 at #loc58)) diff --git a/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4b2bbd2d1a6fccfd3d25ff93d6b5a32b48f6f085 --- /dev/null +++ b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json"}} \ No newline at end of file diff --git a/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..d6fd60642c7725f162e1ad21c3e7dd4beba88666 Binary files /dev/null and b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin differ diff --git a/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..67b189f832459795e1bef3a08aa031f33f933750 --- /dev/null +++ b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json @@ -0,0 +1 @@ +{"hash": "043651654ec2bab0baff8ac3035fb3ffe2a4719f91333c269be422e8492d5bfa", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 1024, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0"} \ No newline at end of file diff --git a/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..7a26bef63ab64db75114165032a7066765f0a56f --- /dev/null +++ b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir @@ -0,0 +1,908 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %13 = shl nuw i32 %12, 1, !dbg !9 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %15 = and i32 %14, 32, !dbg !10 + %.not = icmp eq i32 %15, 0, !dbg !10 + %.lobit = lshr exact i32 %15, 5, !dbg !10 + %16 = and i32 %14, 1, !dbg !10 + %.not1 = icmp eq i32 %16, 0, !dbg !10 + %17 = or disjoint i32 %.lobit, %13, !dbg !11 + %18 = or disjoint i32 %13, %16, !dbg !11 + %19 = and i32 %14, 31, !dbg !12 + %20 = shl nuw nsw i32 %19, 2, !dbg !12 + %21 = shl nuw nsw i32 %14, 1, !dbg !12 + %22 = and i32 %21, 126, !dbg !12 + %23 = and i32 %14, 62, !dbg !12 + %24 = lshr i32 %14, 1, !dbg !12 + %25 = sdiv i32 %17, 32, !dbg !13 + %26 = mul i32 %25, 32, !dbg !14 + %.decomposed = sub i32 %17, %26, !dbg !14 + %27 = sdiv i32 %18, 32, !dbg !13 + %28 = or disjoint i32 %20, 4096, !dbg !15 + %29 = shl nsw i32 %.decomposed, 7, !dbg !16 + %30 = add nsw i32 %28, %29, !dbg !17 + %31 = mul i32 %25, 36864, !dbg !18 + %32 = add i32 %30, %31, !dbg !19 + %33 = sext i32 %32 to i64, !dbg !20 + %34 = getelementptr bfloat, ptr addrspace(1) %2, i64 %33, !dbg !20 + %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !21 + %36 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %34, i64 %35, i1 true) #6, !dbg !21 + %37 = extractvalue { i32, i32 } %36, 0, !dbg !21 + %38 = bitcast i32 %37 to <2 x bfloat>, !dbg !21 + %39 = extractvalue { i32, i32 } %36, 1, !dbg !21 + %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !21 + %41 = extractelement <2 x bfloat> %38, i64 0, !dbg !21 + %42 = extractelement <2 x bfloat> %38, i64 1, !dbg !21 + %43 = extractelement <2 x bfloat> %40, i64 0, !dbg !21 + %44 = extractelement <2 x bfloat> %40, i64 1, !dbg !21 + %45 = fpext bfloat %41 to float, !dbg !22 + %46 = fpext bfloat %42 to float, !dbg !22 + %47 = fpext bfloat %43 to float, !dbg !22 + %48 = fpext bfloat %44 to float, !dbg !22 + %49 = or disjoint i32 %29, %20, !dbg !23 + %50 = add i32 %49, %31, !dbg !24 + %51 = sext i32 %50 to i64, !dbg !25 + %52 = getelementptr bfloat, ptr addrspace(1) %2, i64 %51, !dbg !25 + %53 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !26 + %54 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %52, i64 %53, i1 true) #6, !dbg !26 + %55 = extractvalue { i32, i32 } %54, 0, !dbg !26 + %56 = bitcast i32 %55 to <2 x bfloat>, !dbg !26 + %57 = extractvalue { i32, i32 } %54, 1, !dbg !26 + %58 = bitcast i32 %57 to <2 x bfloat>, !dbg !26 + %59 = extractelement <2 x bfloat> %56, i64 0, !dbg !26 + %60 = extractelement <2 x bfloat> %56, i64 1, !dbg !26 + %61 = extractelement <2 x bfloat> %58, i64 0, !dbg !26 + %62 = extractelement <2 x bfloat> %58, i64 1, !dbg !26 + %63 = fpext bfloat %59 to float, !dbg !27 + %64 = fpext bfloat %60 to float, !dbg !27 + %65 = fpext bfloat %61 to float, !dbg !27 + %66 = fpext bfloat %62 to float, !dbg !27 + %67 = fmul float %45, %45, !dbg !28 + %68 = fmul float %46, %46, !dbg !28 + %69 = fmul float %47, %47, !dbg !28 + %70 = fmul float %48, %48, !dbg !28 + %71 = fmul float %63, %63, !dbg !29 + %72 = fmul float %64, %64, !dbg !29 + %73 = fmul float %65, %65, !dbg !29 + %74 = fmul float %66, %66, !dbg !29 + %75 = fadd float %67, %68, !dbg !30 + %76 = fadd float %69, %75, !dbg !30 + %77 = fadd float %70, %76, !dbg !30 + %78 = bitcast float %77 to i32, !dbg !33 + %79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 16, i32 31), !dbg !33 + %80 = bitcast i32 %79 to float, !dbg !33 + %81 = fadd float %77, %80, !dbg !30 + %82 = bitcast float %81 to i32, !dbg !33 + %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 8, i32 31), !dbg !33 + %84 = bitcast i32 %83 to float, !dbg !33 + %85 = fadd float %81, %84, !dbg !30 + %86 = bitcast float %85 to i32, !dbg !33 + %87 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 4, i32 31), !dbg !33 + %88 = bitcast i32 %87 to float, !dbg !33 + %89 = fadd float %85, %88, !dbg !30 + %90 = bitcast float %89 to i32, !dbg !33 + %91 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %90, i32 2, i32 31), !dbg !33 + %92 = bitcast i32 %91 to float, !dbg !33 + %93 = fadd float %89, %92, !dbg !30 + %94 = bitcast float %93 to i32, !dbg !33 + %95 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %94, i32 1, i32 31), !dbg !33 + %96 = bitcast i32 %95 to float, !dbg !33 + %97 = fadd float %93, %96, !dbg !30 + %98 = fadd float %71, %72, !dbg !36 + %99 = fadd float %73, %98, !dbg !36 + %100 = fadd float %74, %99, !dbg !36 + %101 = bitcast float %100 to i32, !dbg !37 + %102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 16, i32 31), !dbg !37 + %103 = bitcast i32 %102 to float, !dbg !37 + %104 = fadd float %100, %103, !dbg !36 + %105 = bitcast float %104 to i32, !dbg !37 + %106 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %105, i32 8, i32 31), !dbg !37 + %107 = bitcast i32 %106 to float, !dbg !37 + %108 = fadd float %104, %107, !dbg !36 + %109 = bitcast float %108 to i32, !dbg !37 + %110 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %109, i32 4, i32 31), !dbg !37 + %111 = bitcast i32 %110 to float, !dbg !37 + %112 = fadd float %108, %111, !dbg !36 + %113 = bitcast float %112 to i32, !dbg !37 + %114 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %113, i32 2, i32 31), !dbg !37 + %115 = bitcast i32 %114 to float, !dbg !37 + %116 = fadd float %112, %115, !dbg !36 + %117 = bitcast float %116 to i32, !dbg !37 + %118 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %117, i32 1, i32 31), !dbg !37 + %119 = bitcast i32 %118 to float, !dbg !37 + %120 = fadd float %116, %119, !dbg !36 + %121 = and i32 %24, 1, !dbg !39 + %122 = zext nneg i32 %22 to i64, !dbg !40 + %123 = getelementptr bfloat, ptr addrspace(1) %3, i64 %122, !dbg !40 + %124 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !41 + %125 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %123, i64 %124, i1 true) #6, !dbg !41 + %126 = bitcast i32 %125 to <2 x bfloat>, !dbg !41 + %127 = fpext <2 x bfloat> %126 to <2 x float>, !dbg !42 + %128 = shl i32 %25, 7, !dbg !43 + %129 = or disjoint i32 %128, %20, !dbg !44 + %130 = sext i32 %129 to i64, !dbg !45 + %131 = getelementptr float, ptr addrspace(1) %4, i64 %130, !dbg !45 + %132 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !46 + %133 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %131, i64 %132, i1 true) #6, !dbg !46 + %134 = extractvalue { i32, i32, i32, i32 } %133, 0, !dbg !46 + %135 = extractvalue { i32, i32, i32, i32 } %133, 1, !dbg !46 + %136 = extractvalue { i32, i32, i32, i32 } %133, 2, !dbg !46 + %137 = extractvalue { i32, i32, i32, i32 } %133, 3, !dbg !46 + %138 = bitcast i32 %134 to float, !dbg !46 + %139 = bitcast i32 %135 to float, !dbg !46 + %140 = bitcast i32 %136 to float, !dbg !46 + %141 = bitcast i32 %137 to float, !dbg !46 + %142 = shl nuw nsw i32 %14, 4, !dbg !46 + %143 = and i32 %142, 112, !dbg !46 + %144 = and i32 %14, 24, !dbg !46 + %145 = lshr exact i32 %144, 1, !dbg !46 + %146 = select i1 %.not, i32 0, i32 192, !dbg !46 + %147 = or disjoint i32 %143, %145, !dbg !46 + %148 = xor i32 %147, %146, !dbg !46 + %149 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %148, !dbg !46 + %150 = insertelement <1 x i32> poison, i32 %134, i64 0, !dbg !46 + store <1 x i32> %150, ptr addrspace(3) %149, align 4, !dbg !46 + %151 = xor i32 %148, 260, !dbg !46 + %152 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %151, !dbg !46 + %153 = insertelement <1 x i32> poison, i32 %135, i64 0, !dbg !46 + store <1 x i32> %153, ptr addrspace(3) %152, align 4, !dbg !46 + %154 = xor i32 %148, 520, !dbg !46 + %155 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %154, !dbg !46 + %156 = insertelement <1 x i32> poison, i32 %136, i64 0, !dbg !46 + store <1 x i32> %156, ptr addrspace(3) %155, align 4, !dbg !46 + %157 = xor i32 %148, 780, !dbg !46 + %158 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %157, !dbg !46 + %159 = insertelement <1 x i32> poison, i32 %137, i64 0, !dbg !46 + store <1 x i32> %159, ptr addrspace(3) %158, align 4, !dbg !46 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %160 = shl nuw nsw i32 %14, 7, !dbg !46 + %161 = and i32 %160, 768, !dbg !46 + %162 = shl nuw nsw i32 %23, 1, !dbg !46 + %163 = select i1 %.not1, i32 0, i32 192, !dbg !46 + %164 = xor i32 %163, %162, !dbg !46 + %165 = or disjoint i32 %164, %161, !dbg !46 + %166 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %165, !dbg !46 + %167 = load float, ptr addrspace(3) %166, align 4, !dbg !46 + %168 = xor i32 %165, 4, !dbg !46 + %169 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %168, !dbg !46 + %170 = load float, ptr addrspace(3) %169, align 4, !dbg !46 + %171 = xor i32 %165, 8, !dbg !46 + %172 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %171, !dbg !46 + %173 = load float, ptr addrspace(3) %172, align 4, !dbg !46 + %174 = xor i32 %165, 12, !dbg !46 + %175 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %174, !dbg !46 + %176 = load float, ptr addrspace(3) %175, align 4, !dbg !46 + %177 = getelementptr float, ptr addrspace(1) %5, i64 %130, !dbg !47 + %178 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !48 + %179 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %177, i64 %178, i1 true) #6, !dbg !48 + %180 = extractvalue { i32, i32, i32, i32 } %179, 0, !dbg !48 + %181 = extractvalue { i32, i32, i32, i32 } %179, 1, !dbg !48 + %182 = extractvalue { i32, i32, i32, i32 } %179, 2, !dbg !48 + %183 = extractvalue { i32, i32, i32, i32 } %179, 3, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %184 = insertelement <1 x i32> poison, i32 %180, i64 0, !dbg !48 + store <1 x i32> %184, ptr addrspace(3) %149, align 4, !dbg !48 + %185 = insertelement <1 x i32> poison, i32 %181, i64 0, !dbg !48 + store <1 x i32> %185, ptr addrspace(3) %152, align 4, !dbg !48 + %186 = insertelement <1 x i32> poison, i32 %182, i64 0, !dbg !48 + store <1 x i32> %186, ptr addrspace(3) %155, align 4, !dbg !48 + %187 = insertelement <1 x i32> poison, i32 %183, i64 0, !dbg !48 + store <1 x i32> %187, ptr addrspace(3) %158, align 4, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %188 = load float, ptr addrspace(3) %166, align 4, !dbg !48 + %189 = load float, ptr addrspace(3) %169, align 4, !dbg !48 + %190 = load float, ptr addrspace(3) %172, align 4, !dbg !48 + %191 = load float, ptr addrspace(3) %175, align 4, !dbg !48 + %192 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !49 + %193 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %34, i64 %192, i1 true) #6, !dbg !49 + %194 = getelementptr bfloat, ptr addrspace(1) %6, i64 %122, !dbg !50 + %195 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !51 + %196 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %194, i64 %195, i1 true) #6, !dbg !51 + %197 = icmp eq i32 %121, 0, !dbg !52 + %198 = and i32 %24, 30, !dbg !53 + %199 = or disjoint i32 %198, 32, !dbg !53 + %200 = or disjoint i32 %198, 64, !dbg !53 + %201 = or disjoint i32 %198, 96, !dbg !53 + %202 = or disjoint i32 %198, 1, !dbg !54 + %203 = or disjoint i32 %198, 33, !dbg !54 + %204 = or disjoint i32 %198, 65, !dbg !54 + %205 = or disjoint i32 %198, 97, !dbg !54 + %206 = shl i32 %18, 7, !dbg !55 + %207 = shl i32 %27, 15, !dbg !55 + %208 = add i32 %207, %206, !dbg !55 + %209 = or disjoint i32 %208, %202, !dbg !56 + %210 = or disjoint i32 %208, %203, !dbg !56 + %211 = or disjoint i32 %208, %204, !dbg !56 + %212 = or disjoint i32 %208, %205, !dbg !56 + %213 = sext i32 %209 to i64, !dbg !57 + %214 = getelementptr bfloat, ptr addrspace(1) %2, i64 %213, !dbg !57 + %215 = sext i32 %210 to i64, !dbg !57 + %216 = getelementptr bfloat, ptr addrspace(1) %2, i64 %215, !dbg !57 + %217 = sext i32 %211 to i64, !dbg !57 + %218 = getelementptr bfloat, ptr addrspace(1) %2, i64 %217, !dbg !57 + %219 = sext i32 %212 to i64, !dbg !57 + %220 = getelementptr bfloat, ptr addrspace(1) %2, i64 %219, !dbg !57 + %221 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58 + %222 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %214, i64 %221, i1 %197) #6, !dbg !58 + %223 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58 + %224 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %216, i64 %223, i1 %197) #6, !dbg !58 + %225 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58 + %226 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %218, i64 %225, i1 %197) #6, !dbg !58 + %227 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58 + %228 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %220, i64 %227, i1 %197) #6, !dbg !58 + %229 = tail call float @llvm.nvvm.div.full(float %120, float 1.280000e+02), !dbg !59 + %230 = fadd float %229, 0x3EB0C6F7A0000000, !dbg !60 + %231 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not.i = icmp eq i32 %231, 0, !dbg !61 + br i1 %.not.i, label %234, label %232, !dbg !61 + +232: ; preds = %11 + %233 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %230), !dbg !61 + br label %__nv_rsqrtf.exit, !dbg !61 + +234: ; preds = %11 + %235 = tail call float @llvm.nvvm.rsqrt.approx.f(float %230), !dbg !61 + br label %__nv_rsqrtf.exit, !dbg !61 + +__nv_rsqrtf.exit: ; preds = %232, %234 + %.0.i = phi float [ %233, %232 ], [ %235, %234 ], !dbg !61 + %236 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %237 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %238 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not.i15 = icmp eq i32 %238, 0, !dbg !61 + br i1 %.not.i15, label %241, label %239, !dbg !61 + +239: ; preds = %__nv_rsqrtf.exit + %240 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %230), !dbg !61 + br label %__nv_rsqrtf.exit17, !dbg !61 + +241: ; preds = %__nv_rsqrtf.exit + %242 = tail call float @llvm.nvvm.rsqrt.approx.f(float %230), !dbg !61 + br label %__nv_rsqrtf.exit17, !dbg !61 + +__nv_rsqrtf.exit17: ; preds = %239, %241 + %.0.i16 = phi float [ %240, %239 ], [ %242, %241 ], !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !62 + %243 = lshr exact i32 %15, 3, !dbg !62 + %244 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %243, !dbg !62 + store float %.0.i, ptr addrspace(3) %244, align 4, !dbg !62 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !62 + %245 = shl nuw nsw i32 %16, 2, !dbg !62 + %246 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %245, !dbg !62 + %247 = load float, ptr addrspace(3) %246, align 4, !dbg !62 + %248 = zext nneg i32 %202 to i64, !dbg !63 + %249 = getelementptr bfloat, ptr addrspace(1) %3, i64 %248, !dbg !63 + %250 = zext nneg i32 %203 to i64, !dbg !63 + %251 = getelementptr bfloat, ptr addrspace(1) %3, i64 %250, !dbg !63 + %252 = zext nneg i32 %204 to i64, !dbg !63 + %253 = getelementptr bfloat, ptr addrspace(1) %3, i64 %252, !dbg !63 + %254 = zext nneg i32 %205 to i64, !dbg !63 + %255 = getelementptr bfloat, ptr addrspace(1) %3, i64 %254, !dbg !63 + %256 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !64 + %257 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %249, i64 %256, i1 %197) #6, !dbg !64 + %258 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !64 + %259 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %251, i64 %258, i1 %197) #6, !dbg !64 + %260 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !64 + %261 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %253, i64 %260, i1 %197) #6, !dbg !64 + %262 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !64 + %263 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %255, i64 %262, i1 %197) #6, !dbg !64 + %264 = icmp ne i32 %121, 0, !dbg !65 + %265 = or disjoint i32 %208, %198, !dbg !66 + %266 = or disjoint i32 %208, %199, !dbg !66 + %267 = or disjoint i32 %208, %200, !dbg !66 + %268 = or disjoint i32 %208, %201, !dbg !66 + %269 = sext i32 %265 to i64, !dbg !67 + %270 = getelementptr bfloat, ptr addrspace(1) %2, i64 %269, !dbg !67 + %271 = sext i32 %266 to i64, !dbg !67 + %272 = getelementptr bfloat, ptr addrspace(1) %2, i64 %271, !dbg !67 + %273 = sext i32 %267 to i64, !dbg !67 + %274 = getelementptr bfloat, ptr addrspace(1) %2, i64 %273, !dbg !67 + %275 = sext i32 %268 to i64, !dbg !67 + %276 = getelementptr bfloat, ptr addrspace(1) %2, i64 %275, !dbg !67 + %277 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !68 + %278 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %270, i64 %277, i1 %264) #6, !dbg !68 + %279 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !68 + %280 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %272, i64 %279, i1 %264) #6, !dbg !68 + %281 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !68 + %282 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %274, i64 %281, i1 %264) #6, !dbg !68 + %283 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !68 + %284 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %276, i64 %283, i1 %264) #6, !dbg !68 + %285 = zext nneg i32 %198 to i64, !dbg !69 + %286 = getelementptr bfloat, ptr addrspace(1) %3, i64 %285, !dbg !69 + %287 = zext nneg i32 %199 to i64, !dbg !69 + %288 = getelementptr bfloat, ptr addrspace(1) %3, i64 %287, !dbg !69 + %289 = zext nneg i32 %200 to i64, !dbg !69 + %290 = getelementptr bfloat, ptr addrspace(1) %3, i64 %289, !dbg !69 + %291 = zext nneg i32 %201 to i64, !dbg !69 + %292 = getelementptr bfloat, ptr addrspace(1) %3, i64 %291, !dbg !69 + %293 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !70 + %294 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %286, i64 %293, i1 %264) #6, !dbg !70 + %295 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !70 + %296 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %288, i64 %295, i1 %264) #6, !dbg !70 + %297 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !70 + %298 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %290, i64 %297, i1 %264) #6, !dbg !70 + %299 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !70 + %300 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %292, i64 %299, i1 %264) #6, !dbg !70 + %301 = fmul float %.0.i16, %63, !dbg !71 + %302 = fmul float %.0.i16, %64, !dbg !71 + %303 = fmul float %.0.i16, %65, !dbg !71 + %304 = fmul float %.0.i16, %66, !dbg !71 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !72 + %305 = shl nuw nsw i32 %23, 2, !dbg !72 + %306 = select i1 %.not1, i32 0, i32 320, !dbg !72 + %307 = xor i32 %306, %305, !dbg !72 + %308 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %307, !dbg !72 + store <2 x float> %127, ptr addrspace(3) %308, align 8, !dbg !72 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !72 + %309 = shl nuw nsw i32 %19, 3, !dbg !72 + %310 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %309, !dbg !72 + %311 = load float, ptr addrspace(3) %310, align 8, !dbg !72 + %312 = getelementptr inbounds nuw i8, ptr addrspace(3) %310, i32 4, !dbg !72 + %313 = load float, ptr addrspace(3) %312, align 4, !dbg !72 + %314 = xor i32 %309, 320, !dbg !72 + %315 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %314, !dbg !72 + %316 = load float, ptr addrspace(3) %315, align 8, !dbg !72 + %317 = getelementptr inbounds nuw i8, ptr addrspace(3) %315, i32 4, !dbg !72 + %318 = load float, ptr addrspace(3) %317, align 4, !dbg !72 + %319 = fmul float %301, %311, !dbg !72 + %320 = fmul float %302, %313, !dbg !72 + %321 = fmul float %303, %316, !dbg !72 + %322 = fmul float %304, %318, !dbg !72 + %323 = fmul float %319, %138, !dbg !73 + %324 = fmul float %320, %139, !dbg !73 + %325 = fmul float %321, %140, !dbg !73 + %326 = fmul float %322, %141, !dbg !73 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !73 + store float %323, ptr addrspace(3) %149, align 4, !dbg !73 + store float %324, ptr addrspace(3) %152, align 4, !dbg !73 + store float %325, ptr addrspace(3) %155, align 4, !dbg !73 + store float %326, ptr addrspace(3) %158, align 4, !dbg !73 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !73 + %327 = load float, ptr addrspace(3) %166, align 4, !dbg !73 + %328 = load float, ptr addrspace(3) %169, align 4, !dbg !73 + %329 = load float, ptr addrspace(3) %172, align 4, !dbg !73 + %330 = load float, ptr addrspace(3) %175, align 4, !dbg !73 + %331 = add i32 %208, 4097, !dbg !74 + %332 = or disjoint i32 %331, %198, !dbg !75 + %333 = add i32 %208, 4129, !dbg !74 + %334 = or disjoint i32 %333, %198, !dbg !75 + %335 = add i32 %208, 4161, !dbg !74 + %336 = or disjoint i32 %335, %198, !dbg !75 + %337 = add i32 %208, 4193, !dbg !74 + %338 = or disjoint i32 %337, %198, !dbg !75 + %339 = sext i32 %332 to i64, !dbg !76 + %340 = getelementptr bfloat, ptr addrspace(1) %2, i64 %339, !dbg !76 + %341 = sext i32 %334 to i64, !dbg !76 + %342 = getelementptr bfloat, ptr addrspace(1) %2, i64 %341, !dbg !76 + %343 = sext i32 %336 to i64, !dbg !76 + %344 = getelementptr bfloat, ptr addrspace(1) %2, i64 %343, !dbg !76 + %345 = sext i32 %338 to i64, !dbg !76 + %346 = getelementptr bfloat, ptr addrspace(1) %2, i64 %345, !dbg !76 + %347 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !77 + %348 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %340, i64 %347, i1 %197) #6, !dbg !77 + %349 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !77 + %350 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %342, i64 %349, i1 %197) #6, !dbg !77 + %351 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !77 + %352 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %344, i64 %351, i1 %197) #6, !dbg !77 + %353 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !77 + %354 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %346, i64 %353, i1 %197) #6, !dbg !77 + %355 = tail call float @llvm.nvvm.div.full(float %97, float 1.280000e+02), !dbg !78 + %356 = fadd float %355, 0x3EB0C6F7A0000000, !dbg !79 + %357 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !80 + %.not.i18 = icmp eq i32 %357, 0, !dbg !80 + br i1 %.not.i18, label %360, label %358, !dbg !80 + +358: ; preds = %__nv_rsqrtf.exit17 + %359 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %356), !dbg !80 + br label %__nv_rsqrtf.exit20, !dbg !80 + +360: ; preds = %__nv_rsqrtf.exit17 + %361 = tail call float @llvm.nvvm.rsqrt.approx.f(float %356), !dbg !80 + br label %__nv_rsqrtf.exit20, !dbg !80 + +__nv_rsqrtf.exit20: ; preds = %358, %360 + %.0.i19 = phi float [ %359, %358 ], [ %361, %360 ], !dbg !80 + %362 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !80 + %363 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !80 + %364 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !80 + %.not.i27 = icmp eq i32 %364, 0, !dbg !80 + br i1 %.not.i27, label %367, label %365, !dbg !80 + +365: ; preds = %__nv_rsqrtf.exit20 + %366 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %356), !dbg !80 + br label %__nv_rsqrtf.exit29, !dbg !80 + +367: ; preds = %__nv_rsqrtf.exit20 + %368 = tail call float @llvm.nvvm.rsqrt.approx.f(float %356), !dbg !80 + br label %__nv_rsqrtf.exit29, !dbg !80 + +__nv_rsqrtf.exit29: ; preds = %365, %367 + %.0.i28 = phi float [ %366, %365 ], [ %368, %367 ], !dbg !80 + %369 = bitcast i16 %354 to bfloat, !dbg !77 + %370 = fpext bfloat %369 to float, !dbg !81 + %371 = bitcast i16 %352 to bfloat, !dbg !77 + %372 = fpext bfloat %371 to float, !dbg !81 + %373 = bitcast i16 %350 to bfloat, !dbg !77 + %374 = fpext bfloat %373 to float, !dbg !81 + %375 = bitcast i16 %348 to bfloat, !dbg !77 + %376 = fpext bfloat %375 to float, !dbg !81 + %377 = bitcast i16 %228 to bfloat, !dbg !58 + %378 = fpext bfloat %377 to float, !dbg !82 + %379 = fmul float %247, %378, !dbg !62 + %380 = bitcast i16 %263 to bfloat, !dbg !64 + %381 = fpext bfloat %380 to float, !dbg !83 + %382 = fmul float %379, %381, !dbg !84 + %383 = fsub float 0.000000e+00, %382, !dbg !85 + %384 = bitcast i16 %284 to bfloat, !dbg !68 + %385 = fpext bfloat %384 to float, !dbg !86 + %386 = fmul float %247, %385, !dbg !87 + %387 = bitcast i16 %300 to bfloat, !dbg !70 + %388 = fpext bfloat %387 to float, !dbg !88 + %389 = fmul float %386, %388, !dbg !89 + %390 = select i1 %197, float %383, float %389, !dbg !90 + %391 = fmul float %191, %390, !dbg !91 + %392 = fadd float %391, %330, !dbg !92 + %393 = bitcast i16 %226 to bfloat, !dbg !58 + %394 = fpext bfloat %393 to float, !dbg !82 + %395 = fmul float %247, %394, !dbg !62 + %396 = bitcast i16 %261 to bfloat, !dbg !64 + %397 = fpext bfloat %396 to float, !dbg !83 + %398 = fmul float %395, %397, !dbg !84 + %399 = fsub float 0.000000e+00, %398, !dbg !85 + %400 = bitcast i16 %282 to bfloat, !dbg !68 + %401 = fpext bfloat %400 to float, !dbg !86 + %402 = fmul float %247, %401, !dbg !87 + %403 = bitcast i16 %298 to bfloat, !dbg !70 + %404 = fpext bfloat %403 to float, !dbg !88 + %405 = fmul float %402, %404, !dbg !89 + %406 = select i1 %197, float %399, float %405, !dbg !90 + %407 = fmul float %190, %406, !dbg !91 + %408 = fadd float %407, %329, !dbg !92 + %409 = bitcast i16 %224 to bfloat, !dbg !58 + %410 = fpext bfloat %409 to float, !dbg !82 + %411 = fmul float %247, %410, !dbg !62 + %412 = bitcast i16 %259 to bfloat, !dbg !64 + %413 = fpext bfloat %412 to float, !dbg !83 + %414 = fmul float %411, %413, !dbg !84 + %415 = fsub float 0.000000e+00, %414, !dbg !85 + %416 = bitcast i16 %280 to bfloat, !dbg !68 + %417 = fpext bfloat %416 to float, !dbg !86 + %418 = fmul float %247, %417, !dbg !87 + %419 = bitcast i16 %296 to bfloat, !dbg !70 + %420 = fpext bfloat %419 to float, !dbg !88 + %421 = fmul float %418, %420, !dbg !89 + %422 = select i1 %197, float %415, float %421, !dbg !90 + %423 = fmul float %189, %422, !dbg !91 + %424 = fadd float %423, %328, !dbg !92 + %425 = bitcast i16 %222 to bfloat, !dbg !58 + %426 = fpext bfloat %425 to float, !dbg !82 + %427 = fmul float %247, %426, !dbg !62 + %428 = bitcast i16 %257 to bfloat, !dbg !64 + %429 = fpext bfloat %428 to float, !dbg !83 + %430 = fmul float %427, %429, !dbg !84 + %431 = fsub float 0.000000e+00, %430, !dbg !85 + %432 = bitcast i16 %278 to bfloat, !dbg !68 + %433 = fpext bfloat %432 to float, !dbg !86 + %434 = fmul float %247, %433, !dbg !87 + %435 = bitcast i16 %294 to bfloat, !dbg !70 + %436 = fpext bfloat %435 to float, !dbg !88 + %437 = fmul float %434, %436, !dbg !89 + %438 = select i1 %197, float %431, float %437, !dbg !90 + %439 = fmul float %188, %438, !dbg !91 + %440 = fadd float %439, %327, !dbg !92 + %441 = bitcast i32 %196 to <2 x bfloat>, !dbg !51 + %442 = extractelement <2 x bfloat> %441, i64 1, !dbg !51 + %443 = fpext bfloat %442 to float, !dbg !93 + %444 = extractelement <2 x bfloat> %441, i64 0, !dbg !51 + %445 = fpext bfloat %444 to float, !dbg !93 + %446 = extractvalue { i32, i32 } %193, 1, !dbg !49 + %447 = bitcast i32 %446 to <2 x bfloat>, !dbg !49 + %448 = extractelement <2 x bfloat> %447, i64 1, !dbg !49 + %449 = fpext bfloat %448 to float, !dbg !94 + %450 = extractelement <2 x bfloat> %447, i64 0, !dbg !49 + %451 = fpext bfloat %450 to float, !dbg !94 + %452 = extractvalue { i32, i32 } %193, 0, !dbg !49 + %453 = bitcast i32 %452 to <2 x bfloat>, !dbg !49 + %454 = extractelement <2 x bfloat> %453, i64 1, !dbg !49 + %455 = fpext bfloat %454 to float, !dbg !94 + %456 = extractelement <2 x bfloat> %453, i64 0, !dbg !49 + %457 = fpext bfloat %456 to float, !dbg !94 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !95 + store float %.0.i19, ptr addrspace(3) %244, align 4, !dbg !95 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !95 + %458 = load float, ptr addrspace(3) %246, align 4, !dbg !95 + %459 = fmul float %458, %376, !dbg !95 + %460 = fmul float %458, %374, !dbg !95 + %461 = fmul float %458, %372, !dbg !95 + %462 = fmul float %458, %370, !dbg !95 + %463 = getelementptr bfloat, ptr addrspace(1) %6, i64 %248, !dbg !96 + %464 = getelementptr bfloat, ptr addrspace(1) %6, i64 %250, !dbg !96 + %465 = getelementptr bfloat, ptr addrspace(1) %6, i64 %252, !dbg !96 + %466 = getelementptr bfloat, ptr addrspace(1) %6, i64 %254, !dbg !96 + %467 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !97 + %468 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %463, i64 %467, i1 %197) #6, !dbg !97 + %469 = bitcast i16 %468 to bfloat, !dbg !97 + %470 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !97 + %471 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %464, i64 %470, i1 %197) #6, !dbg !97 + %472 = bitcast i16 %471 to bfloat, !dbg !97 + %473 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !97 + %474 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %465, i64 %473, i1 %197) #6, !dbg !97 + %475 = bitcast i16 %474 to bfloat, !dbg !97 + %476 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !97 + %477 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %466, i64 %476, i1 %197) #6, !dbg !97 + %478 = bitcast i16 %477 to bfloat, !dbg !97 + %479 = fpext bfloat %469 to float, !dbg !98 + %480 = fpext bfloat %472 to float, !dbg !98 + %481 = fpext bfloat %475 to float, !dbg !98 + %482 = fpext bfloat %478 to float, !dbg !98 + %483 = fmul float %459, %479, !dbg !99 + %484 = fmul float %460, %480, !dbg !99 + %485 = fmul float %461, %481, !dbg !99 + %486 = fmul float %462, %482, !dbg !99 + %487 = fsub float 0.000000e+00, %483, !dbg !100 + %488 = fsub float 0.000000e+00, %484, !dbg !100 + %489 = fsub float 0.000000e+00, %485, !dbg !100 + %490 = fsub float 0.000000e+00, %486, !dbg !100 + %491 = add i32 %208, 4096, !dbg !101 + %492 = or disjoint i32 %491, %198, !dbg !102 + %493 = add i32 %208, 4128, !dbg !101 + %494 = or disjoint i32 %493, %198, !dbg !102 + %495 = add i32 %208, 4160, !dbg !101 + %496 = or disjoint i32 %495, %198, !dbg !102 + %497 = add i32 %208, 4192, !dbg !101 + %498 = or disjoint i32 %497, %198, !dbg !102 + %499 = sext i32 %492 to i64, !dbg !103 + %500 = getelementptr bfloat, ptr addrspace(1) %2, i64 %499, !dbg !103 + %501 = sext i32 %494 to i64, !dbg !103 + %502 = getelementptr bfloat, ptr addrspace(1) %2, i64 %501, !dbg !103 + %503 = sext i32 %496 to i64, !dbg !103 + %504 = getelementptr bfloat, ptr addrspace(1) %2, i64 %503, !dbg !103 + %505 = sext i32 %498 to i64, !dbg !103 + %506 = getelementptr bfloat, ptr addrspace(1) %2, i64 %505, !dbg !103 + %507 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !104 + %508 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %500, i64 %507, i1 %264) #6, !dbg !104 + %509 = bitcast i16 %508 to bfloat, !dbg !104 + %510 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !104 + %511 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %502, i64 %510, i1 %264) #6, !dbg !104 + %512 = bitcast i16 %511 to bfloat, !dbg !104 + %513 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !104 + %514 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %504, i64 %513, i1 %264) #6, !dbg !104 + %515 = bitcast i16 %514 to bfloat, !dbg !104 + %516 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !104 + %517 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %506, i64 %516, i1 %264) #6, !dbg !104 + %518 = bitcast i16 %517 to bfloat, !dbg !104 + %519 = fpext bfloat %509 to float, !dbg !105 + %520 = fpext bfloat %512 to float, !dbg !105 + %521 = fpext bfloat %515 to float, !dbg !105 + %522 = fpext bfloat %518 to float, !dbg !105 + %523 = fmul float %458, %519, !dbg !106 + %524 = fmul float %458, %520, !dbg !106 + %525 = fmul float %458, %521, !dbg !106 + %526 = fmul float %458, %522, !dbg !106 + %527 = getelementptr bfloat, ptr addrspace(1) %6, i64 %285, !dbg !107 + %528 = getelementptr bfloat, ptr addrspace(1) %6, i64 %287, !dbg !107 + %529 = getelementptr bfloat, ptr addrspace(1) %6, i64 %289, !dbg !107 + %530 = getelementptr bfloat, ptr addrspace(1) %6, i64 %291, !dbg !107 + %531 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !108 + %532 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %527, i64 %531, i1 %264) #6, !dbg !108 + %533 = bitcast i16 %532 to bfloat, !dbg !108 + %534 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !108 + %535 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %528, i64 %534, i1 %264) #6, !dbg !108 + %536 = bitcast i16 %535 to bfloat, !dbg !108 + %537 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !108 + %538 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %529, i64 %537, i1 %264) #6, !dbg !108 + %539 = bitcast i16 %538 to bfloat, !dbg !108 + %540 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !108 + %541 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %530, i64 %540, i1 %264) #6, !dbg !108 + %542 = bitcast i16 %541 to bfloat, !dbg !108 + %543 = fpext bfloat %533 to float, !dbg !109 + %544 = fpext bfloat %536 to float, !dbg !109 + %545 = fpext bfloat %539 to float, !dbg !109 + %546 = fpext bfloat %542 to float, !dbg !109 + %547 = fmul float %523, %543, !dbg !110 + %548 = fmul float %524, %544, !dbg !110 + %549 = fmul float %525, %545, !dbg !110 + %550 = fmul float %526, %546, !dbg !110 + %551 = select i1 %197, float %487, float %547, !dbg !90 + %552 = select i1 %197, float %488, float %548, !dbg !90 + %553 = select i1 %197, float %489, float %549, !dbg !90 + %554 = select i1 %197, float %490, float %550, !dbg !90 + %555 = fmul float %.0.i28, %457, !dbg !111 + %556 = fmul float %.0.i28, %455, !dbg !111 + %557 = fmul float %.0.i28, %451, !dbg !111 + %558 = fmul float %.0.i28, %449, !dbg !111 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !111 + store float %555, ptr addrspace(3) %149, align 4, !dbg !111 + store float %556, ptr addrspace(3) %152, align 4, !dbg !111 + store float %557, ptr addrspace(3) %155, align 4, !dbg !111 + store float %558, ptr addrspace(3) %158, align 4, !dbg !111 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !111 + %559 = load float, ptr addrspace(3) %166, align 4, !dbg !111 + %560 = load float, ptr addrspace(3) %169, align 4, !dbg !111 + %561 = load float, ptr addrspace(3) %172, align 4, !dbg !111 + %562 = load float, ptr addrspace(3) %175, align 4, !dbg !111 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !112 + %563 = shl nuw nsw i32 %14, 3, !dbg !112 + %564 = and i32 %563, 120, !dbg !112 + %565 = lshr i32 %14, 2, !dbg !112 + %566 = and i32 %565, 4, !dbg !112 + %567 = shl nuw nsw i32 %15, 2, !dbg !112 + %568 = or disjoint i32 %566, %567, !dbg !112 + %569 = or disjoint i32 %568, %564, !dbg !112 + %570 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %569, !dbg !112 + store float %445, ptr addrspace(3) %570, align 4, !dbg !112 + %571 = xor i32 %569, 320, !dbg !112 + %572 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %571, !dbg !112 + store float %443, ptr addrspace(3) %572, align 4, !dbg !112 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !112 + %573 = and i32 %21, 120, !dbg !112 + %574 = and i32 %14, 2, !dbg !112 + %575 = icmp eq i32 %574, 0, !dbg !112 + %576 = select i1 %575, i32 0, i32 320, !dbg !112 + %577 = xor i32 %576, %573, !dbg !112 + %578 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %577, !dbg !112 + %579 = load float, ptr addrspace(3) %578, align 8, !dbg !112 + %580 = getelementptr inbounds nuw i8, ptr addrspace(3) %578, i32 4, !dbg !112 + %581 = load float, ptr addrspace(3) %580, align 4, !dbg !112 + %582 = getelementptr inbounds nuw i8, ptr addrspace(3) %578, i32 128, !dbg !112 + %583 = load float, ptr addrspace(3) %582, align 8, !dbg !112 + %584 = getelementptr inbounds nuw i8, ptr addrspace(3) %578, i32 132, !dbg !112 + %585 = load float, ptr addrspace(3) %584, align 4, !dbg !112 + %586 = fmul float %559, %579, !dbg !113 + %587 = fmul float %560, %581, !dbg !113 + %588 = fmul float %561, %583, !dbg !113 + %589 = fmul float %562, %585, !dbg !113 + %590 = fmul float %167, %586, !dbg !112 + %591 = fmul float %170, %587, !dbg !112 + %592 = fmul float %173, %588, !dbg !112 + %593 = fmul float %176, %589, !dbg !112 + %594 = fmul float %188, %551, !dbg !114 + %595 = fmul float %189, %552, !dbg !114 + %596 = fmul float %190, %553, !dbg !114 + %597 = fmul float %191, %554, !dbg !114 + %598 = fadd float %594, %590, !dbg !115 + %599 = fadd float %595, %591, !dbg !115 + %600 = fadd float %596, %592, !dbg !115 + %601 = fadd float %597, %593, !dbg !115 + %602 = shl i32 %17, 7, !dbg !116 + %603 = or disjoint i32 %602, %20, !dbg !117 + %604 = sext i32 %603 to i64, !dbg !118 + %605 = getelementptr bfloat, ptr addrspace(1) %0, i64 %604, !dbg !118 + %606 = fptrunc float %440 to bfloat, !dbg !119 + %607 = fptrunc float %424 to bfloat, !dbg !119 + %608 = fptrunc float %408 to bfloat, !dbg !119 + %609 = fptrunc float %392 to bfloat, !dbg !119 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119 + %610 = and i32 %14, 3, !dbg !119 + %611 = and i32 %14, 28, !dbg !119 + %612 = lshr exact i32 %15, 4, !dbg !119 + %613 = mul nuw nsw i32 %610, 160, !dbg !119 + %614 = or disjoint i32 %613, %612, !dbg !119 + %615 = or disjoint i32 %614, %611, !dbg !119 + %616 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %615, !dbg !119 + store bfloat %606, ptr addrspace(3) %616, align 2, !dbg !119 + %617 = xor i32 %615, 32, !dbg !119 + %618 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %617, !dbg !119 + store bfloat %607, ptr addrspace(3) %618, align 2, !dbg !119 + %619 = xor i32 %615, 64, !dbg !119 + %620 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %619, !dbg !119 + store bfloat %608, ptr addrspace(3) %620, align 2, !dbg !119 + %621 = xor i32 %615, 96, !dbg !119 + %622 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %621, !dbg !119 + store bfloat %609, ptr addrspace(3) %622, align 2, !dbg !119 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119 + %623 = shl nuw nsw i32 %610, 3, !dbg !119 + %624 = shl nuw nsw i32 %144, 2, !dbg !119 + %625 = and i32 %24, 2, !dbg !119 + %626 = select i1 %.not, i32 0, i32 160, !dbg !119 + %627 = or disjoint i32 %623, %624, !dbg !119 + %628 = xor i32 %627, %626, !dbg !119 + %629 = or disjoint i32 %628, %625, !dbg !119 + %630 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %629, !dbg !119 + %631 = load bfloat, ptr addrspace(3) %630, align 2, !dbg !119 + %632 = getelementptr inbounds nuw i8, ptr addrspace(3) %630, i32 4, !dbg !119 + %633 = load bfloat, ptr addrspace(3) %632, align 2, !dbg !119 + %634 = xor i32 %629, 320, !dbg !119 + %635 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %634, !dbg !119 + %636 = load bfloat, ptr addrspace(3) %635, align 2, !dbg !119 + %637 = getelementptr inbounds nuw i8, ptr addrspace(3) %635, i32 4, !dbg !119 + %638 = load bfloat, ptr addrspace(3) %637, align 2, !dbg !119 + %639 = insertelement <2 x bfloat> poison, bfloat %631, i64 0, !dbg !119 + %640 = insertelement <2 x bfloat> %639, bfloat %636, i64 1, !dbg !119 + %641 = bitcast <2 x bfloat> %640 to i32, !dbg !119 + %642 = insertelement <2 x bfloat> poison, bfloat %633, i64 0, !dbg !119 + %643 = insertelement <2 x bfloat> %642, bfloat %638, i64 1, !dbg !119 + %644 = bitcast <2 x bfloat> %643 to i32, !dbg !119 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %641, i32 %644, ptr addrspace(1) %605, i1 true) #6, !dbg !119 + %645 = getelementptr bfloat, ptr addrspace(1) %1, i64 %604, !dbg !120 + %646 = fptrunc float %598 to bfloat, !dbg !121 + %647 = fptrunc float %599 to bfloat, !dbg !121 + %648 = fptrunc float %600 to bfloat, !dbg !121 + %649 = fptrunc float %601 to bfloat, !dbg !121 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !121 + store bfloat %646, ptr addrspace(3) %616, align 2, !dbg !121 + store bfloat %647, ptr addrspace(3) %618, align 2, !dbg !121 + store bfloat %648, ptr addrspace(3) %620, align 2, !dbg !121 + store bfloat %649, ptr addrspace(3) %622, align 2, !dbg !121 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !121 + %650 = load bfloat, ptr addrspace(3) %630, align 2, !dbg !121 + %651 = load bfloat, ptr addrspace(3) %632, align 2, !dbg !121 + %652 = load bfloat, ptr addrspace(3) %635, align 2, !dbg !121 + %653 = load bfloat, ptr addrspace(3) %637, align 2, !dbg !121 + %654 = insertelement <2 x bfloat> poison, bfloat %650, i64 0, !dbg !121 + %655 = insertelement <2 x bfloat> %654, bfloat %652, i64 1, !dbg !121 + %656 = bitcast <2 x bfloat> %655 to i32, !dbg !121 + %657 = insertelement <2 x bfloat> poison, bfloat %651, i64 0, !dbg !121 + %658 = insertelement <2 x bfloat> %657, bfloat %653, i64 1, !dbg !121 + %659 = bitcast <2 x bfloat> %658 to i32, !dbg !121 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %656, i32 %659, ptr addrspace(1) %645, i1 true) #6, !dbg !121 + ret void, !dbg !122 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="64" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", linkageName: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 23, column: 33, scope: !5) +!10 = !DILocation(line: 24, column: 44, scope: !5) +!11 = !DILocation(line: 24, column: 23, scope: !5) +!12 = !DILocation(line: 26, column: 37, scope: !5) +!13 = !DILocation(line: 29, column: 19, scope: !5) +!14 = !DILocation(line: 28, column: 19, scope: !5) +!15 = !DILocation(line: 39, column: 41, scope: !5) +!16 = !DILocation(line: 39, column: 52, scope: !5) +!17 = !DILocation(line: 39, column: 48, scope: !5) +!18 = !DILocation(line: 39, column: 63, scope: !5) +!19 = !DILocation(line: 39, column: 57, scope: !5) +!20 = !DILocation(line: 39, column: 34, scope: !5) +!21 = !DILocation(line: 39, column: 68, scope: !5) +!22 = !DILocation(line: 39, column: 121, scope: !5) +!23 = !DILocation(line: 40, column: 41, scope: !5) +!24 = !DILocation(line: 40, column: 50, scope: !5) +!25 = !DILocation(line: 40, column: 34, scope: !5) +!26 = !DILocation(line: 40, column: 61, scope: !5) +!27 = !DILocation(line: 40, column: 114, scope: !5) +!28 = !DILocation(line: 42, column: 22, scope: !5) +!29 = !DILocation(line: 47, column: 22, scope: !5) +!30 = !DILocation(line: 263, column: 15, scope: !31, inlinedAt: !33) +!31 = distinct !DILexicalBlockFile(scope: !5, file: !32, discriminator: 0) +!32 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!33 = !DILocation(line: 293, column: 36, scope: !31, inlinedAt: !34) +!34 = !DILocation(line: 51, column: 25, scope: !35) +!35 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!36 = !DILocation(line: 263, column: 15, scope: !31, inlinedAt: !37) +!37 = !DILocation(line: 293, column: 36, scope: !31, inlinedAt: !38) +!38 = !DILocation(line: 52, column: 27, scope: !35) +!39 = !DILocation(line: 58, column: 27, scope: !5) +!40 = !DILocation(line: 62, column: 35, scope: !5) +!41 = !DILocation(line: 62, column: 42, scope: !5) +!42 = !DILocation(line: 62, column: 95, scope: !5) +!43 = !DILocation(line: 63, column: 46, scope: !5) +!44 = !DILocation(line: 63, column: 42, scope: !5) +!45 = !DILocation(line: 63, column: 35, scope: !5) +!46 = !DILocation(line: 63, column: 51, scope: !5) +!47 = !DILocation(line: 64, column: 35, scope: !5) +!48 = !DILocation(line: 64, column: 51, scope: !5) +!49 = !DILocation(line: 65, column: 69, scope: !5) +!50 = !DILocation(line: 66, column: 36, scope: !5) +!51 = !DILocation(line: 66, column: 43, scope: !5) +!52 = !DILocation(line: 71, column: 24, scope: !5) +!53 = !DILocation(line: 72, column: 41, scope: !5) +!54 = !DILocation(line: 72, column: 39, scope: !5) +!55 = !DILocation(line: 72, column: 48, scope: !5) +!56 = !DILocation(line: 72, column: 57, scope: !5) +!57 = !DILocation(line: 72, column: 35, scope: !5) +!58 = !DILocation(line: 72, column: 68, scope: !5) +!59 = !DILocation(line: 75, column: 25, scope: !5) +!60 = !DILocation(line: 77, column: 24, scope: !5) +!61 = !DILocation(line: 78, column: 32, scope: !5) +!62 = !DILocation(line: 79, column: 24, scope: !5) +!63 = !DILocation(line: 80, column: 35, scope: !5) +!64 = !DILocation(line: 80, column: 85, scope: !5) +!65 = !DILocation(line: 87, column: 25, scope: !5) +!66 = !DILocation(line: 90, column: 53, scope: !5) +!67 = !DILocation(line: 90, column: 35, scope: !5) +!68 = !DILocation(line: 90, column: 64, scope: !5) +!69 = !DILocation(line: 98, column: 35, scope: !5) +!70 = !DILocation(line: 98, column: 81, scope: !5) +!71 = !DILocation(line: 111, column: 24, scope: !5) +!72 = !DILocation(line: 113, column: 24, scope: !5) +!73 = !DILocation(line: 116, column: 24, scope: !5) +!74 = !DILocation(line: 121, column: 51, scope: !5) +!75 = !DILocation(line: 121, column: 60, scope: !5) +!76 = !DILocation(line: 121, column: 35, scope: !5) +!77 = !DILocation(line: 121, column: 71, scope: !5) +!78 = !DILocation(line: 123, column: 24, scope: !5) +!79 = !DILocation(line: 124, column: 24, scope: !5) +!80 = !DILocation(line: 125, column: 32, scope: !5) +!81 = !DILocation(line: 121, column: 132, scope: !5) +!82 = !DILocation(line: 72, column: 129, scope: !5) +!83 = !DILocation(line: 80, column: 146, scope: !5) +!84 = !DILocation(line: 82, column: 24, scope: !5) +!85 = !DILocation(line: 84, column: 17, scope: !5) +!86 = !DILocation(line: 90, column: 125, scope: !5) +!87 = !DILocation(line: 97, column: 24, scope: !5) +!88 = !DILocation(line: 98, column: 142, scope: !5) +!89 = !DILocation(line: 100, column: 24, scope: !5) +!90 = !DILocation(line: 0, scope: !5) +!91 = !DILocation(line: 118, column: 24, scope: !5) +!92 = !DILocation(line: 119, column: 24, scope: !5) +!93 = !DILocation(line: 66, column: 96, scope: !5) +!94 = !DILocation(line: 65, column: 123, scope: !5) +!95 = !DILocation(line: 126, column: 24, scope: !5) +!96 = !DILocation(line: 127, column: 35, scope: !5) +!97 = !DILocation(line: 127, column: 85, scope: !5) +!98 = !DILocation(line: 127, column: 146, scope: !5) +!99 = !DILocation(line: 129, column: 24, scope: !5) +!100 = !DILocation(line: 131, column: 17, scope: !5) +!101 = !DILocation(line: 134, column: 51, scope: !5) +!102 = !DILocation(line: 134, column: 60, scope: !5) +!103 = !DILocation(line: 134, column: 35, scope: !5) +!104 = !DILocation(line: 134, column: 71, scope: !5) +!105 = !DILocation(line: 134, column: 132, scope: !5) +!106 = !DILocation(line: 139, column: 24, scope: !5) +!107 = !DILocation(line: 140, column: 35, scope: !5) +!108 = !DILocation(line: 140, column: 81, scope: !5) +!109 = !DILocation(line: 140, column: 142, scope: !5) +!110 = !DILocation(line: 142, column: 24, scope: !5) +!111 = !DILocation(line: 151, column: 25, scope: !5) +!112 = !DILocation(line: 156, column: 26, scope: !5) +!113 = !DILocation(line: 153, column: 26, scope: !5) +!114 = !DILocation(line: 158, column: 26, scope: !5) +!115 = !DILocation(line: 159, column: 26, scope: !5) +!116 = !DILocation(line: 161, column: 43, scope: !5) +!117 = !DILocation(line: 161, column: 39, scope: !5) +!118 = !DILocation(line: 161, column: 32, scope: !5) +!119 = !DILocation(line: 161, column: 55, scope: !5) +!120 = !DILocation(line: 162, column: 32, scope: !5) +!121 = !DILocation(line: 162, column: 56, scope: !5) +!122 = !DILocation(line: 53, column: 4, scope: !5) diff --git a/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..7aae02eb020363fadb3f82846e2d73e821f99a1b --- /dev/null +++ b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx @@ -0,0 +1,1435 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 // -- Begin function triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 +.visible .entry triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0( + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6, + .param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_7, + .param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_8, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_9, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_10 +) +.reqntid 64 +{ + .reg .pred %p<4>; + .reg .b16 %rs<66>; + .reg .b32 %r<335>; + .reg .b64 %rd<96>; + .loc 1 18 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0 + +// %bb.0: // %__nv_rsqrtf.exit + ld.param.b64 %rd80, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0]; + ld.param.b64 %rd81, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:28 + mov.u32 %r22, %ctaid.x; + .loc 1 23 33 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:33 + shl.b32 %r23, %r22, 1; + ld.param.b64 %rd82, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2]; + ld.param.b64 %rd83, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3]; + .loc 1 24 44 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44 + mov.u32 %r24, %tid.x; + and.b32 %r25, %r24, 32; + bfe.s32 %r26, %r24, 5, 1; + ld.param.b64 %rd84, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4]; + ld.param.b64 %rd85, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5]; + bfe.u32 %r27, %r24, 5, 1; + ld.param.b64 %rd86, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6]; + and.b32 %r28, %r24, 1; + neg.s32 %r29, %r28; + .loc 1 24 23 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23 + or.b32 %r30, %r27, %r23; + or.b32 %r31, %r23, %r28; + .loc 1 26 37 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37 + and.b32 %r32, %r24, 31; + shl.b32 %r33, %r32, 2; + shl.b32 %r34, %r24, 1; + and.b32 %r35, %r34, 126; + and.b32 %r36, %r24, 62; + shr.u32 %r37, %r24, 1; + .loc 1 29 19 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19 + bfe.s32 %r38, %r22, 30, 1; + shr.u32 %r39, %r38, 27; + add.s32 %r40, %r30, %r39; + shr.s32 %r41, %r40, 5; + .loc 1 28 19 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:28:19 + and.b32 %r42, %r40, 33554400; + sub.s32 %r43, %r30, %r42; + .loc 1 29 19 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19 + add.s32 %r44, %r31, %r39; + .loc 1 39 52 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:52 + shl.b32 %r45, %r43, 7; + .loc 1 39 48 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:48 + or.b32 %r46, %r45, %r33; + mad.lo.s32 %r47, %r41, 36864, %r46; + .loc 1 39 57 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:57 + add.s32 %r48, %r47, 4096; + .loc 1 39 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34 + mad.wide.s32 %rd1, %r48, 2, %rd82; + .loc 1 39 68 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r3, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r1, %r3; + mov.u32 %r2, %r3; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2; + // end inline asm + mov.b32 {%rs34, %rs35}, %r1; + mov.b32 {%rs36, %rs37}, %r2; + .loc 1 39 121 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:121 + cvt.f32.bf16 %r49, %rs34; + cvt.f32.bf16 %r50, %rs35; + cvt.f32.bf16 %r51, %rs36; + cvt.f32.bf16 %r52, %rs37; + .loc 1 40 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34 + mad.wide.s32 %rd3, %r47, 2, %rd82; + .loc 1 40 61 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61 + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r4, %r3; + mov.u32 %r5, %r3; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r4, %r5 }, [ %rd3 + 0 ], %rd4; + // end inline asm + mov.b32 {%rs38, %rs39}, %r4; + mov.b32 {%rs40, %rs41}, %r5; + .loc 1 40 114 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114 + cvt.f32.bf16 %r53, %rs38; + cvt.f32.bf16 %r54, %rs39; + cvt.f32.bf16 %r55, %rs40; + cvt.f32.bf16 %r56, %rs41; + .loc 1 42 22 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:42:22 + mul.f32 %r57, %r50, %r50; + .loc 1 47 22 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:47:22 + mul.f32 %r58, %r54, %r54; +$L__tmp1: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + fma.rn.f32 %r59, %r49, %r49, %r57; + fma.rn.f32 %r60, %r51, %r51, %r59; + fma.rn.f32 %r61, %r52, %r52, %r60; +$L__tmp2: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r62, %r61, 16, 31, -1; +$L__tmp3: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r63, %r61, %r62; +$L__tmp4: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r64, %r63, 8, 31, -1; +$L__tmp5: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r65, %r63, %r64; +$L__tmp6: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r66, %r65, 4, 31, -1; +$L__tmp7: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r67, %r65, %r66; +$L__tmp8: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r68, %r67, 2, 31, -1; +$L__tmp9: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r69, %r67, %r68; +$L__tmp10: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r70, %r69, 1, 31, -1; +$L__tmp11: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r71, %r69, %r70; +$L__tmp12: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + fma.rn.f32 %r72, %r53, %r53, %r58; + fma.rn.f32 %r73, %r55, %r55, %r72; + fma.rn.f32 %r74, %r56, %r56, %r73; +$L__tmp13: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r75, %r74, 16, 31, -1; +$L__tmp14: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r76, %r74, %r75; +$L__tmp15: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r77, %r76, 8, 31, -1; +$L__tmp16: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r78, %r76, %r77; +$L__tmp17: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r79, %r78, 4, 31, -1; +$L__tmp18: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r80, %r78, %r79; +$L__tmp19: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r81, %r80, 2, 31, -1; +$L__tmp20: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r82, %r80, %r81; +$L__tmp21: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r83, %r82, 1, 31, -1; +$L__tmp22: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r84, %r82, %r83; +$L__tmp23: + .loc 1 62 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:35 + mul.wide.u32 %rd87, %r35, 2; + add.s64 %rd5, %rd83, %rd87; + .loc 1 62 42 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:42 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r6, %r3; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b32 { %r6 }, [ %rd5 + 0 ], %rd6; + // end inline asm + .loc 1 62 95 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:95 + mov.b32 {%rs42, %rs43}, %r6; + cvt.f32.bf16 %r85, %rs43; + cvt.f32.bf16 %r86, %rs42; + .loc 1 63 46 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:46 + shl.b32 %r87, %r41, 7; + .loc 1 63 42 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:42 + or.b32 %r88, %r87, %r33; + .loc 1 63 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:35 + mul.wide.s32 %rd88, %r88, 4; + add.s64 %rd7, %rd84, %rd88; + .loc 1 63 51 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:51 + // begin inline asm + mov.u64 %rd8, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd8, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r7, %r3; + mov.u32 %r8, %r3; + mov.u32 %r9, %r3; + mov.u32 %r10, %r3; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r7, %r8, %r9, %r10 }, [ %rd7 + 0 ], %rd8; + // end inline asm + shl.b32 %r89, %r24, 4; + and.b32 %r90, %r89, 112; + and.b32 %r91, %r24, 24; + shr.u32 %r92, %r91, 1; + and.b32 %r93, %r26, 192; + or.b32 %r94, %r90, %r92; + xor.b32 %r95, %r94, %r93; + mov.b32 %r96, global_smem; + add.s32 %r97, %r96, %r95; + st.shared.b32 [%r97], %r7; + xor.b32 %r98, %r95, 4; + add.s32 %r99, %r96, %r98; + st.shared.b32 [%r99+256], %r8; + xor.b32 %r100, %r95, 8; + add.s32 %r101, %r96, %r100; + st.shared.b32 [%r101+512], %r9; + xor.b32 %r102, %r95, 12; + add.s32 %r103, %r96, %r102; + st.shared.b32 [%r103+768], %r10; + bar.sync 0; + shl.b32 %r104, %r24, 7; + and.b32 %r105, %r104, 768; + shl.b32 %r106, %r36, 1; + and.b32 %r107, %r29, 192; + xor.b32 %r108, %r107, %r106; + or.b32 %r109, %r108, %r105; + add.s32 %r110, %r96, %r109; + ld.shared.b32 %r111, [%r110]; + xor.b32 %r112, %r109, 4; + add.s32 %r113, %r96, %r112; + ld.shared.b32 %r114, [%r113]; + xor.b32 %r115, %r109, 8; + add.s32 %r116, %r96, %r115; + ld.shared.b32 %r117, [%r116]; + xor.b32 %r118, %r109, 12; + add.s32 %r119, %r96, %r118; + ld.shared.b32 %r120, [%r119]; + .loc 1 64 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:35 + add.s64 %rd9, %rd85, %rd88; + .loc 1 64 51 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:51 + // begin inline asm + mov.u64 %rd10, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd10, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r11, %r3; + mov.u32 %r12, %r3; + mov.u32 %r13, %r3; + mov.u32 %r14, %r3; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r11, %r12, %r13, %r14 }, [ %rd9 + 0 ], %rd10; + // end inline asm + bar.sync 0; + st.shared.b32 [%r97], %r11; + st.shared.b32 [%r99+256], %r12; + st.shared.b32 [%r101+512], %r13; + st.shared.b32 [%r103+768], %r14; + bar.sync 0; + ld.shared.b32 %r121, [%r110]; + ld.shared.b32 %r122, [%r113]; + ld.shared.b32 %r123, [%r116]; + ld.shared.b32 %r124, [%r119]; + .loc 1 65 69 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69 + // begin inline asm + mov.u64 %rd11, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd11, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r15, %r3; + mov.u32 %r16, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r15, %r16 }, [ %rd1 + 0 ], %rd11; + // end inline asm + .loc 1 66 36 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:36 + add.s64 %rd12, %rd86, %rd87; + .loc 1 66 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:43 + // begin inline asm + mov.u64 %rd13, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd13, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r17, %r3; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b32 { %r17 }, [ %rd12 + 0 ], %rd13; + // end inline asm + .loc 1 71 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:71:24 + and.b32 %r125, %r37, 1; + setp.ne.b32 %p3, %r125, 0; + not.pred %p2, %p3; + .loc 1 72 41 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:41 + and.b32 %r126, %r37, 30; + .loc 1 72 48 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:48 + shl.b32 %r127, %r31, 7; + shl.b32 %r128, %r44, 10; + and.b32 %r129, %r128, -32768; + add.s32 %r130, %r129, %r127; + .loc 1 72 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35 + cvt.s64.s32 %rd89, %r130; + cvt.u64.u32 %rd90, %r126; + or.b64 %rd91, %rd89, %rd90; + shl.b64 %rd92, %rd91, 1; + add.s64 %rd93, %rd82, %rd92; + add.s64 %rd14, %rd93, 2; + add.s64 %rd16, %rd93, 66; + add.s64 %rd18, %rd93, 130; + add.s64 %rd20, %rd93, 194; + .loc 1 72 68 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:68 + // begin inline asm + mov.u64 %rd15, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd15, 1.0; + // end inline asm + mov.b16 %rs2, 0; + // begin inline asm + mov.u16 %rs1, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs1 }, [ %rd14 + 0 ], %rd15; + // end inline asm + // begin inline asm + mov.u64 %rd17, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs3, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs3 }, [ %rd16 + 0 ], %rd17; + // end inline asm + // begin inline asm + mov.u64 %rd19, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd19, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs4, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs4 }, [ %rd18 + 0 ], %rd19; + // end inline asm + // begin inline asm + mov.u64 %rd21, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd21, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs5, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs5 }, [ %rd20 + 0 ], %rd21; + // end inline asm + mov.b32 %r131, 0f43000000; + .loc 1 75 25 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:75:25 + div.full.f32 %r132, %r84, %r131; + .loc 1 77 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:77:24 + add.f32 %r133, %r132, 0f358637BD; + .loc 1 78 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:78:32 + rsqrt.approx.ftz.f32 %r134, %r133; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + bar.sync 0; + shr.u32 %r135, %r25, 3; + add.s32 %r136, %r96, %r135; + st.shared.b32 [%r136], %r134; + bar.sync 0; + shl.b32 %r137, %r28, 2; + add.s32 %r138, %r96, %r137; + ld.shared.b32 %r139, [%r138]; + .loc 1 80 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:35 + mul.wide.u32 %rd94, %r126, 2; + add.s64 %rd38, %rd83, %rd94; + add.s64 %rd22, %rd38, 2; + add.s64 %rd24, %rd38, 66; + add.s64 %rd26, %rd38, 130; + add.s64 %rd28, %rd38, 194; + .loc 1 80 85 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:85 + // begin inline asm + mov.u64 %rd23, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd23, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs6, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs6 }, [ %rd22 + 0 ], %rd23; + // end inline asm + // begin inline asm + mov.u64 %rd25, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd25, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs7, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs7 }, [ %rd24 + 0 ], %rd25; + // end inline asm + // begin inline asm + mov.u64 %rd27, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd27, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs8, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs8 }, [ %rd26 + 0 ], %rd27; + // end inline asm + // begin inline asm + mov.u64 %rd29, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd29, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs9, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd28 + 0 ], %rd29; + // end inline asm + .loc 1 90 53 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:53 + or.b32 %r140, %r130, %r126; + .loc 1 90 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:35 + mad.wide.s32 %rd30, %r140, 2, %rd82; + add.s64 %rd32, %rd93, 64; + add.s64 %rd34, %rd93, 128; + add.s64 %rd36, %rd93, 192; + .loc 1 90 64 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:64 + // begin inline asm + mov.u64 %rd31, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd31, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs10, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs10 }, [ %rd30 + 0 ], %rd31; + // end inline asm + // begin inline asm + mov.u64 %rd33, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd33, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs11, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd32 + 0 ], %rd33; + // end inline asm + // begin inline asm + mov.u64 %rd35, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd35, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs12, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd34 + 0 ], %rd35; + // end inline asm + // begin inline asm + mov.u64 %rd37, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd37, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs13, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd36 + 0 ], %rd37; + // end inline asm + .loc 1 98 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:35 + add.s64 %rd40, %rd38, 64; + add.s64 %rd42, %rd38, 128; + add.s64 %rd44, %rd38, 192; + .loc 1 98 81 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:81 + // begin inline asm + mov.u64 %rd39, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd39, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs14, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs14 }, [ %rd38 + 0 ], %rd39; + // end inline asm + // begin inline asm + mov.u64 %rd41, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd41, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs15, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd40 + 0 ], %rd41; + // end inline asm + // begin inline asm + mov.u64 %rd43, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd43, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs16, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs16 }, [ %rd42 + 0 ], %rd43; + // end inline asm + // begin inline asm + mov.u64 %rd45, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd45, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs17, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd44 + 0 ], %rd45; + // end inline asm + .loc 1 111 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:111:24 + mul.f32 %r141, %r134, %r53; + mul.f32 %r142, %r134, %r54; + mul.f32 %r143, %r134, %r55; + mul.f32 %r144, %r134, %r56; + .loc 1 113 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:113:24 + bar.sync 0; + shl.b32 %r145, %r36, 2; + and.b32 %r146, %r29, 320; + xor.b32 %r147, %r146, %r145; + add.s32 %r148, %r96, %r147; + st.shared.v2.b32 [%r148], {%r86, %r85}; + bar.sync 0; + shl.b32 %r149, %r32, 3; + add.s32 %r150, %r96, %r149; + ld.shared.v2.b32 {%r151, %r152}, [%r150]; + xor.b32 %r153, %r149, 64; + add.s32 %r154, %r96, %r153; + ld.shared.v2.b32 {%r155, %r156}, [%r154+256]; + mul.f32 %r157, %r141, %r151; + mul.f32 %r158, %r142, %r152; + mul.f32 %r159, %r143, %r155; + mul.f32 %r160, %r144, %r156; + .loc 1 116 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:116:24 + mul.f32 %r161, %r157, %r7; + mul.f32 %r162, %r158, %r8; + mul.f32 %r163, %r159, %r9; + mul.f32 %r164, %r160, %r10; + bar.sync 0; + st.shared.b32 [%r97], %r161; + st.shared.b32 [%r99+256], %r162; + st.shared.b32 [%r101+512], %r163; + st.shared.b32 [%r103+768], %r164; + bar.sync 0; + ld.shared.b32 %r165, [%r110]; + ld.shared.b32 %r166, [%r113]; + ld.shared.b32 %r167, [%r116]; + ld.shared.b32 %r168, [%r119]; + .loc 1 121 60 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:60 + add.s32 %r169, %r140, 4097; + add.s32 %r170, %r140, 4129; + add.s32 %r171, %r140, 4161; + add.s32 %r172, %r140, 4193; + .loc 1 121 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:35 + mad.wide.s32 %rd46, %r169, 2, %rd82; + mad.wide.s32 %rd48, %r170, 2, %rd82; + mad.wide.s32 %rd50, %r171, 2, %rd82; + mad.wide.s32 %rd52, %r172, 2, %rd82; + .loc 1 121 71 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:71 + // begin inline asm + mov.u64 %rd47, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd47, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs18, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs18 }, [ %rd46 + 0 ], %rd47; + // end inline asm + // begin inline asm + mov.u64 %rd49, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd49, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs19, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd48 + 0 ], %rd49; + // end inline asm + // begin inline asm + mov.u64 %rd51, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd51, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs20, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs20 }, [ %rd50 + 0 ], %rd51; + // end inline asm + // begin inline asm + mov.u64 %rd53, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd53, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs21, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs21 }, [ %rd52 + 0 ], %rd53; + // end inline asm + .loc 1 123 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:123:24 + div.full.f32 %r173, %r71, %r131; + .loc 1 124 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:124:24 + add.f32 %r174, %r173, 0f358637BD; + .loc 1 125 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:125:32 + rsqrt.approx.ftz.f32 %r175, %r174; + .loc 1 121 132 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:132 + cvt.f32.bf16 %r176, %rs21; + cvt.f32.bf16 %r177, %rs20; + cvt.f32.bf16 %r178, %rs19; + cvt.f32.bf16 %r179, %rs18; + .loc 1 72 129 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129 + cvt.f32.bf16 %r180, %rs5; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + mul.f32 %r181, %r139, %r180; + .loc 1 80 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146 + cvt.f32.bf16 %r182, %rs9; + .loc 1 84 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17 + neg.f32 %r183, %r181; + fma.rn.f32 %r184, %r183, %r182, 0f00000000; + .loc 1 90 125 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125 + cvt.f32.bf16 %r185, %rs13; + .loc 1 97 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24 + mul.f32 %r186, %r139, %r185; + .loc 1 98 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142 + cvt.f32.bf16 %r187, %rs17; + .loc 1 100 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24 + mul.f32 %r188, %r186, %r187; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r189, %r188, %r184, %p3; + .loc 1 119 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24 + fma.rn.f32 %r190, %r124, %r189, %r168; + .loc 1 72 129 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129 + cvt.f32.bf16 %r191, %rs4; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + mul.f32 %r192, %r139, %r191; + .loc 1 80 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146 + cvt.f32.bf16 %r193, %rs8; + .loc 1 84 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17 + neg.f32 %r194, %r192; + fma.rn.f32 %r195, %r194, %r193, 0f00000000; + .loc 1 90 125 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125 + cvt.f32.bf16 %r196, %rs12; + .loc 1 97 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24 + mul.f32 %r197, %r139, %r196; + .loc 1 98 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142 + cvt.f32.bf16 %r198, %rs16; + .loc 1 100 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24 + mul.f32 %r199, %r197, %r198; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r200, %r199, %r195, %p3; + .loc 1 119 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24 + fma.rn.f32 %r201, %r123, %r200, %r167; + .loc 1 72 129 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129 + cvt.f32.bf16 %r202, %rs3; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + mul.f32 %r203, %r139, %r202; + .loc 1 80 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146 + cvt.f32.bf16 %r204, %rs7; + .loc 1 84 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17 + neg.f32 %r205, %r203; + fma.rn.f32 %r206, %r205, %r204, 0f00000000; + .loc 1 90 125 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125 + cvt.f32.bf16 %r207, %rs11; + .loc 1 97 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24 + mul.f32 %r208, %r139, %r207; + .loc 1 98 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142 + cvt.f32.bf16 %r209, %rs15; + .loc 1 100 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24 + mul.f32 %r210, %r208, %r209; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r211, %r210, %r206, %p3; + .loc 1 119 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24 + fma.rn.f32 %r212, %r122, %r211, %r166; + .loc 1 72 129 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129 + cvt.f32.bf16 %r213, %rs1; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + mul.f32 %r214, %r139, %r213; + .loc 1 80 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146 + cvt.f32.bf16 %r215, %rs6; + .loc 1 84 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17 + neg.f32 %r216, %r214; + fma.rn.f32 %r217, %r216, %r215, 0f00000000; + .loc 1 90 125 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125 + cvt.f32.bf16 %r218, %rs10; + .loc 1 97 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24 + mul.f32 %r219, %r139, %r218; + .loc 1 98 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142 + cvt.f32.bf16 %r220, %rs14; + .loc 1 100 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24 + mul.f32 %r221, %r219, %r220; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r222, %r221, %r217, %p3; + .loc 1 119 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24 + fma.rn.f32 %r223, %r121, %r222, %r165; + .loc 1 66 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:43 + mov.b32 {%rs44, %rs45}, %r17; + .loc 1 66 96 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:96 + cvt.f32.bf16 %r224, %rs45; + cvt.f32.bf16 %r225, %rs44; + .loc 1 65 69 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69 + mov.b32 {%rs46, %rs47}, %r16; + .loc 1 65 123 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:123 + cvt.f32.bf16 %r226, %rs47; + cvt.f32.bf16 %r227, %rs46; + .loc 1 65 69 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69 + mov.b32 {%rs48, %rs49}, %r15; + .loc 1 65 123 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:123 + cvt.f32.bf16 %r228, %rs49; + cvt.f32.bf16 %r229, %rs48; + .loc 1 126 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24 + bar.sync 0; + st.shared.b32 [%r136], %r175; + bar.sync 0; + ld.shared.b32 %r230, [%r138]; + mul.f32 %r231, %r230, %r179; + mul.f32 %r232, %r230, %r178; + mul.f32 %r233, %r230, %r177; + mul.f32 %r234, %r230, %r176; + .loc 1 127 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:35 + add.s64 %rd70, %rd86, %rd94; + add.s64 %rd54, %rd70, 2; + add.s64 %rd56, %rd70, 66; + add.s64 %rd58, %rd70, 130; + add.s64 %rd60, %rd70, 194; + .loc 1 127 85 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:85 + // begin inline asm + mov.u64 %rd55, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd55, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs22, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs22 }, [ %rd54 + 0 ], %rd55; + // end inline asm + // begin inline asm + mov.u64 %rd57, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd57, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs23, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs23 }, [ %rd56 + 0 ], %rd57; + // end inline asm + // begin inline asm + mov.u64 %rd59, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd59, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs24, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs24 }, [ %rd58 + 0 ], %rd59; + // end inline asm + // begin inline asm + mov.u64 %rd61, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd61, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs25, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs25 }, [ %rd60 + 0 ], %rd61; + // end inline asm + .loc 1 127 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:146 + cvt.f32.bf16 %r235, %rs22; + cvt.f32.bf16 %r236, %rs23; + cvt.f32.bf16 %r237, %rs24; + cvt.f32.bf16 %r238, %rs25; + .loc 1 131 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:131:17 + neg.f32 %r239, %r231; + fma.rn.f32 %r240, %r239, %r235, 0f00000000; + neg.f32 %r241, %r232; + fma.rn.f32 %r242, %r241, %r236, 0f00000000; + neg.f32 %r243, %r233; + fma.rn.f32 %r244, %r243, %r237, 0f00000000; + neg.f32 %r245, %r234; + fma.rn.f32 %r246, %r245, %r238, 0f00000000; + .loc 1 134 60 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:60 + add.s32 %r247, %r140, 4096; + add.s32 %r248, %r140, 4128; + add.s32 %r249, %r140, 4160; + add.s32 %r250, %r140, 4192; + .loc 1 134 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:35 + mad.wide.s32 %rd62, %r247, 2, %rd82; + mad.wide.s32 %rd64, %r248, 2, %rd82; + mad.wide.s32 %rd66, %r249, 2, %rd82; + mad.wide.s32 %rd68, %r250, 2, %rd82; + .loc 1 134 71 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:71 + // begin inline asm + mov.u64 %rd63, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd63, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs26, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs26 }, [ %rd62 + 0 ], %rd63; + // end inline asm + // begin inline asm + mov.u64 %rd65, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd65, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs27, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs27 }, [ %rd64 + 0 ], %rd65; + // end inline asm + // begin inline asm + mov.u64 %rd67, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd67, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs28, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs28 }, [ %rd66 + 0 ], %rd67; + // end inline asm + // begin inline asm + mov.u64 %rd69, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd69, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs29, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs29 }, [ %rd68 + 0 ], %rd69; + // end inline asm + .loc 1 134 132 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:132 + cvt.f32.bf16 %r251, %rs26; + cvt.f32.bf16 %r252, %rs27; + cvt.f32.bf16 %r253, %rs28; + cvt.f32.bf16 %r254, %rs29; + .loc 1 139 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:139:24 + mul.f32 %r255, %r230, %r251; + mul.f32 %r256, %r230, %r252; + mul.f32 %r257, %r230, %r253; + mul.f32 %r258, %r230, %r254; + .loc 1 140 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:35 + add.s64 %rd72, %rd70, 64; + add.s64 %rd74, %rd70, 128; + add.s64 %rd76, %rd70, 192; + .loc 1 140 81 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:81 + // begin inline asm + mov.u64 %rd71, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd71, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs30, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs30 }, [ %rd70 + 0 ], %rd71; + // end inline asm + // begin inline asm + mov.u64 %rd73, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd73, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs31, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs31 }, [ %rd72 + 0 ], %rd73; + // end inline asm + // begin inline asm + mov.u64 %rd75, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd75, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs32, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs32 }, [ %rd74 + 0 ], %rd75; + // end inline asm + // begin inline asm + mov.u64 %rd77, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd77, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs33, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs33 }, [ %rd76 + 0 ], %rd77; + // end inline asm + .loc 1 140 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:142 + cvt.f32.bf16 %r259, %rs30; + cvt.f32.bf16 %r260, %rs31; + cvt.f32.bf16 %r261, %rs32; + cvt.f32.bf16 %r262, %rs33; + .loc 1 142 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:142:24 + mul.f32 %r263, %r255, %r259; + mul.f32 %r264, %r256, %r260; + mul.f32 %r265, %r257, %r261; + mul.f32 %r266, %r258, %r262; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r267, %r263, %r240, %p3; + selp.f32 %r268, %r264, %r242, %p3; + selp.f32 %r269, %r265, %r244, %p3; + selp.f32 %r270, %r266, %r246, %p3; + .loc 1 151 25 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:151:25 + mul.f32 %r271, %r175, %r229; + mul.f32 %r272, %r175, %r228; + mul.f32 %r273, %r175, %r227; + mul.f32 %r274, %r175, %r226; + bar.sync 0; + st.shared.b32 [%r97], %r271; + st.shared.b32 [%r99+256], %r272; + st.shared.b32 [%r101+512], %r273; + st.shared.b32 [%r103+768], %r274; + bar.sync 0; + ld.shared.b32 %r275, [%r110]; + ld.shared.b32 %r276, [%r113]; + ld.shared.b32 %r277, [%r116]; + ld.shared.b32 %r278, [%r119]; + .loc 1 156 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26 + bar.sync 0; + shl.b32 %r279, %r24, 3; + and.b32 %r280, %r279, 120; + shr.u32 %r281, %r24, 2; + and.b32 %r282, %r281, 4; + shl.b32 %r283, %r25, 2; + or.b32 %r284, %r282, %r283; + or.b32 %r285, %r284, %r280; + add.s32 %r286, %r96, %r285; + st.shared.b32 [%r286], %r225; + xor.b32 %r287, %r285, 64; + add.s32 %r288, %r96, %r287; + st.shared.b32 [%r288+256], %r224; + bar.sync 0; + and.b32 %r289, %r34, 120; + bfe.s32 %r290, %r24, 1, 1; + and.b32 %r291, %r290, 320; + xor.b32 %r292, %r291, %r289; + add.s32 %r293, %r96, %r292; + ld.shared.v2.b32 {%r294, %r295}, [%r293]; + ld.shared.v2.b32 {%r296, %r297}, [%r293+128]; + .loc 1 153 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:153:26 + mul.f32 %r298, %r275, %r294; + mul.f32 %r299, %r276, %r295; + mul.f32 %r300, %r277, %r296; + mul.f32 %r301, %r278, %r297; + .loc 1 156 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26 + mul.f32 %r302, %r111, %r298; + mul.f32 %r303, %r114, %r299; + mul.f32 %r304, %r117, %r300; + mul.f32 %r305, %r120, %r301; + .loc 1 159 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:159:26 + fma.rn.f32 %r306, %r121, %r267, %r302; + fma.rn.f32 %r307, %r122, %r268, %r303; + fma.rn.f32 %r308, %r123, %r269, %r304; + fma.rn.f32 %r309, %r124, %r270, %r305; + .loc 1 161 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:43 + shl.b32 %r310, %r30, 7; + .loc 1 161 39 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:39 + or.b32 %r311, %r310, %r33; + .loc 1 161 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:32 + mul.wide.s32 %rd95, %r311, 2; + add.s64 %rd78, %rd80, %rd95; + .loc 1 161 55 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:55 + cvt.rn.bf16.f32 %rs50, %r223; + cvt.rn.bf16.f32 %rs51, %r212; + cvt.rn.bf16.f32 %rs52, %r201; + cvt.rn.bf16.f32 %rs53, %r190; + bar.sync 0; + and.b32 %r312, %r24, 3; + and.b32 %r313, %r24, 28; + shr.u32 %r314, %r25, 4; + mul.lo.s32 %r315, %r312, 160; + or.b32 %r316, %r315, %r314; + or.b32 %r317, %r316, %r313; + add.s32 %r318, %r96, %r317; + st.shared.b16 [%r318], %rs50; + xor.b32 %r319, %r317, 32; + add.s32 %r320, %r96, %r319; + st.shared.b16 [%r320], %rs51; + xor.b32 %r321, %r317, 64; + add.s32 %r322, %r96, %r321; + st.shared.b16 [%r322], %rs52; + xor.b32 %r323, %r317, 96; + add.s32 %r324, %r96, %r323; + st.shared.b16 [%r324], %rs53; + bar.sync 0; + shl.b32 %r325, %r312, 3; + shl.b32 %r326, %r91, 2; + and.b32 %r327, %r37, 2; + and.b32 %r328, %r26, 160; + or.b32 %r329, %r325, %r326; + xor.b32 %r330, %r329, %r328; + or.b32 %r331, %r330, %r327; + add.s32 %r332, %r96, %r331; + ld.shared.b16 %rs54, [%r332]; + ld.shared.b16 %rs55, [%r332+4]; + xor.b32 %r333, %r331, 64; + add.s32 %r334, %r96, %r333; + ld.shared.b16 %rs56, [%r334+256]; + ld.shared.b16 %rs57, [%r334+260]; + mov.b32 %r18, {%rs54, %rs56}; + mov.b32 %r19, {%rs55, %rs57}; + // begin inline asm + @%p1 st.global.v2.b32 [ %rd78 + 0 ], { %r18, %r19 }; + // end inline asm + .loc 1 162 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:32 + add.s64 %rd79, %rd81, %rd95; + .loc 1 162 56 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:56 + cvt.rn.bf16.f32 %rs58, %r306; + cvt.rn.bf16.f32 %rs59, %r307; + cvt.rn.bf16.f32 %rs60, %r308; + cvt.rn.bf16.f32 %rs61, %r309; + bar.sync 0; + st.shared.b16 [%r318], %rs58; + st.shared.b16 [%r320], %rs59; + st.shared.b16 [%r322], %rs60; + st.shared.b16 [%r324], %rs61; + bar.sync 0; + ld.shared.b16 %rs62, [%r332]; + ld.shared.b16 %rs63, [%r332+4]; + ld.shared.b16 %rs64, [%r334+256]; + ld.shared.b16 %rs65, [%r334+260]; + mov.b32 %r20, {%rs62, %rs64}; + mov.b32 %r21, {%rs63, %rs65}; + // begin inline asm + @%p1 st.global.v2.b32 [ %rd79 + 0 ], { %r20, %r21 }; + // end inline asm + .loc 1 53 4 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:4 + ret; +$L__tmp24: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 456 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x1c1 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 98 +.b8 118 +.b8 113 +.b8 104 +.b8 106 +.b8 116 +.b8 121 +.b8 103 +.b8 55 +.b8 102 +.b8 118 +.b8 120 +.b8 122 +.b8 119 +.b8 116 +.b8 98 +.b8 116 +.b8 116 +.b8 52 +.b8 118 +.b8 114 +.b8 100 +.b8 107 +.b8 98 +.b8 110 +.b8 98 +.b8 54 +.b8 110 +.b8 51 +.b8 50 +.b8 102 +.b8 110 +.b8 114 +.b8 105 +.b8 106 +.b8 106 +.b8 112 +.b8 108 +.b8 51 +.b8 118 +.b8 118 +.b8 52 +.b8 99 +.b8 102 +.b8 113 +.b8 100 +.b8 52 +.b8 109 +.b8 122 +.b8 110 +.b8 114 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 98 +.b8 118 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x6d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 114 +.b8 109 +.b8 115 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 100 +.b8 100 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 110 +.b8 101 +.b8 103 +.b8 95 +.b8 115 +.b8 112 +.b8 108 +.b8 105 +.b8 116 +.b8 95 +.b8 115 +.b8 112 +.b8 108 +.b8 105 +.b8 116 +.b8 95 +.b8 119 +.b8 105 +.b8 116 +.b8 104 +.b8 95 +.b8 115 +.b8 105 +.b8 122 +.b8 101 +.b8 115 +.b8 95 +.b8 115 +.b8 116 +.b8 97 +.b8 99 +.b8 107 +.b8 95 +.b8 117 +.b8 110 +.b8 98 +.b8 105 +.b8 110 +.b8 100 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x151:0x7a DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x166:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 51 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x17e:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 4 // Abbrev [4] 0x198:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp12 // DW_AT_low_pc +.b64 $L__tmp23 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 52 // DW_AT_call_line +.b8 27 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x1b0:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp12 // DW_AT_low_pc +.b64 $L__tmp23 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source new file mode 100644 index 0000000000000000000000000000000000000000..1e72ed51c4e737348c551db3fbc792111a227cb0 --- /dev/null +++ b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source @@ -0,0 +1,972 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc213 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0) +#loc215 = loc(unknown) +#loc218 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0) +#loc222 = loc("in_out_ptr0"(#loc)) +#loc223 = loc("in_out_ptr1"(#loc)) +#loc224 = loc("in_ptr0"(#loc)) +#loc225 = loc("in_ptr1"(#loc)) +#loc226 = loc("in_ptr2"(#loc)) +#loc227 = loc("in_ptr3"(#loc)) +#loc228 = loc("in_ptr4"(#loc)) +#loc229 = loc("xnumel"(#loc)) +#loc230 = loc("r0_numel"(#loc)) +#loc432 = loc("input"(#loc213)) +#loc433 = loc("a"(#loc218)) +#loc434 = loc("b"(#loc218)) +module { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 73728 : i32 loc(#loc231) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc232) + %xoffset = tt.get_program_id x : i32 loc(#loc233) + %xoffset_2 = arith.constant 2 : i32 loc(#loc234) + %xoffset_3 = arith.constant 2 : i32 loc(#loc234) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc234) + %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc235) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32> -> tensor<2x1xi32> loc(#loc236) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<2x1xi32> loc(#loc237) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<2x1xi32> loc(#loc237) + %xmask = arith.constant true loc(#loc238) + %xmask_8 = arith.constant dense : tensor<2x128xi1> loc(#loc238) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc239) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc240) + %x0 = arith.constant 32 : i32 loc(#loc241) + %x0_10 = arith.constant 32 : i32 loc(#loc241) + %x0_11 = arith.constant dense<32> : tensor<2x1xi32> loc(#loc241) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<2x1xi32> loc(#loc241) + %x1 = arith.constant 32 : i32 loc(#loc242) + %x1_13 = arith.constant 32 : i32 loc(#loc242) + %x1_14 = arith.constant dense<32> : tensor<2x1xi32> loc(#loc242) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<2x1xi32> loc(#loc242) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc243) + %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc243) + %_tmp10 = arith.constant 0.000000e+00 : f32 loc(#loc244) + %_tmp10_17 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc244) + %c0_i32 = arith.constant 0 : i32 loc(#loc15) + %c128_i32 = arith.constant 128 : i32 loc(#loc15) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15) + %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc15) + %3 = ub.poison : i32 loc(#loc15) + %_tmp10_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_16, %_tmp10_24 = %_tmp10_17) -> (tensor<2x128xf32>, tensor<2x128xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc246) + %r0_index_25 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc246) + %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc247) + %r0_mask_26 = arith.cmpi slt, %r0_index_25, %r0_mask : tensor<1x128xi32> loc(#loc247) + %tmp0 = arith.constant 4096 : i32 loc(#loc248) + %tmp0_27 = arith.constant 4096 : i32 loc(#loc248) + %tmp0_28 = arith.constant dense<4096> : tensor<1x128xi32> loc(#loc248) + %tmp0_29 = arith.addi %tmp0_28, %r0_index_25 : tensor<1x128xi32> loc(#loc248) + %tmp0_30 = arith.constant 128 : i32 loc(#loc249) + %tmp0_31 = arith.constant 128 : i32 loc(#loc249) + %tmp0_32 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc249) + %tmp0_33 = arith.muli %tmp0_32, %x0_12 : tensor<2x1xi32> loc(#loc249) + %tmp0_34 = tt.broadcast %tmp0_29 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc250) + %tmp0_35 = tt.broadcast %tmp0_33 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc250) + %tmp0_36 = arith.addi %tmp0_34, %tmp0_35 : tensor<2x128xi32> loc(#loc250) + %tmp0_37 = arith.constant 36864 : i32 loc(#loc251) + %tmp0_38 = arith.constant 36864 : i32 loc(#loc251) + %tmp0_39 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc251) + %tmp0_40 = arith.muli %tmp0_39, %x1_15 : tensor<2x1xi32> loc(#loc251) + %tmp0_41 = tt.broadcast %tmp0_40 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc252) + %tmp0_42 = arith.addi %tmp0_36, %tmp0_41 : tensor<2x128xi32> loc(#loc252) + %tmp0_43 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x128x!tt.ptr> loc(#loc253) + %tmp0_44 = tt.addptr %tmp0_43, %tmp0_42 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc253) + %tmp0_45 = arith.constant 0.000000e+00 : f32 loc(#loc254) + %tmp0_46 = tt.broadcast %r0_mask_26 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc254) + %tmp0_47 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc254) + %tmp0_48 = arith.truncf %tmp0_47 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc254) + %tmp0_49 = tt.load %tmp0_44, %tmp0_46, %tmp0_48 evictionPolicy = evict_last : tensor<2x128x!tt.ptr> loc(#loc254) + %tmp0_50 = arith.extf %tmp0_49 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc255) + %tmp6 = arith.constant 128 : i32 loc(#loc256) + %tmp6_51 = arith.constant 128 : i32 loc(#loc256) + %tmp6_52 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc256) + %tmp6_53 = arith.muli %tmp6_52, %x0_12 : tensor<2x1xi32> loc(#loc256) + %tmp6_54 = tt.broadcast %r0_index_25 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc257) + %tmp6_55 = tt.broadcast %tmp6_53 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc257) + %tmp6_56 = arith.addi %tmp6_54, %tmp6_55 : tensor<2x128xi32> loc(#loc257) + %tmp6_57 = arith.constant 36864 : i32 loc(#loc258) + %tmp6_58 = arith.constant 36864 : i32 loc(#loc258) + %tmp6_59 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc258) + %tmp6_60 = arith.muli %tmp6_59, %x1_15 : tensor<2x1xi32> loc(#loc258) + %tmp6_61 = tt.broadcast %tmp6_60 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc259) + %tmp6_62 = arith.addi %tmp6_56, %tmp6_61 : tensor<2x128xi32> loc(#loc259) + %tmp6_63 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x128x!tt.ptr> loc(#loc260) + %tmp6_64 = tt.addptr %tmp6_63, %tmp6_62 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc260) + %tmp6_65 = arith.constant 0.000000e+00 : f32 loc(#loc261) + %tmp6_66 = tt.broadcast %r0_mask_26 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc261) + %tmp6_67 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc261) + %tmp6_68 = arith.truncf %tmp6_67 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc261) + %tmp6_69 = tt.load %tmp6_64, %tmp6_66, %tmp6_68 evictionPolicy = evict_last : tensor<2x128x!tt.ptr> loc(#loc261) + %tmp6_70 = arith.extf %tmp6_69 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc262) + %tmp2 = arith.mulf %tmp0_50, %tmp0_50 : tensor<2x128xf32> loc(#loc263) + %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<2x128xf32> loc(#loc264) + %_tmp4_71 = tt.broadcast %r0_mask_26 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc265) + %_tmp4_72 = arith.select %_tmp4_71, %tmp5, %_tmp4_23 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc265) + %tmp8 = arith.mulf %tmp6_70, %tmp6_70 : tensor<2x128xf32> loc(#loc266) + %tmp11 = arith.addf %_tmp10_24, %tmp8 : tensor<2x128xf32> loc(#loc267) + %_tmp10_73 = tt.broadcast %r0_mask_26 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc268) + %_tmp10_74 = arith.select %_tmp10_73, %tmp11, %_tmp10_24 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc268) + scf.yield %_tmp4_72, %_tmp10_74 : tensor<2x128xf32>, tensor<2x128xf32> loc(#loc39) + } loc(#loc435) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S2_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#0) : (tensor<2x128xf32>) -> tensor<2xf32> loc(#loc269) + %tmp4_19 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<2xf32> -> tensor<2x1xf32> loc(#loc270) + %tmp10 = tt.call @"triton.language.standard.sum__fp32S2_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#1) : (tensor<2x128xf32>) -> tensor<2xf32> loc(#loc271) + %tmp10_20 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<2xf32> -> tensor<2x1xf32> loc(#loc272) + %c0_i32_21 = arith.constant 0 : i32 loc(#loc44) + %c128_i32_22 = arith.constant 128 : i32 loc(#loc44) + %4 = arith.bitcast %c0_i32_21 : i32 to i32 loc(#loc44) + %5 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc44) + %6 = arith.bitcast %c128_i32_22 : i32 to i32 loc(#loc44) + %7 = ub.poison : i32 loc(#loc44) + scf.for %r0_offset = %4 to %5 step %6 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc273) + %r0_index_23 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc273) + %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc274) + %r0_mask_24 = arith.cmpi slt, %r0_index_23, %r0_mask : tensor<1x128xi32> loc(#loc274) + %r0_3 = arith.constant 2 : i32 loc(#loc275) + %r0_3_25 = arith.constant 2 : i32 loc(#loc275) + %r0_3_26 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc275) + %r0_3_27 = arith.remsi %r0_index_23, %r0_3_26 : tensor<1x128xi32> loc(#loc275) + %r0_4 = arith.constant 2 : i32 loc(#loc276) + %r0_4_28 = arith.constant 2 : i32 loc(#loc276) + %r0_4_29 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc276) + %r0_4_30 = arith.divsi %r0_index_23, %r0_4_29 : tensor<1x128xi32> loc(#loc276) + %tmp50 = arith.constant 128 : i32 loc(#loc277) + %tmp50_31 = arith.constant 128 : i32 loc(#loc277) + %tmp50_32 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc277) + %tmp50_33 = arith.muli %tmp50_32, %x0_12 : tensor<2x1xi32> loc(#loc277) + %tmp50_34 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc278) + %tmp50_35 = tt.broadcast %tmp50_33 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc278) + %tmp50_36 = arith.addi %tmp50_34, %tmp50_35 : tensor<2x128xi32> loc(#loc278) + %tmp50_37 = arith.constant 36864 : i32 loc(#loc279) + %tmp50_38 = arith.constant 36864 : i32 loc(#loc279) + %tmp50_39 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc279) + %tmp50_40 = arith.muli %tmp50_39, %x1_15 : tensor<2x1xi32> loc(#loc279) + %tmp50_41 = tt.broadcast %tmp50_40 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc280) + %tmp50_42 = arith.addi %tmp50_36, %tmp50_41 : tensor<2x128xi32> loc(#loc280) + %tmp50_43 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x128x!tt.ptr> loc(#loc281) + %tmp50_44 = tt.addptr %tmp50_43, %tmp50_42 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc281) + %tmp50_45 = arith.constant 0.000000e+00 : f32 loc(#loc282) + %tmp50_46 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc282) + %tmp50_47 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc282) + %tmp50_48 = arith.truncf %tmp50_47 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc282) + %tmp50_49 = tt.load %tmp50_44, %tmp50_46, %tmp50_48 evictionPolicy = evict_last : tensor<2x128x!tt.ptr> loc(#loc282) + %tmp50_50 = arith.extf %tmp50_49 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc283) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc284) + %tmp58_51 = tt.addptr %tmp58, %r0_index_23 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc284) + %tmp58_52 = arith.constant 0.000000e+00 : f32 loc(#loc285) + %tmp58_53 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> loc(#loc285) + %tmp58_54 = arith.truncf %tmp58_53 : tensor<1x128xf32> to tensor<1x128xbf16> loc(#loc285) + %tmp58_55 = tt.load %tmp58_51, %r0_mask_24, %tmp58_54 evictionPolicy = evict_last : tensor<1x128x!tt.ptr> loc(#loc285) + %tmp58_56 = arith.extf %tmp58_55 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc286) + %tmp63 = arith.constant 128 : i32 loc(#loc287) + %tmp63_57 = arith.constant 128 : i32 loc(#loc287) + %tmp63_58 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc287) + %tmp63_59 = arith.muli %tmp63_58, %x1_15 : tensor<2x1xi32> loc(#loc287) + %tmp63_60 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc288) + %tmp63_61 = tt.broadcast %tmp63_59 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc288) + %tmp63_62 = arith.addi %tmp63_60, %tmp63_61 : tensor<2x128xi32> loc(#loc288) + %tmp63_63 = tt.splat %in_ptr2 : !tt.ptr -> tensor<2x128x!tt.ptr> loc(#loc289) + %tmp63_64 = tt.addptr %tmp63_63, %tmp63_62 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc289) + %tmp63_65 = arith.constant 0.000000e+00 : f32 loc(#loc290) + %tmp63_66 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc290) + %tmp63_67 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc290) + %tmp63_68 = tt.load %tmp63_64, %tmp63_66, %tmp63_67 evictionPolicy = evict_last : tensor<2x128x!tt.ptr> loc(#loc290) + %tmp66 = arith.constant 128 : i32 loc(#loc291) + %tmp66_69 = arith.constant 128 : i32 loc(#loc291) + %tmp66_70 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc291) + %tmp66_71 = arith.muli %tmp66_70, %x1_15 : tensor<2x1xi32> loc(#loc291) + %tmp66_72 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc292) + %tmp66_73 = tt.broadcast %tmp66_71 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc292) + %tmp66_74 = arith.addi %tmp66_72, %tmp66_73 : tensor<2x128xi32> loc(#loc292) + %tmp66_75 = tt.splat %in_ptr3 : !tt.ptr -> tensor<2x128x!tt.ptr> loc(#loc293) + %tmp66_76 = tt.addptr %tmp66_75, %tmp66_74 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc293) + %tmp66_77 = arith.constant 0.000000e+00 : f32 loc(#loc294) + %tmp66_78 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc294) + %tmp66_79 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc294) + %tmp66_80 = tt.load %tmp66_76, %tmp66_78, %tmp66_79 evictionPolicy = evict_last : tensor<2x128x!tt.ptr> loc(#loc294) + %tmp96 = arith.constant 4096 : i32 loc(#loc295) + %tmp96_81 = arith.constant 4096 : i32 loc(#loc295) + %tmp96_82 = arith.constant dense<4096> : tensor<1x128xi32> loc(#loc295) + %tmp96_83 = arith.addi %tmp96_82, %r0_index_23 : tensor<1x128xi32> loc(#loc295) + %tmp96_84 = arith.constant 128 : i32 loc(#loc296) + %tmp96_85 = arith.constant 128 : i32 loc(#loc296) + %tmp96_86 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc296) + %tmp96_87 = arith.muli %tmp96_86, %x0_12 : tensor<2x1xi32> loc(#loc296) + %tmp96_88 = tt.broadcast %tmp96_83 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc297) + %tmp96_89 = tt.broadcast %tmp96_87 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc297) + %tmp96_90 = arith.addi %tmp96_88, %tmp96_89 : tensor<2x128xi32> loc(#loc297) + %tmp96_91 = arith.constant 36864 : i32 loc(#loc298) + %tmp96_92 = arith.constant 36864 : i32 loc(#loc298) + %tmp96_93 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc298) + %tmp96_94 = arith.muli %tmp96_93, %x1_15 : tensor<2x1xi32> loc(#loc298) + %tmp96_95 = tt.broadcast %tmp96_94 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc299) + %tmp96_96 = arith.addi %tmp96_90, %tmp96_95 : tensor<2x128xi32> loc(#loc299) + %tmp96_97 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x128x!tt.ptr> loc(#loc300) + %tmp96_98 = tt.addptr %tmp96_97, %tmp96_96 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc300) + %tmp96_99 = arith.constant 0.000000e+00 : f32 loc(#loc301) + %tmp96_100 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc301) + %tmp96_101 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc301) + %tmp96_102 = arith.truncf %tmp96_101 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc301) + %tmp96_103 = tt.load %tmp96_98, %tmp96_100, %tmp96_102 evictionPolicy = evict_first : tensor<2x128x!tt.ptr> loc(#loc301) + %tmp96_104 = arith.extf %tmp96_103 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc302) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc303) + %tmp102_105 = tt.addptr %tmp102, %r0_index_23 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc303) + %tmp102_106 = arith.constant 0.000000e+00 : f32 loc(#loc304) + %tmp102_107 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> loc(#loc304) + %tmp102_108 = arith.truncf %tmp102_107 : tensor<1x128xf32> to tensor<1x128xbf16> loc(#loc304) + %tmp102_109 = tt.load %tmp102_105, %r0_mask_24, %tmp102_108 evictionPolicy = evict_last : tensor<1x128x!tt.ptr> loc(#loc304) + %tmp102_110 = arith.extf %tmp102_109 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc305) + %tmp13 = arith.constant 0 : i64 loc(#loc306) + %tmp13_111 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc306) + %tmp14 = arith.extsi %r0_3_27 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc307) + %tmp14_112 = arith.constant dense<0> : tensor<1x128xi64> loc(#loc307) + %tmp14_113 = arith.cmpi sge, %tmp14, %tmp14_112 : tensor<1x128xi64> loc(#loc307) + %tmp15 = arith.constant 1 : i64 loc(#loc308) + %tmp15_114 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc308) + %tmp16 = arith.extsi %r0_3_27 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc309) + %tmp16_115 = arith.constant dense<1> : tensor<1x128xi64> loc(#loc309) + %tmp16_116 = arith.cmpi slt, %tmp16, %tmp16_115 : tensor<1x128xi64> loc(#loc309) + %tmp17 = arith.constant 2 : i32 loc(#loc310) + %tmp17_117 = arith.constant 2 : i32 loc(#loc310) + %tmp17_118 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc310) + %tmp17_119 = arith.muli %tmp17_118, %r0_4_30 : tensor<1x128xi32> loc(#loc310) + %tmp17_120 = arith.constant 1 : i32 loc(#loc311) + %tmp17_121 = arith.constant 1 : i32 loc(#loc311) + %tmp17_122 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc311) + %tmp17_123 = arith.addi %tmp17_122, %tmp17_119 : tensor<1x128xi32> loc(#loc311) + %tmp17_124 = arith.constant 128 : i32 loc(#loc312) + %tmp17_125 = arith.constant 128 : i32 loc(#loc312) + %tmp17_126 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc312) + %tmp17_127 = arith.muli %tmp17_126, %x0_12 : tensor<2x1xi32> loc(#loc312) + %tmp17_128 = tt.broadcast %tmp17_123 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc313) + %tmp17_129 = tt.broadcast %tmp17_127 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc313) + %tmp17_130 = arith.addi %tmp17_128, %tmp17_129 : tensor<2x128xi32> loc(#loc313) + %tmp17_131 = arith.constant 36864 : i32 loc(#loc314) + %tmp17_132 = arith.constant 36864 : i32 loc(#loc314) + %tmp17_133 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc314) + %tmp17_134 = arith.muli %tmp17_133, %x1_15 : tensor<2x1xi32> loc(#loc314) + %tmp17_135 = tt.broadcast %tmp17_134 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc315) + %tmp17_136 = arith.addi %tmp17_130, %tmp17_135 : tensor<2x128xi32> loc(#loc315) + %tmp17_137 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x128x!tt.ptr> loc(#loc316) + %tmp17_138 = tt.addptr %tmp17_137, %tmp17_136 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc316) + %tmp17_139 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x128xi1> loc(#loc317) + %tmp17_140 = arith.constant 0.000000e+00 : f32 loc(#loc318) + %tmp17_141 = tt.broadcast %tmp17_139 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc318) + %tmp17_142 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc318) + %tmp17_143 = arith.truncf %tmp17_142 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc318) + %tmp17_144 = tt.load %tmp17_138, %tmp17_141, %tmp17_143 evictionPolicy = evict_last : tensor<2x128x!tt.ptr> loc(#loc318) + %tmp17_145 = arith.extf %tmp17_144 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc319) + %tmp19 = arith.constant 1.280000e+02 : f32 loc(#loc320) + %tmp20 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc321) + %tmp20_146 = arith.divf %tmp10_20, %tmp20 : tensor<2x1xf32> loc(#loc321) + %tmp21 = arith.constant 9.99999997E-7 : f32 loc(#loc322) + %tmp22 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc323) + %tmp22_147 = arith.addf %tmp20_146, %tmp22 : tensor<2x1xf32> loc(#loc323) + %tmp23 = tt.extern_elementwise %tmp22_147 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc324) + %tmp24 = tt.broadcast %tmp23 : tensor<2x1xf32> -> tensor<2x128xf32> loc(#loc325) + %tmp24_148 = arith.mulf %tmp17_145, %tmp24 : tensor<2x128xf32> loc(#loc325) + %tmp25 = arith.constant 2 : i32 loc(#loc326) + %tmp25_149 = arith.constant 2 : i32 loc(#loc326) + %tmp25_150 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc326) + %tmp25_151 = arith.muli %tmp25_150, %r0_4_30 : tensor<1x128xi32> loc(#loc326) + %tmp25_152 = arith.constant 1 : i32 loc(#loc327) + %tmp25_153 = arith.constant 1 : i32 loc(#loc327) + %tmp25_154 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc327) + %tmp25_155 = arith.addi %tmp25_154, %tmp25_151 : tensor<1x128xi32> loc(#loc327) + %tmp25_156 = tt.broadcast %tmp25_155 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc328) + %tmp25_157 = tt.splat %in_ptr1 : !tt.ptr -> tensor<2x128x!tt.ptr> loc(#loc329) + %tmp25_158 = tt.addptr %tmp25_157, %tmp25_156 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc329) + %tmp25_159 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x128xi1> loc(#loc330) + %tmp25_160 = arith.constant 0.000000e+00 : f32 loc(#loc331) + %tmp25_161 = tt.broadcast %tmp25_159 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc331) + %tmp25_162 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc331) + %tmp25_163 = arith.truncf %tmp25_162 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc331) + %tmp25_164 = tt.load %tmp25_158, %tmp25_161, %tmp25_163 evictionPolicy = evict_last : tensor<2x128x!tt.ptr> loc(#loc331) + %tmp25_165 = arith.extf %tmp25_164 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc332) + %tmp27 = arith.mulf %tmp24_148, %tmp25_165 : tensor<2x128xf32> loc(#loc333) + %tmp29 = arith.constant 0.000000e+00 : f32 loc(#loc334) + %tmp29_166 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc334) + %tmp29_167 = arith.subf %tmp29_166, %tmp27 : tensor<2x128xf32> loc(#loc334) + %tmp30 = arith.constant 0.000000e+00 : f32 loc(#loc335) + %tmp30_168 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc335) + %tmp31 = tt.broadcast %tmp16_116 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc336) + %tmp31_169 = arith.select %tmp31, %tmp29_167, %tmp30_168 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc336) + %tmp32 = arith.extsi %r0_3_27 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc337) + %tmp32_170 = arith.constant dense<1> : tensor<1x128xi64> loc(#loc337) + %tmp32_171 = arith.cmpi sge, %tmp32, %tmp32_170 : tensor<1x128xi64> loc(#loc337) + %tmp33 = arith.constant 2 : i64 loc(#loc338) + %tmp33_172 = arith.constant dense<2> : tensor<1x1xi64> loc(#loc338) + %tmp34 = arith.extsi %r0_3_27 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc339) + %tmp34_173 = arith.constant dense<2> : tensor<1x128xi64> loc(#loc339) + %tmp34_174 = arith.cmpi slt, %tmp34, %tmp34_173 : tensor<1x128xi64> loc(#loc339) + %tmp35 = arith.constant 2 : i32 loc(#loc340) + %tmp35_175 = arith.constant 2 : i32 loc(#loc340) + %tmp35_176 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc340) + %tmp35_177 = arith.muli %tmp35_176, %r0_4_30 : tensor<1x128xi32> loc(#loc340) + %tmp35_178 = arith.constant 128 : i32 loc(#loc341) + %tmp35_179 = arith.constant 128 : i32 loc(#loc341) + %tmp35_180 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc341) + %tmp35_181 = arith.muli %tmp35_180, %x0_12 : tensor<2x1xi32> loc(#loc341) + %tmp35_182 = tt.broadcast %tmp35_177 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc342) + %tmp35_183 = tt.broadcast %tmp35_181 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc342) + %tmp35_184 = arith.addi %tmp35_182, %tmp35_183 : tensor<2x128xi32> loc(#loc342) + %tmp35_185 = arith.constant 36864 : i32 loc(#loc343) + %tmp35_186 = arith.constant 36864 : i32 loc(#loc343) + %tmp35_187 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc343) + %tmp35_188 = arith.muli %tmp35_187, %x1_15 : tensor<2x1xi32> loc(#loc343) + %tmp35_189 = tt.broadcast %tmp35_188 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc344) + %tmp35_190 = arith.addi %tmp35_184, %tmp35_189 : tensor<2x128xi32> loc(#loc344) + %tmp35_191 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x128x!tt.ptr> loc(#loc345) + %tmp35_192 = tt.addptr %tmp35_191, %tmp35_190 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc345) + %tmp35_193 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x128xi1> loc(#loc346) + %tmp35_194 = arith.constant 0.000000e+00 : f32 loc(#loc347) + %tmp35_195 = tt.broadcast %tmp35_193 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc347) + %tmp35_196 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc347) + %tmp35_197 = arith.truncf %tmp35_196 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc347) + %tmp35_198 = tt.load %tmp35_192, %tmp35_195, %tmp35_197 evictionPolicy = evict_last : tensor<2x128x!tt.ptr> loc(#loc347) + %tmp35_199 = arith.extf %tmp35_198 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc348) + %tmp37 = arith.constant 1.280000e+02 : f32 loc(#loc349) + %tmp38 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc350) + %tmp38_200 = arith.divf %tmp10_20, %tmp38 : tensor<2x1xf32> loc(#loc350) + %tmp39 = arith.constant 9.99999997E-7 : f32 loc(#loc351) + %tmp40 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc352) + %tmp40_201 = arith.addf %tmp38_200, %tmp40 : tensor<2x1xf32> loc(#loc352) + %tmp41 = tt.extern_elementwise %tmp40_201 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc353) + %tmp42 = tt.broadcast %tmp41 : tensor<2x1xf32> -> tensor<2x128xf32> loc(#loc354) + %tmp42_202 = arith.mulf %tmp35_199, %tmp42 : tensor<2x128xf32> loc(#loc354) + %tmp43 = arith.constant 2 : i32 loc(#loc355) + %tmp43_203 = arith.constant 2 : i32 loc(#loc355) + %tmp43_204 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc355) + %tmp43_205 = arith.muli %tmp43_204, %r0_4_30 : tensor<1x128xi32> loc(#loc355) + %tmp43_206 = tt.broadcast %tmp43_205 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc356) + %tmp43_207 = tt.splat %in_ptr1 : !tt.ptr -> tensor<2x128x!tt.ptr> loc(#loc357) + %tmp43_208 = tt.addptr %tmp43_207, %tmp43_206 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc357) + %tmp43_209 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x128xi1> loc(#loc358) + %tmp43_210 = arith.constant 0.000000e+00 : f32 loc(#loc359) + %tmp43_211 = tt.broadcast %tmp43_209 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc359) + %tmp43_212 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc359) + %tmp43_213 = arith.truncf %tmp43_212 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc359) + %tmp43_214 = tt.load %tmp43_208, %tmp43_211, %tmp43_213 evictionPolicy = evict_last : tensor<2x128x!tt.ptr> loc(#loc359) + %tmp43_215 = arith.extf %tmp43_214 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc360) + %tmp45 = arith.mulf %tmp42_202, %tmp43_215 : tensor<2x128xf32> loc(#loc361) + %tmp47 = arith.constant 0.000000e+00 : f32 loc(#loc362) + %tmp47_216 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc362) + %tmp48 = tt.broadcast %tmp32_171 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc363) + %tmp48_217 = arith.select %tmp48, %tmp45, %tmp47_216 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc363) + %tmp49 = tt.broadcast %tmp16_116 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc364) + %tmp49_218 = arith.select %tmp49, %tmp31_169, %tmp48_217 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc364) + %tmp52 = arith.constant 1.280000e+02 : f32 loc(#loc365) + %tmp53 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc366) + %tmp53_219 = arith.divf %tmp10_20, %tmp53 : tensor<2x1xf32> loc(#loc366) + %tmp54 = arith.constant 9.99999997E-7 : f32 loc(#loc367) + %tmp55 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc368) + %tmp55_220 = arith.addf %tmp53_219, %tmp55 : tensor<2x1xf32> loc(#loc368) + %tmp56 = tt.extern_elementwise %tmp55_220 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc369) + %tmp57 = tt.broadcast %tmp56 : tensor<2x1xf32> -> tensor<2x128xf32> loc(#loc370) + %tmp57_221 = arith.mulf %tmp50_50, %tmp57 : tensor<2x128xf32> loc(#loc370) + %tmp60 = tt.broadcast %tmp58_56 : tensor<1x128xf32> -> tensor<2x128xf32> loc(#loc371) + %tmp60_222 = arith.mulf %tmp57_221, %tmp60 : tensor<2x128xf32> loc(#loc371) + %tmp64 = arith.mulf %tmp60_222, %tmp63_68 : tensor<2x128xf32> loc(#loc372) + %tmp67 = arith.mulf %tmp49_218, %tmp66_80 : tensor<2x128xf32> loc(#loc373) + %tmp68 = arith.addf %tmp64, %tmp67 : tensor<2x128xf32> loc(#loc374) + %tmp70 = arith.constant 2 : i32 loc(#loc375) + %tmp70_223 = arith.constant 2 : i32 loc(#loc375) + %tmp70_224 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc375) + %tmp70_225 = arith.muli %tmp70_224, %r0_4_30 : tensor<1x128xi32> loc(#loc375) + %tmp70_226 = arith.constant 4097 : i32 loc(#loc376) + %tmp70_227 = arith.constant 4097 : i32 loc(#loc376) + %tmp70_228 = arith.constant dense<4097> : tensor<1x128xi32> loc(#loc376) + %tmp70_229 = arith.addi %tmp70_228, %tmp70_225 : tensor<1x128xi32> loc(#loc376) + %tmp70_230 = arith.constant 128 : i32 loc(#loc377) + %tmp70_231 = arith.constant 128 : i32 loc(#loc377) + %tmp70_232 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc377) + %tmp70_233 = arith.muli %tmp70_232, %x0_12 : tensor<2x1xi32> loc(#loc377) + %tmp70_234 = tt.broadcast %tmp70_229 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc378) + %tmp70_235 = tt.broadcast %tmp70_233 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc378) + %tmp70_236 = arith.addi %tmp70_234, %tmp70_235 : tensor<2x128xi32> loc(#loc378) + %tmp70_237 = arith.constant 36864 : i32 loc(#loc379) + %tmp70_238 = arith.constant 36864 : i32 loc(#loc379) + %tmp70_239 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc379) + %tmp70_240 = arith.muli %tmp70_239, %x1_15 : tensor<2x1xi32> loc(#loc379) + %tmp70_241 = tt.broadcast %tmp70_240 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc380) + %tmp70_242 = arith.addi %tmp70_236, %tmp70_241 : tensor<2x128xi32> loc(#loc380) + %tmp70_243 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x128x!tt.ptr> loc(#loc381) + %tmp70_244 = tt.addptr %tmp70_243, %tmp70_242 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc381) + %tmp70_245 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x128xi1> loc(#loc382) + %tmp70_246 = arith.constant 0.000000e+00 : f32 loc(#loc383) + %tmp70_247 = tt.broadcast %tmp70_245 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc383) + %tmp70_248 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc383) + %tmp70_249 = arith.truncf %tmp70_248 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc383) + %tmp70_250 = tt.load %tmp70_244, %tmp70_247, %tmp70_249 evictionPolicy = evict_last : tensor<2x128x!tt.ptr> loc(#loc383) + %tmp70_251 = arith.extf %tmp70_250 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc384) + %tmp72 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc385) + %tmp72_252 = arith.divf %tmp4_19, %tmp72 : tensor<2x1xf32> loc(#loc385) + %tmp73 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc386) + %tmp73_253 = arith.addf %tmp72_252, %tmp73 : tensor<2x1xf32> loc(#loc386) + %tmp74 = tt.extern_elementwise %tmp73_253 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc387) + %tmp75 = tt.broadcast %tmp74 : tensor<2x1xf32> -> tensor<2x128xf32> loc(#loc388) + %tmp75_254 = arith.mulf %tmp70_251, %tmp75 : tensor<2x128xf32> loc(#loc388) + %tmp76 = arith.constant 2 : i32 loc(#loc389) + %tmp76_255 = arith.constant 2 : i32 loc(#loc389) + %tmp76_256 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc389) + %tmp76_257 = arith.muli %tmp76_256, %r0_4_30 : tensor<1x128xi32> loc(#loc389) + %tmp76_258 = arith.constant 1 : i32 loc(#loc390) + %tmp76_259 = arith.constant 1 : i32 loc(#loc390) + %tmp76_260 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc390) + %tmp76_261 = arith.addi %tmp76_260, %tmp76_257 : tensor<1x128xi32> loc(#loc390) + %tmp76_262 = tt.broadcast %tmp76_261 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc391) + %tmp76_263 = tt.splat %in_ptr4 : !tt.ptr -> tensor<2x128x!tt.ptr> loc(#loc392) + %tmp76_264 = tt.addptr %tmp76_263, %tmp76_262 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc392) + %tmp76_265 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x128xi1> loc(#loc393) + %tmp76_266 = arith.constant 0.000000e+00 : f32 loc(#loc394) + %tmp76_267 = tt.broadcast %tmp76_265 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc394) + %tmp76_268 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc394) + %tmp76_269 = arith.truncf %tmp76_268 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc394) + %tmp76_270 = tt.load %tmp76_264, %tmp76_267, %tmp76_269 evictionPolicy = evict_last : tensor<2x128x!tt.ptr> loc(#loc394) + %tmp76_271 = arith.extf %tmp76_270 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc395) + %tmp78 = arith.mulf %tmp75_254, %tmp76_271 : tensor<2x128xf32> loc(#loc396) + %tmp80 = arith.constant 0.000000e+00 : f32 loc(#loc397) + %tmp80_272 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc397) + %tmp80_273 = arith.subf %tmp80_272, %tmp78 : tensor<2x128xf32> loc(#loc397) + %tmp81 = arith.constant 0.000000e+00 : f32 loc(#loc398) + %tmp81_274 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc398) + %tmp82 = tt.broadcast %tmp16_116 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc399) + %tmp82_275 = arith.select %tmp82, %tmp80_273, %tmp81_274 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc399) + %tmp83 = arith.constant 2 : i32 loc(#loc400) + %tmp83_276 = arith.constant 2 : i32 loc(#loc400) + %tmp83_277 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc400) + %tmp83_278 = arith.muli %tmp83_277, %r0_4_30 : tensor<1x128xi32> loc(#loc400) + %tmp83_279 = arith.constant 4096 : i32 loc(#loc401) + %tmp83_280 = arith.constant 4096 : i32 loc(#loc401) + %tmp83_281 = arith.constant dense<4096> : tensor<1x128xi32> loc(#loc401) + %tmp83_282 = arith.addi %tmp83_281, %tmp83_278 : tensor<1x128xi32> loc(#loc401) + %tmp83_283 = arith.constant 128 : i32 loc(#loc402) + %tmp83_284 = arith.constant 128 : i32 loc(#loc402) + %tmp83_285 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc402) + %tmp83_286 = arith.muli %tmp83_285, %x0_12 : tensor<2x1xi32> loc(#loc402) + %tmp83_287 = tt.broadcast %tmp83_282 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc403) + %tmp83_288 = tt.broadcast %tmp83_286 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc403) + %tmp83_289 = arith.addi %tmp83_287, %tmp83_288 : tensor<2x128xi32> loc(#loc403) + %tmp83_290 = arith.constant 36864 : i32 loc(#loc404) + %tmp83_291 = arith.constant 36864 : i32 loc(#loc404) + %tmp83_292 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc404) + %tmp83_293 = arith.muli %tmp83_292, %x1_15 : tensor<2x1xi32> loc(#loc404) + %tmp83_294 = tt.broadcast %tmp83_293 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc405) + %tmp83_295 = arith.addi %tmp83_289, %tmp83_294 : tensor<2x128xi32> loc(#loc405) + %tmp83_296 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x128x!tt.ptr> loc(#loc406) + %tmp83_297 = tt.addptr %tmp83_296, %tmp83_295 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc406) + %tmp83_298 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x128xi1> loc(#loc407) + %tmp83_299 = arith.constant 0.000000e+00 : f32 loc(#loc408) + %tmp83_300 = tt.broadcast %tmp83_298 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc408) + %tmp83_301 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc408) + %tmp83_302 = arith.truncf %tmp83_301 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc408) + %tmp83_303 = tt.load %tmp83_297, %tmp83_300, %tmp83_302 evictionPolicy = evict_last : tensor<2x128x!tt.ptr> loc(#loc408) + %tmp83_304 = arith.extf %tmp83_303 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc409) + %tmp85 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc410) + %tmp85_305 = arith.divf %tmp4_19, %tmp85 : tensor<2x1xf32> loc(#loc410) + %tmp86 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc411) + %tmp86_306 = arith.addf %tmp85_305, %tmp86 : tensor<2x1xf32> loc(#loc411) + %tmp87 = tt.extern_elementwise %tmp86_306 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc412) + %tmp88 = tt.broadcast %tmp87 : tensor<2x1xf32> -> tensor<2x128xf32> loc(#loc413) + %tmp88_307 = arith.mulf %tmp83_304, %tmp88 : tensor<2x128xf32> loc(#loc413) + %tmp89 = arith.constant 2 : i32 loc(#loc414) + %tmp89_308 = arith.constant 2 : i32 loc(#loc414) + %tmp89_309 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc414) + %tmp89_310 = arith.muli %tmp89_309, %r0_4_30 : tensor<1x128xi32> loc(#loc414) + %tmp89_311 = tt.broadcast %tmp89_310 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc415) + %tmp89_312 = tt.splat %in_ptr4 : !tt.ptr -> tensor<2x128x!tt.ptr> loc(#loc416) + %tmp89_313 = tt.addptr %tmp89_312, %tmp89_311 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc416) + %tmp89_314 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x128xi1> loc(#loc417) + %tmp89_315 = arith.constant 0.000000e+00 : f32 loc(#loc418) + %tmp89_316 = tt.broadcast %tmp89_314 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc418) + %tmp89_317 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc418) + %tmp89_318 = arith.truncf %tmp89_317 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc418) + %tmp89_319 = tt.load %tmp89_313, %tmp89_316, %tmp89_318 evictionPolicy = evict_last : tensor<2x128x!tt.ptr> loc(#loc418) + %tmp89_320 = arith.extf %tmp89_319 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc419) + %tmp91 = arith.mulf %tmp88_307, %tmp89_320 : tensor<2x128xf32> loc(#loc420) + %tmp93 = arith.constant 0.000000e+00 : f32 loc(#loc421) + %tmp93_321 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc421) + %tmp94 = tt.broadcast %tmp32_171 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc422) + %tmp94_322 = arith.select %tmp94, %tmp91, %tmp93_321 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc422) + %tmp95 = tt.broadcast %tmp16_116 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc423) + %tmp95_323 = arith.select %tmp95, %tmp82_275, %tmp94_322 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc423) + %tmp98 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc424) + %tmp98_324 = arith.divf %tmp4_19, %tmp98 : tensor<2x1xf32> loc(#loc424) + %tmp99 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc425) + %tmp99_325 = arith.addf %tmp98_324, %tmp99 : tensor<2x1xf32> loc(#loc425) + %tmp100 = tt.extern_elementwise %tmp99_325 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc426) + %tmp101 = tt.broadcast %tmp100 : tensor<2x1xf32> -> tensor<2x128xf32> loc(#loc427) + %tmp101_326 = arith.mulf %tmp96_104, %tmp101 : tensor<2x128xf32> loc(#loc427) + %tmp104 = tt.broadcast %tmp102_110 : tensor<1x128xf32> -> tensor<2x128xf32> loc(#loc428) + %tmp104_327 = arith.mulf %tmp101_326, %tmp104 : tensor<2x128xf32> loc(#loc428) + %tmp107 = arith.mulf %tmp104_327, %tmp63_68 : tensor<2x128xf32> loc(#loc429) + %tmp109 = arith.mulf %tmp95_323, %tmp66_80 : tensor<2x128xf32> loc(#loc430) + %tmp110 = arith.addf %tmp107, %tmp109 : tensor<2x128xf32> loc(#loc431) + %c128_i32_328 = arith.constant 128 : i32 loc(#loc204) + %c128_i32_329 = arith.constant 128 : i32 loc(#loc204) + %cst = arith.constant dense<128> : tensor<2x1xi32> loc(#loc204) + %8 = arith.muli %cst, %xindex_7 : tensor<2x1xi32> loc(#loc204) + %9 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc205) + %10 = tt.broadcast %8 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc205) + %11 = arith.addi %9, %10 : tensor<2x128xi32> loc(#loc205) + %12 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<2x128x!tt.ptr> loc(#loc206) + %13 = tt.addptr %12, %11 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc206) + %14 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc207) + %15 = arith.truncf %tmp68 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc207) + tt.store %13, %15, %14 : tensor<2x128x!tt.ptr> loc(#loc207) + %c128_i32_330 = arith.constant 128 : i32 loc(#loc208) + %c128_i32_331 = arith.constant 128 : i32 loc(#loc208) + %cst_332 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc208) + %16 = arith.muli %cst_332, %xindex_7 : tensor<2x1xi32> loc(#loc208) + %17 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc209) + %18 = tt.broadcast %16 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc209) + %19 = arith.addi %17, %18 : tensor<2x128xi32> loc(#loc209) + %20 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<2x128x!tt.ptr> loc(#loc210) + %21 = tt.addptr %20, %19 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc210) + %22 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc211) + %23 = arith.truncf %tmp110 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc211) + tt.store %21, %23, %22 : tensor<2x128x!tt.ptr> loc(#loc211) + } loc(#loc44) + tt.return loc(#loc212) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S2_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<2x128xf32> loc("input"(#loc213))) -> tensor<2xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc214) + tt.reduce.return %2 : f32 loc(#loc214) + }) : (tensor<2x128xf32>) -> tensor<2xf32> loc(#loc214) + tt.return %0 : tensor<2xf32> loc(#loc216) + ^bb1: // no predecessors + %1 = ub.poison : tensor<2xf32> loc(#loc217) + tt.return %1 : tensor<2xf32> loc(#loc217) + } loc(#loc213) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc218)), %b: f32 loc("b"(#loc218))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc219) + tt.return %0 : f32 loc(#loc220) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc221) + tt.return %1 : f32 loc(#loc221) + } loc(#loc218) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":25:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":30:43) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":32:44) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:45) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:56) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:46) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:42) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:53) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:64) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":68:35) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":69:25) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":70:35) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:52) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:63) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":74:16) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":76:16) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:57) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:55) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:63) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:95) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":85:42) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":88:35) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":89:24) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:37) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:48) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:59) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":92:16) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":93:25) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":94:16) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":95:24) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":96:32) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:53) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:59) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:91) +#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":102:42) +#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":106:16) +#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":107:25) +#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":108:16) +#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":109:24) +#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":110:32) +#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:44) +#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc149 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:55) +#loc150 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc151 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:66) +#loc152 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc153 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc154 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:81) +#loc155 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc156 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc157 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc158 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc159 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc160 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc161 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:57) +#loc162 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:55) +#loc163 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:63) +#loc164 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc165 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:95) +#loc166 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc167 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc168 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc169 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc170 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":132:42) +#loc171 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc172 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:44) +#loc173 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc174 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:55) +#loc175 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc176 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:66) +#loc177 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc178 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc179 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:81) +#loc180 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc181 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc182 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":136:24) +#loc183 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":137:24) +#loc184 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":138:32) +#loc185 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc186 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:53) +#loc187 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:59) +#loc188 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc189 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:91) +#loc190 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc191 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc192 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc193 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":144:42) +#loc194 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc195 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc196 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":148:24) +#loc197 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":149:24) +#loc198 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":150:33) +#loc199 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc200 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc201 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc202 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc203 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc204 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc205 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc206 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc207 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc208 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:43) +#loc209 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:39) +#loc210 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc211 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc212 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc214 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc216 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11) +#loc217 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4) +#loc219 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc220 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11) +#loc221 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4) +#loc231 = loc("xnumel"(#loc1)) +#loc232 = loc("r0_numel"(#loc2)) +#loc233 = loc("xoffset"(#loc3)) +#loc234 = loc("xoffset"(#loc4)) +#loc235 = loc("xindex"(#loc5)) +#loc236 = loc("xindex"(#loc6)) +#loc237 = loc("xindex"(#loc7)) +#loc238 = loc("xmask"(#loc8)) +#loc239 = loc("r0_base"(#loc9)) +#loc240 = loc("r0_base"(#loc10)) +#loc241 = loc("x0"(#loc11)) +#loc242 = loc("x1"(#loc12)) +#loc243 = loc("_tmp4"(#loc13)) +#loc244 = loc("_tmp10"(#loc14)) +#loc245 = loc("_tmp4"(#loc15)) +#loc246 = loc("r0_index"(#loc16)) +#loc247 = loc("r0_mask"(#loc17)) +#loc248 = loc("tmp0"(#loc18)) +#loc249 = loc("tmp0"(#loc19)) +#loc250 = loc("tmp0"(#loc20)) +#loc251 = loc("tmp0"(#loc21)) +#loc252 = loc("tmp0"(#loc22)) +#loc253 = loc("tmp0"(#loc23)) +#loc254 = loc("tmp0"(#loc24)) +#loc255 = loc("tmp0"(#loc25)) +#loc256 = loc("tmp6"(#loc26)) +#loc257 = loc("tmp6"(#loc27)) +#loc258 = loc("tmp6"(#loc28)) +#loc259 = loc("tmp6"(#loc29)) +#loc260 = loc("tmp6"(#loc30)) +#loc261 = loc("tmp6"(#loc31)) +#loc262 = loc("tmp6"(#loc32)) +#loc263 = loc("tmp2"(#loc33)) +#loc264 = loc("tmp5"(#loc34)) +#loc265 = loc("_tmp4"(#loc35)) +#loc266 = loc("tmp8"(#loc36)) +#loc267 = loc("tmp11"(#loc37)) +#loc268 = loc("_tmp10"(#loc38)) +#loc269 = loc("tmp4"(#loc40)) +#loc270 = loc("tmp4"(#loc41)) +#loc271 = loc("tmp10"(#loc42)) +#loc272 = loc("tmp10"(#loc43)) +#loc273 = loc("r0_index"(#loc45)) +#loc274 = loc("r0_mask"(#loc46)) +#loc275 = loc("r0_3"(#loc47)) +#loc276 = loc("r0_4"(#loc48)) +#loc277 = loc("tmp50"(#loc49)) +#loc278 = loc("tmp50"(#loc50)) +#loc279 = loc("tmp50"(#loc51)) +#loc280 = loc("tmp50"(#loc52)) +#loc281 = loc("tmp50"(#loc53)) +#loc282 = loc("tmp50"(#loc54)) +#loc283 = loc("tmp50"(#loc55)) +#loc284 = loc("tmp58"(#loc56)) +#loc285 = loc("tmp58"(#loc57)) +#loc286 = loc("tmp58"(#loc58)) +#loc287 = loc("tmp63"(#loc59)) +#loc288 = loc("tmp63"(#loc60)) +#loc289 = loc("tmp63"(#loc61)) +#loc290 = loc("tmp63"(#loc62)) +#loc291 = loc("tmp66"(#loc63)) +#loc292 = loc("tmp66"(#loc64)) +#loc293 = loc("tmp66"(#loc65)) +#loc294 = loc("tmp66"(#loc66)) +#loc295 = loc("tmp96"(#loc67)) +#loc296 = loc("tmp96"(#loc68)) +#loc297 = loc("tmp96"(#loc69)) +#loc298 = loc("tmp96"(#loc70)) +#loc299 = loc("tmp96"(#loc71)) +#loc300 = loc("tmp96"(#loc72)) +#loc301 = loc("tmp96"(#loc73)) +#loc302 = loc("tmp96"(#loc74)) +#loc303 = loc("tmp102"(#loc75)) +#loc304 = loc("tmp102"(#loc76)) +#loc305 = loc("tmp102"(#loc77)) +#loc306 = loc("tmp13"(#loc78)) +#loc307 = loc("tmp14"(#loc79)) +#loc308 = loc("tmp15"(#loc80)) +#loc309 = loc("tmp16"(#loc81)) +#loc310 = loc("tmp17"(#loc82)) +#loc311 = loc("tmp17"(#loc83)) +#loc312 = loc("tmp17"(#loc84)) +#loc313 = loc("tmp17"(#loc85)) +#loc314 = loc("tmp17"(#loc86)) +#loc315 = loc("tmp17"(#loc87)) +#loc316 = loc("tmp17"(#loc88)) +#loc317 = loc("tmp17"(#loc89)) +#loc318 = loc("tmp17"(#loc90)) +#loc319 = loc("tmp17"(#loc91)) +#loc320 = loc("tmp19"(#loc92)) +#loc321 = loc("tmp20"(#loc93)) +#loc322 = loc("tmp21"(#loc94)) +#loc323 = loc("tmp22"(#loc95)) +#loc324 = loc("tmp23"(#loc96)) +#loc325 = loc("tmp24"(#loc97)) +#loc326 = loc("tmp25"(#loc98)) +#loc327 = loc("tmp25"(#loc99)) +#loc328 = loc("tmp25"(#loc100)) +#loc329 = loc("tmp25"(#loc101)) +#loc330 = loc("tmp25"(#loc102)) +#loc331 = loc("tmp25"(#loc103)) +#loc332 = loc("tmp25"(#loc104)) +#loc333 = loc("tmp27"(#loc105)) +#loc334 = loc("tmp29"(#loc106)) +#loc335 = loc("tmp30"(#loc107)) +#loc336 = loc("tmp31"(#loc108)) +#loc337 = loc("tmp32"(#loc109)) +#loc338 = loc("tmp33"(#loc110)) +#loc339 = loc("tmp34"(#loc111)) +#loc340 = loc("tmp35"(#loc112)) +#loc341 = loc("tmp35"(#loc113)) +#loc342 = loc("tmp35"(#loc114)) +#loc343 = loc("tmp35"(#loc115)) +#loc344 = loc("tmp35"(#loc116)) +#loc345 = loc("tmp35"(#loc117)) +#loc346 = loc("tmp35"(#loc118)) +#loc347 = loc("tmp35"(#loc119)) +#loc348 = loc("tmp35"(#loc120)) +#loc349 = loc("tmp37"(#loc121)) +#loc350 = loc("tmp38"(#loc122)) +#loc351 = loc("tmp39"(#loc123)) +#loc352 = loc("tmp40"(#loc124)) +#loc353 = loc("tmp41"(#loc125)) +#loc354 = loc("tmp42"(#loc126)) +#loc355 = loc("tmp43"(#loc127)) +#loc356 = loc("tmp43"(#loc128)) +#loc357 = loc("tmp43"(#loc129)) +#loc358 = loc("tmp43"(#loc130)) +#loc359 = loc("tmp43"(#loc131)) +#loc360 = loc("tmp43"(#loc132)) +#loc361 = loc("tmp45"(#loc133)) +#loc362 = loc("tmp47"(#loc134)) +#loc363 = loc("tmp48"(#loc135)) +#loc364 = loc("tmp49"(#loc136)) +#loc365 = loc("tmp52"(#loc137)) +#loc366 = loc("tmp53"(#loc138)) +#loc367 = loc("tmp54"(#loc139)) +#loc368 = loc("tmp55"(#loc140)) +#loc369 = loc("tmp56"(#loc141)) +#loc370 = loc("tmp57"(#loc142)) +#loc371 = loc("tmp60"(#loc143)) +#loc372 = loc("tmp64"(#loc144)) +#loc373 = loc("tmp67"(#loc145)) +#loc374 = loc("tmp68"(#loc146)) +#loc375 = loc("tmp70"(#loc147)) +#loc376 = loc("tmp70"(#loc148)) +#loc377 = loc("tmp70"(#loc149)) +#loc378 = loc("tmp70"(#loc150)) +#loc379 = loc("tmp70"(#loc151)) +#loc380 = loc("tmp70"(#loc152)) +#loc381 = loc("tmp70"(#loc153)) +#loc382 = loc("tmp70"(#loc154)) +#loc383 = loc("tmp70"(#loc155)) +#loc384 = loc("tmp70"(#loc156)) +#loc385 = loc("tmp72"(#loc157)) +#loc386 = loc("tmp73"(#loc158)) +#loc387 = loc("tmp74"(#loc159)) +#loc388 = loc("tmp75"(#loc160)) +#loc389 = loc("tmp76"(#loc161)) +#loc390 = loc("tmp76"(#loc162)) +#loc391 = loc("tmp76"(#loc163)) +#loc392 = loc("tmp76"(#loc164)) +#loc393 = loc("tmp76"(#loc165)) +#loc394 = loc("tmp76"(#loc166)) +#loc395 = loc("tmp76"(#loc167)) +#loc396 = loc("tmp78"(#loc168)) +#loc397 = loc("tmp80"(#loc169)) +#loc398 = loc("tmp81"(#loc170)) +#loc399 = loc("tmp82"(#loc171)) +#loc400 = loc("tmp83"(#loc172)) +#loc401 = loc("tmp83"(#loc173)) +#loc402 = loc("tmp83"(#loc174)) +#loc403 = loc("tmp83"(#loc175)) +#loc404 = loc("tmp83"(#loc176)) +#loc405 = loc("tmp83"(#loc177)) +#loc406 = loc("tmp83"(#loc178)) +#loc407 = loc("tmp83"(#loc179)) +#loc408 = loc("tmp83"(#loc180)) +#loc409 = loc("tmp83"(#loc181)) +#loc410 = loc("tmp85"(#loc182)) +#loc411 = loc("tmp86"(#loc183)) +#loc412 = loc("tmp87"(#loc184)) +#loc413 = loc("tmp88"(#loc185)) +#loc414 = loc("tmp89"(#loc186)) +#loc415 = loc("tmp89"(#loc187)) +#loc416 = loc("tmp89"(#loc188)) +#loc417 = loc("tmp89"(#loc189)) +#loc418 = loc("tmp89"(#loc190)) +#loc419 = loc("tmp89"(#loc191)) +#loc420 = loc("tmp91"(#loc192)) +#loc421 = loc("tmp93"(#loc193)) +#loc422 = loc("tmp94"(#loc194)) +#loc423 = loc("tmp95"(#loc195)) +#loc424 = loc("tmp98"(#loc196)) +#loc425 = loc("tmp99"(#loc197)) +#loc426 = loc("tmp100"(#loc198)) +#loc427 = loc("tmp101"(#loc199)) +#loc428 = loc("tmp104"(#loc200)) +#loc429 = loc("tmp107"(#loc201)) +#loc430 = loc("tmp109"(#loc202)) +#loc431 = loc("tmp110"(#loc203)) +#loc435 = loc("_tmp10"(#loc245)) diff --git a/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..a047651df08aa291fb23d94c576ff2053ff171ba --- /dev/null +++ b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir @@ -0,0 +1,495 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [1, 2], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0]}> +#blocked2 = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [1, 0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc1 = loc(unknown) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc130 = loc("in_out_ptr0"(#loc)) +#loc131 = loc("in_out_ptr1"(#loc)) +#loc132 = loc("in_ptr0"(#loc)) +#loc133 = loc("in_ptr1"(#loc)) +#loc134 = loc("in_ptr2"(#loc)) +#loc135 = loc("in_ptr3"(#loc)) +#loc136 = loc("in_ptr4"(#loc)) +#loc137 = loc("xnumel"(#loc)) +#loc138 = loc("r0_numel"(#loc)) +#loc166 = loc("tmp4"(#loc30)) +#loc168 = loc("tmp10"(#loc33)) +#loc259 = loc(callsite(#loc1 at #loc166)) +#loc261 = loc(callsite(#loc1 at #loc168)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4097> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<1> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1x128xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<2> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<36864> : tensor<2x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<36864> : tensor<2x1xi32, #blocked1> loc(#loc1) + %cst_5 = arith.constant dense<128> : tensor<2x1xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<128> : tensor<2x1xi32, #blocked1> loc(#loc1) + %cst_7 = arith.constant dense<4096> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_8 = arith.constant dense<4096> : tensor<1x128xi32, #blocked1> loc(#loc1) + %cst_9 = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_10 = arith.constant dense<128> : tensor<1x128xi32, #blocked2> loc(#loc1) + %cst_11 = arith.constant dense<128> : tensor<1x128xi32, #blocked1> loc(#loc1) + %cst_12 = arith.constant dense<32> : tensor<2x1xi32, #blocked> loc(#loc1) + %cst_13 = arith.constant dense<32> : tensor<2x1xi32, #blocked1> loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %cst_14 = arith.constant dense<0.000000e+00> : tensor<2x128xbf16, #blocked1> loc(#loc1) + %cst_15 = arith.constant dense<0.000000e+00> : tensor<1x128xbf16, #blocked2> loc(#loc1) + %cst_16 = arith.constant dense<0.000000e+00> : tensor<2x128xbf16, #blocked> loc(#loc1) + %cst_17 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32, #blocked1> loc(#loc1) + %cst_18 = arith.constant dense<1.280000e+02> : tensor<2x1xf32, #blocked1> loc(#loc1) + %cst_19 = arith.constant dense<0.000000e+00> : tensor<2x128xf32, #blocked> loc(#loc1) + %cst_20 = arith.constant dense<0.000000e+00> : tensor<2x128xf32, #blocked1> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc139) + %xoffset_21 = arith.muli %xoffset, %c2_i32 : i32 loc(#loc140) + %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc141) + %xindex_22 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc141) + %xindex_23 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<2x1xi32, #blocked1> loc(#loc141) + %xindex_24 = tt.expand_dims %xindex_22 {axis = 1 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<2x1xi32, #blocked> loc(#loc141) + %xindex_25 = tt.splat %xoffset_21 : i32 -> tensor<2x1xi32, #blocked1> loc(#loc142) + %xindex_26 = tt.splat %xoffset_21 : i32 -> tensor<2x1xi32, #blocked> loc(#loc142) + %xindex_27 = arith.addi %xindex_25, %xindex_23 : tensor<2x1xi32, #blocked1> loc(#loc142) + %xindex_28 = arith.addi %xindex_26, %xindex_24 : tensor<2x1xi32, #blocked> loc(#loc142) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc143) + %r0_base_29 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> loc(#loc143) + %r0_base_30 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc143) + %r0_base_31 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x128xi32, #blocked1> loc(#loc143) + %r0_base_32 = tt.expand_dims %r0_base_29 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x128xi32, #blocked2> loc(#loc143) + %r0_base_33 = tt.expand_dims %r0_base_30 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc143) + %x0 = arith.remsi %xindex_27, %cst_13 : tensor<2x1xi32, #blocked1> loc(#loc144) + %x0_34 = arith.remsi %xindex_28, %cst_12 : tensor<2x1xi32, #blocked> loc(#loc144) + %x1 = arith.divsi %xindex_27, %cst_13 : tensor<2x1xi32, #blocked1> loc(#loc145) + %x1_35 = arith.divsi %xindex_28, %cst_12 : tensor<2x1xi32, #blocked> loc(#loc145) + %r0_mask = arith.cmpi slt, %r0_base_31, %cst_11 : tensor<1x128xi32, #blocked1> loc(#loc146) + %r0_mask_36 = arith.cmpi slt, %r0_base_32, %cst_10 : tensor<1x128xi32, #blocked2> loc(#loc146) + %r0_mask_37 = arith.cmpi slt, %r0_base_33, %cst_9 : tensor<1x128xi32, #blocked> loc(#loc146) + %tmp0 = arith.addi %r0_base_31, %cst_8 : tensor<1x128xi32, #blocked1> loc(#loc147) + %tmp0_38 = arith.muli %x0, %cst_6 : tensor<2x1xi32, #blocked1> loc(#loc148) + %tmp0_39 = arith.muli %x0_34, %cst_5 : tensor<2x1xi32, #blocked> loc(#loc148) + %tmp0_40 = tt.broadcast %tmp0 : tensor<1x128xi32, #blocked1> -> tensor<2x128xi32, #blocked1> loc(#loc149) + %tmp0_41 = tt.broadcast %tmp0_38 : tensor<2x1xi32, #blocked1> -> tensor<2x128xi32, #blocked1> loc(#loc149) + %tmp0_42 = tt.broadcast %tmp0_39 : tensor<2x1xi32, #blocked> -> tensor<2x128xi32, #blocked> loc(#loc149) + %tmp0_43 = arith.addi %tmp0_40, %tmp0_41 : tensor<2x128xi32, #blocked1> loc(#loc149) + %tmp0_44 = arith.muli %x1, %cst_4 : tensor<2x1xi32, #blocked1> loc(#loc150) + %tmp0_45 = arith.muli %x1_35, %cst_3 : tensor<2x1xi32, #blocked> loc(#loc150) + %tmp0_46 = tt.broadcast %tmp0_44 : tensor<2x1xi32, #blocked1> -> tensor<2x128xi32, #blocked1> loc(#loc151) + %tmp0_47 = tt.broadcast %tmp0_45 : tensor<2x1xi32, #blocked> -> tensor<2x128xi32, #blocked> loc(#loc151) + %tmp0_48 = arith.addi %tmp0_43, %tmp0_46 : tensor<2x128xi32, #blocked1> loc(#loc151) + %tmp0_49 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x128x!tt.ptr, #blocked1> loc(#loc152) + %tmp0_50 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x128x!tt.ptr, #blocked> loc(#loc152) + %tmp0_51 = tt.addptr %tmp0_49, %tmp0_48 : tensor<2x128x!tt.ptr, #blocked1>, tensor<2x128xi32, #blocked1> loc(#loc152) + %tmp0_52 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked1> -> tensor<2x128xi1, #blocked1> loc(#loc153) + %tmp0_53 = tt.load %tmp0_51, %tmp0_52, %cst_14 evictionPolicy = evict_last : tensor<2x128x!tt.ptr, #blocked1> loc(#loc153) + %tmp0_54 = arith.extf %tmp0_53 : tensor<2x128xbf16, #blocked1> to tensor<2x128xf32, #blocked1> loc(#loc154) + %tmp6 = tt.broadcast %r0_base_31 : tensor<1x128xi32, #blocked1> -> tensor<2x128xi32, #blocked1> loc(#loc155) + %tmp6_55 = arith.addi %tmp6, %tmp0_41 : tensor<2x128xi32, #blocked1> loc(#loc155) + %tmp6_56 = arith.addi %tmp6_55, %tmp0_46 : tensor<2x128xi32, #blocked1> loc(#loc156) + %tmp6_57 = tt.addptr %tmp0_49, %tmp6_56 : tensor<2x128x!tt.ptr, #blocked1>, tensor<2x128xi32, #blocked1> loc(#loc157) + %tmp6_58 = tt.load %tmp6_57, %tmp0_52, %cst_14 evictionPolicy = evict_last : tensor<2x128x!tt.ptr, #blocked1> loc(#loc158) + %tmp6_59 = arith.extf %tmp6_58 : tensor<2x128xbf16, #blocked1> to tensor<2x128xf32, #blocked1> loc(#loc159) + %tmp2 = arith.mulf %tmp0_54, %tmp0_54 : tensor<2x128xf32, #blocked1> loc(#loc160) + %tmp5 = arith.addf %tmp2, %cst_20 : tensor<2x128xf32, #blocked1> loc(#loc161) + %_tmp4 = arith.select %tmp0_52, %tmp5, %cst_20 : tensor<2x128xi1, #blocked1>, tensor<2x128xf32, #blocked1> loc(#loc162) + %tmp8 = arith.mulf %tmp6_59, %tmp6_59 : tensor<2x128xf32, #blocked1> loc(#loc163) + %tmp11 = arith.addf %tmp8, %cst_20 : tensor<2x128xf32, #blocked1> loc(#loc164) + %_tmp10 = arith.select %tmp0_52, %tmp11, %cst_20 : tensor<2x128xi1, #blocked1>, tensor<2x128xf32, #blocked1> loc(#loc165) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_134: f32 loc(callsite(#loc1 at #loc166)), %tmp4_135: f32 loc(callsite(#loc1 at #loc166))): + %tmp4_136 = arith.addf %tmp4_134, %tmp4_135 : f32 loc(#loc264) + tt.reduce.return %tmp4_136 : f32 loc(#loc258) + }) : (tensor<2x128xf32, #blocked1>) -> tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc258) + %tmp4_60 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<2x1xf32, #blocked1> loc(#loc167) + %tmp10 = "tt.reduce"(%_tmp10) <{axis = 1 : i32}> ({ + ^bb0(%tmp10_134: f32 loc(callsite(#loc1 at #loc168)), %tmp10_135: f32 loc(callsite(#loc1 at #loc168))): + %tmp10_136 = arith.addf %tmp10_134, %tmp10_135 : f32 loc(#loc265) + tt.reduce.return %tmp10_136 : f32 loc(#loc260) + }) : (tensor<2x128xf32, #blocked1>) -> tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc260) + %tmp10_61 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<2x1xf32, #blocked1> loc(#loc169) + %r0_3 = arith.remsi %r0_base_33, %cst_2 : tensor<1x128xi32, #blocked> loc(#loc170) + %r0_4 = arith.divsi %r0_base_33, %cst_2 : tensor<1x128xi32, #blocked> loc(#loc171) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x128x!tt.ptr, #blocked2> loc(#loc172) + %tmp58_62 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x128x!tt.ptr, #blocked> loc(#loc172) + %tmp58_63 = tt.addptr %tmp58, %r0_base_32 : tensor<1x128x!tt.ptr, #blocked2>, tensor<1x128xi32, #blocked2> loc(#loc172) + %tmp58_64 = tt.load %tmp58_63, %r0_mask_36, %cst_15 evictionPolicy = evict_last : tensor<1x128x!tt.ptr, #blocked2> loc(#loc173) + %tmp58_65 = arith.extf %tmp58_64 : tensor<1x128xbf16, #blocked2> to tensor<1x128xf32, #blocked2> loc(#loc174) + %tmp63 = arith.muli %x1, %cst_6 : tensor<2x1xi32, #blocked1> loc(#loc175) + %tmp63_66 = tt.broadcast %tmp63 : tensor<2x1xi32, #blocked1> -> tensor<2x128xi32, #blocked1> loc(#loc176) + %tmp63_67 = arith.addi %tmp6, %tmp63_66 : tensor<2x128xi32, #blocked1> loc(#loc176) + %tmp63_68 = tt.splat %in_ptr2 : !tt.ptr -> tensor<2x128x!tt.ptr, #blocked1> loc(#loc177) + %tmp63_69 = tt.addptr %tmp63_68, %tmp63_67 : tensor<2x128x!tt.ptr, #blocked1>, tensor<2x128xi32, #blocked1> loc(#loc177) + %tmp63_70 = tt.load %tmp63_69, %tmp0_52, %cst_20 evictionPolicy = evict_last : tensor<2x128x!tt.ptr, #blocked1> loc(#loc178) + %tmp63_71 = ttg.convert_layout %tmp63_70 : tensor<2x128xf32, #blocked1> -> tensor<2x128xf32, #blocked> loc(#loc178) + %tmp66 = tt.splat %in_ptr3 : !tt.ptr -> tensor<2x128x!tt.ptr, #blocked1> loc(#loc179) + %tmp66_72 = tt.addptr %tmp66, %tmp63_67 : tensor<2x128x!tt.ptr, #blocked1>, tensor<2x128xi32, #blocked1> loc(#loc179) + %tmp66_73 = tt.load %tmp66_72, %tmp0_52, %cst_20 evictionPolicy = evict_last : tensor<2x128x!tt.ptr, #blocked1> loc(#loc180) + %tmp66_74 = ttg.convert_layout %tmp66_73 : tensor<2x128xf32, #blocked1> -> tensor<2x128xf32, #blocked> loc(#loc180) + %tmp96 = tt.load %tmp0_51, %tmp0_52, %cst_14 evictionPolicy = evict_first : tensor<2x128x!tt.ptr, #blocked1> loc(#loc181) + %tmp96_75 = arith.extf %tmp96 : tensor<2x128xbf16, #blocked1> to tensor<2x128xf32, #blocked1> loc(#loc182) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x128x!tt.ptr, #blocked2> loc(#loc183) + %tmp102_76 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x128x!tt.ptr, #blocked> loc(#loc183) + %tmp102_77 = tt.addptr %tmp102, %r0_base_32 : tensor<1x128x!tt.ptr, #blocked2>, tensor<1x128xi32, #blocked2> loc(#loc183) + %tmp102_78 = tt.load %tmp102_77, %r0_mask_36, %cst_15 evictionPolicy = evict_last : tensor<1x128x!tt.ptr, #blocked2> loc(#loc184) + %tmp102_79 = arith.extf %tmp102_78 : tensor<1x128xbf16, #blocked2> to tensor<1x128xf32, #blocked2> loc(#loc185) + %tmp16 = arith.extsi %r0_3 : tensor<1x128xi32, #blocked> to tensor<1x128xi64, #blocked> loc(#loc186) + %tmp16_80 = arith.cmpi slt, %tmp16, %cst_1 : tensor<1x128xi64, #blocked> loc(#loc186) + %tmp17 = arith.muli %r0_4, %cst_2 : tensor<1x128xi32, #blocked> loc(#loc187) + %tmp17_81 = arith.addi %tmp17, %cst_0 : tensor<1x128xi32, #blocked> loc(#loc188) + %tmp17_82 = tt.broadcast %tmp17_81 : tensor<1x128xi32, #blocked> -> tensor<2x128xi32, #blocked> loc(#loc189) + %tmp17_83 = arith.addi %tmp17_82, %tmp0_42 : tensor<2x128xi32, #blocked> loc(#loc189) + %tmp17_84 = arith.addi %tmp17_83, %tmp0_47 : tensor<2x128xi32, #blocked> loc(#loc190) + %tmp17_85 = tt.addptr %tmp0_50, %tmp17_84 : tensor<2x128x!tt.ptr, #blocked>, tensor<2x128xi32, #blocked> loc(#loc191) + %tmp17_86 = arith.andi %r0_mask_37, %tmp16_80 : tensor<1x128xi1, #blocked> loc(#loc192) + %tmp17_87 = tt.broadcast %tmp17_86 : tensor<1x128xi1, #blocked> -> tensor<2x128xi1, #blocked> loc(#loc193) + %tmp17_88 = tt.load %tmp17_85, %tmp17_87, %cst_16 evictionPolicy = evict_last : tensor<2x128x!tt.ptr, #blocked> loc(#loc193) + %tmp17_89 = arith.extf %tmp17_88 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked> loc(#loc194) + %tmp20 = arith.divf %tmp10_61, %cst_18 : tensor<2x1xf32, #blocked1> loc(#loc195) + %tmp22 = arith.addf %tmp20, %cst_17 : tensor<2x1xf32, #blocked1> loc(#loc196) + %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32, #blocked1>) -> tensor<2x1xf32, #blocked1> loc(#loc197) + %tmp24 = ttg.convert_layout %tmp23 : tensor<2x1xf32, #blocked1> -> tensor<2x1xf32, #blocked> loc(#loc198) + %tmp24_90 = tt.broadcast %tmp24 : tensor<2x1xf32, #blocked> -> tensor<2x128xf32, #blocked> loc(#loc198) + %tmp24_91 = tt.broadcast %tmp23 : tensor<2x1xf32, #blocked1> -> tensor<2x128xf32, #blocked1> loc(#loc198) + %tmp24_92 = arith.mulf %tmp17_89, %tmp24_90 : tensor<2x128xf32, #blocked> loc(#loc198) + %tmp25 = tt.addptr %tmp58_62, %tmp17_81 : tensor<1x128x!tt.ptr, #blocked>, tensor<1x128xi32, #blocked> loc(#loc199) + %tmp25_93 = tt.broadcast %tmp25 : tensor<1x128x!tt.ptr, #blocked> -> tensor<2x128x!tt.ptr, #blocked> loc(#loc199) + %tmp25_94 = tt.load %tmp25_93, %tmp17_87, %cst_16 evictionPolicy = evict_last : tensor<2x128x!tt.ptr, #blocked> loc(#loc200) + %tmp25_95 = arith.extf %tmp25_94 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked> loc(#loc201) + %tmp27 = arith.mulf %tmp24_92, %tmp25_95 : tensor<2x128xf32, #blocked> loc(#loc202) + %tmp29 = arith.subf %cst_19, %tmp27 : tensor<2x128xf32, #blocked> loc(#loc203) + %tmp31 = tt.broadcast %tmp16_80 : tensor<1x128xi1, #blocked> -> tensor<2x128xi1, #blocked> loc(#loc204) + %tmp32 = arith.cmpi sge, %tmp16, %cst_1 : tensor<1x128xi64, #blocked> loc(#loc205) + %tmp35 = tt.broadcast %tmp17 : tensor<1x128xi32, #blocked> -> tensor<2x128xi32, #blocked> loc(#loc206) + %tmp35_96 = arith.addi %tmp35, %tmp0_42 : tensor<2x128xi32, #blocked> loc(#loc206) + %tmp35_97 = arith.addi %tmp35_96, %tmp0_47 : tensor<2x128xi32, #blocked> loc(#loc207) + %tmp35_98 = tt.addptr %tmp0_50, %tmp35_97 : tensor<2x128x!tt.ptr, #blocked>, tensor<2x128xi32, #blocked> loc(#loc208) + %tmp35_99 = arith.andi %r0_mask_37, %tmp32 : tensor<1x128xi1, #blocked> loc(#loc209) + %tmp35_100 = tt.broadcast %tmp35_99 : tensor<1x128xi1, #blocked> -> tensor<2x128xi1, #blocked> loc(#loc210) + %tmp35_101 = tt.load %tmp35_98, %tmp35_100, %cst_16 evictionPolicy = evict_last : tensor<2x128x!tt.ptr, #blocked> loc(#loc210) + %tmp35_102 = arith.extf %tmp35_101 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked> loc(#loc211) + %tmp42 = arith.mulf %tmp35_102, %tmp24_90 : tensor<2x128xf32, #blocked> loc(#loc212) + %tmp43 = tt.addptr %tmp58_62, %tmp17 : tensor<1x128x!tt.ptr, #blocked>, tensor<1x128xi32, #blocked> loc(#loc213) + %tmp43_103 = tt.broadcast %tmp43 : tensor<1x128x!tt.ptr, #blocked> -> tensor<2x128x!tt.ptr, #blocked> loc(#loc213) + %tmp43_104 = tt.load %tmp43_103, %tmp35_100, %cst_16 evictionPolicy = evict_last : tensor<2x128x!tt.ptr, #blocked> loc(#loc214) + %tmp43_105 = arith.extf %tmp43_104 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked> loc(#loc215) + %tmp45 = arith.mulf %tmp42, %tmp43_105 : tensor<2x128xf32, #blocked> loc(#loc216) + %tmp48 = tt.broadcast %tmp32 : tensor<1x128xi1, #blocked> -> tensor<2x128xi1, #blocked> loc(#loc217) + %tmp48_106 = arith.select %tmp48, %tmp45, %cst_19 : tensor<2x128xi1, #blocked>, tensor<2x128xf32, #blocked> loc(#loc217) + %tmp49 = arith.select %tmp31, %tmp29, %tmp48_106 : tensor<2x128xi1, #blocked>, tensor<2x128xf32, #blocked> loc(#loc262) + %tmp57 = arith.mulf %tmp6_59, %tmp24_91 : tensor<2x128xf32, #blocked1> loc(#loc219) + %tmp60 = ttg.convert_layout %tmp58_65 : tensor<1x128xf32, #blocked2> -> tensor<1x128xf32, #blocked1> loc(#loc220) + %tmp60_107 = tt.broadcast %tmp60 : tensor<1x128xf32, #blocked1> -> tensor<2x128xf32, #blocked1> loc(#loc220) + %tmp60_108 = arith.mulf %tmp57, %tmp60_107 : tensor<2x128xf32, #blocked1> loc(#loc220) + %tmp64 = arith.mulf %tmp60_108, %tmp63_70 : tensor<2x128xf32, #blocked1> loc(#loc221) + %tmp64_109 = ttg.convert_layout %tmp64 : tensor<2x128xf32, #blocked1> -> tensor<2x128xf32, #blocked> loc(#loc221) + %tmp67 = arith.mulf %tmp49, %tmp66_74 : tensor<2x128xf32, #blocked> loc(#loc222) + %tmp68 = arith.addf %tmp64_109, %tmp67 : tensor<2x128xf32, #blocked> loc(#loc223) + %tmp70 = arith.addi %tmp17, %cst : tensor<1x128xi32, #blocked> loc(#loc224) + %tmp70_110 = tt.broadcast %tmp70 : tensor<1x128xi32, #blocked> -> tensor<2x128xi32, #blocked> loc(#loc225) + %tmp70_111 = arith.addi %tmp70_110, %tmp0_42 : tensor<2x128xi32, #blocked> loc(#loc225) + %tmp70_112 = arith.addi %tmp70_111, %tmp0_47 : tensor<2x128xi32, #blocked> loc(#loc226) + %tmp70_113 = tt.addptr %tmp0_50, %tmp70_112 : tensor<2x128x!tt.ptr, #blocked>, tensor<2x128xi32, #blocked> loc(#loc227) + %tmp70_114 = tt.load %tmp70_113, %tmp17_87, %cst_16 evictionPolicy = evict_last : tensor<2x128x!tt.ptr, #blocked> loc(#loc228) + %tmp70_115 = arith.extf %tmp70_114 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked> loc(#loc229) + %tmp72 = arith.divf %tmp4_60, %cst_18 : tensor<2x1xf32, #blocked1> loc(#loc230) + %tmp73 = arith.addf %tmp72, %cst_17 : tensor<2x1xf32, #blocked1> loc(#loc231) + %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32, #blocked1>) -> tensor<2x1xf32, #blocked1> loc(#loc232) + %tmp75 = ttg.convert_layout %tmp74 : tensor<2x1xf32, #blocked1> -> tensor<2x1xf32, #blocked> loc(#loc233) + %tmp75_116 = tt.broadcast %tmp75 : tensor<2x1xf32, #blocked> -> tensor<2x128xf32, #blocked> loc(#loc233) + %tmp75_117 = tt.broadcast %tmp74 : tensor<2x1xf32, #blocked1> -> tensor<2x128xf32, #blocked1> loc(#loc233) + %tmp75_118 = arith.mulf %tmp70_115, %tmp75_116 : tensor<2x128xf32, #blocked> loc(#loc233) + %tmp76 = tt.addptr %tmp102_76, %tmp17_81 : tensor<1x128x!tt.ptr, #blocked>, tensor<1x128xi32, #blocked> loc(#loc234) + %tmp76_119 = tt.broadcast %tmp76 : tensor<1x128x!tt.ptr, #blocked> -> tensor<2x128x!tt.ptr, #blocked> loc(#loc234) + %tmp76_120 = tt.load %tmp76_119, %tmp17_87, %cst_16 evictionPolicy = evict_last : tensor<2x128x!tt.ptr, #blocked> loc(#loc235) + %tmp76_121 = arith.extf %tmp76_120 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked> loc(#loc236) + %tmp78 = arith.mulf %tmp75_118, %tmp76_121 : tensor<2x128xf32, #blocked> loc(#loc237) + %tmp80 = arith.subf %cst_19, %tmp78 : tensor<2x128xf32, #blocked> loc(#loc238) + %tmp83 = arith.addi %tmp17, %cst_7 : tensor<1x128xi32, #blocked> loc(#loc239) + %tmp83_122 = tt.broadcast %tmp83 : tensor<1x128xi32, #blocked> -> tensor<2x128xi32, #blocked> loc(#loc240) + %tmp83_123 = arith.addi %tmp83_122, %tmp0_42 : tensor<2x128xi32, #blocked> loc(#loc240) + %tmp83_124 = arith.addi %tmp83_123, %tmp0_47 : tensor<2x128xi32, #blocked> loc(#loc241) + %tmp83_125 = tt.addptr %tmp0_50, %tmp83_124 : tensor<2x128x!tt.ptr, #blocked>, tensor<2x128xi32, #blocked> loc(#loc242) + %tmp83_126 = tt.load %tmp83_125, %tmp35_100, %cst_16 evictionPolicy = evict_last : tensor<2x128x!tt.ptr, #blocked> loc(#loc243) + %tmp83_127 = arith.extf %tmp83_126 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked> loc(#loc244) + %tmp88 = arith.mulf %tmp83_127, %tmp75_116 : tensor<2x128xf32, #blocked> loc(#loc245) + %tmp89 = tt.addptr %tmp102_76, %tmp17 : tensor<1x128x!tt.ptr, #blocked>, tensor<1x128xi32, #blocked> loc(#loc246) + %tmp89_128 = tt.broadcast %tmp89 : tensor<1x128x!tt.ptr, #blocked> -> tensor<2x128x!tt.ptr, #blocked> loc(#loc246) + %tmp89_129 = tt.load %tmp89_128, %tmp35_100, %cst_16 evictionPolicy = evict_last : tensor<2x128x!tt.ptr, #blocked> loc(#loc247) + %tmp89_130 = arith.extf %tmp89_129 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked> loc(#loc248) + %tmp91 = arith.mulf %tmp88, %tmp89_130 : tensor<2x128xf32, #blocked> loc(#loc249) + %tmp94 = arith.select %tmp48, %tmp91, %cst_19 : tensor<2x128xi1, #blocked>, tensor<2x128xf32, #blocked> loc(#loc250) + %tmp95 = arith.select %tmp31, %tmp80, %tmp94 : tensor<2x128xi1, #blocked>, tensor<2x128xf32, #blocked> loc(#loc263) + %tmp101 = arith.mulf %tmp96_75, %tmp75_117 : tensor<2x128xf32, #blocked1> loc(#loc253) + %tmp101_131 = ttg.convert_layout %tmp101 : tensor<2x128xf32, #blocked1> -> tensor<2x128xf32, #blocked> loc(#loc253) + %tmp107 = ttg.convert_layout %tmp102_79 : tensor<1x128xf32, #blocked2> -> tensor<1x128xf32, #blocked> loc(#loc254) + %tmp104 = tt.broadcast %tmp107 : tensor<1x128xf32, #blocked> -> tensor<2x128xf32, #blocked> loc(#loc255) + %tmp104_132 = arith.mulf %tmp101_131, %tmp104 : tensor<2x128xf32, #blocked> loc(#loc255) + %tmp107_133 = arith.mulf %tmp104_132, %tmp63_71 : tensor<2x128xf32, #blocked> loc(#loc254) + %tmp109 = arith.mulf %tmp95, %tmp66_74 : tensor<2x128xf32, #blocked> loc(#loc256) + %tmp110 = arith.addf %tmp107_133, %tmp109 : tensor<2x128xf32, #blocked> loc(#loc257) + %0 = arith.muli %xindex_27, %cst_6 : tensor<2x1xi32, #blocked1> loc(#loc123) + %1 = tt.broadcast %0 : tensor<2x1xi32, #blocked1> -> tensor<2x128xi32, #blocked1> loc(#loc124) + %2 = arith.addi %tmp6, %1 : tensor<2x128xi32, #blocked1> loc(#loc124) + %3 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<2x128x!tt.ptr, #blocked1> loc(#loc125) + %4 = tt.addptr %3, %2 : tensor<2x128x!tt.ptr, #blocked1>, tensor<2x128xi32, #blocked1> loc(#loc125) + %5 = arith.truncf %tmp68 : tensor<2x128xf32, #blocked> to tensor<2x128xbf16, #blocked> loc(#loc126) + %6 = ttg.convert_layout %5 : tensor<2x128xbf16, #blocked> -> tensor<2x128xbf16, #blocked1> loc(#loc126) + tt.store %4, %6, %tmp0_52 : tensor<2x128x!tt.ptr, #blocked1> loc(#loc126) + %7 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<2x128x!tt.ptr, #blocked1> loc(#loc127) + %8 = tt.addptr %7, %2 : tensor<2x128x!tt.ptr, #blocked1>, tensor<2x128xi32, #blocked1> loc(#loc127) + %9 = arith.truncf %tmp110 : tensor<2x128xf32, #blocked> to tensor<2x128xbf16, #blocked> loc(#loc128) + %10 = ttg.convert_layout %9 : tensor<2x128xbf16, #blocked> -> tensor<2x128xbf16, #blocked1> loc(#loc128) + tt.store %8, %10, %tmp0_52 : tensor<2x128x!tt.ptr, #blocked1> loc(#loc128) + tt.return loc(#loc129) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc29 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc31 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc139 = loc("xoffset"(#loc2)) +#loc140 = loc("xoffset"(#loc3)) +#loc141 = loc("xindex"(#loc4)) +#loc142 = loc("xindex"(#loc5)) +#loc143 = loc("r0_base"(#loc6)) +#loc144 = loc("x0"(#loc7)) +#loc145 = loc("x1"(#loc8)) +#loc146 = loc("r0_mask"(#loc9)) +#loc147 = loc("tmp0"(#loc10)) +#loc148 = loc("tmp0"(#loc11)) +#loc149 = loc("tmp0"(#loc12)) +#loc150 = loc("tmp0"(#loc13)) +#loc151 = loc("tmp0"(#loc14)) +#loc152 = loc("tmp0"(#loc15)) +#loc153 = loc("tmp0"(#loc16)) +#loc154 = loc("tmp0"(#loc17)) +#loc155 = loc("tmp6"(#loc18)) +#loc156 = loc("tmp6"(#loc19)) +#loc157 = loc("tmp6"(#loc20)) +#loc158 = loc("tmp6"(#loc21)) +#loc159 = loc("tmp6"(#loc22)) +#loc160 = loc("tmp2"(#loc23)) +#loc161 = loc("tmp5"(#loc24)) +#loc162 = loc("_tmp4"(#loc25)) +#loc163 = loc("tmp8"(#loc26)) +#loc164 = loc("tmp11"(#loc27)) +#loc165 = loc("_tmp10"(#loc28)) +#loc167 = loc("tmp4"(#loc32)) +#loc169 = loc("tmp10"(#loc34)) +#loc170 = loc("r0_3"(#loc35)) +#loc171 = loc("r0_4"(#loc36)) +#loc172 = loc("tmp58"(#loc37)) +#loc173 = loc("tmp58"(#loc38)) +#loc174 = loc("tmp58"(#loc39)) +#loc175 = loc("tmp63"(#loc40)) +#loc176 = loc("tmp63"(#loc41)) +#loc177 = loc("tmp63"(#loc42)) +#loc178 = loc("tmp63"(#loc43)) +#loc179 = loc("tmp66"(#loc44)) +#loc180 = loc("tmp66"(#loc45)) +#loc181 = loc("tmp96"(#loc46)) +#loc182 = loc("tmp96"(#loc47)) +#loc183 = loc("tmp102"(#loc48)) +#loc184 = loc("tmp102"(#loc49)) +#loc185 = loc("tmp102"(#loc50)) +#loc186 = loc("tmp16"(#loc51)) +#loc187 = loc("tmp17"(#loc52)) +#loc188 = loc("tmp17"(#loc53)) +#loc189 = loc("tmp17"(#loc54)) +#loc190 = loc("tmp17"(#loc55)) +#loc191 = loc("tmp17"(#loc56)) +#loc192 = loc("tmp17"(#loc57)) +#loc193 = loc("tmp17"(#loc58)) +#loc194 = loc("tmp17"(#loc59)) +#loc195 = loc("tmp20"(#loc60)) +#loc196 = loc("tmp22"(#loc61)) +#loc197 = loc("tmp23"(#loc62)) +#loc198 = loc("tmp24"(#loc63)) +#loc199 = loc("tmp25"(#loc64)) +#loc200 = loc("tmp25"(#loc65)) +#loc201 = loc("tmp25"(#loc66)) +#loc202 = loc("tmp27"(#loc67)) +#loc203 = loc("tmp29"(#loc68)) +#loc204 = loc("tmp31"(#loc69)) +#loc205 = loc("tmp32"(#loc70)) +#loc206 = loc("tmp35"(#loc71)) +#loc207 = loc("tmp35"(#loc72)) +#loc208 = loc("tmp35"(#loc73)) +#loc209 = loc("tmp35"(#loc74)) +#loc210 = loc("tmp35"(#loc75)) +#loc211 = loc("tmp35"(#loc76)) +#loc212 = loc("tmp42"(#loc77)) +#loc213 = loc("tmp43"(#loc78)) +#loc214 = loc("tmp43"(#loc79)) +#loc215 = loc("tmp43"(#loc80)) +#loc216 = loc("tmp45"(#loc81)) +#loc217 = loc("tmp48"(#loc82)) +#loc218 = loc("tmp49"(#loc83)) +#loc219 = loc("tmp57"(#loc84)) +#loc220 = loc("tmp60"(#loc85)) +#loc221 = loc("tmp64"(#loc86)) +#loc222 = loc("tmp67"(#loc87)) +#loc223 = loc("tmp68"(#loc88)) +#loc224 = loc("tmp70"(#loc89)) +#loc225 = loc("tmp70"(#loc90)) +#loc226 = loc("tmp70"(#loc91)) +#loc227 = loc("tmp70"(#loc92)) +#loc228 = loc("tmp70"(#loc93)) +#loc229 = loc("tmp70"(#loc94)) +#loc230 = loc("tmp72"(#loc95)) +#loc231 = loc("tmp73"(#loc96)) +#loc232 = loc("tmp74"(#loc97)) +#loc233 = loc("tmp75"(#loc98)) +#loc234 = loc("tmp76"(#loc99)) +#loc235 = loc("tmp76"(#loc100)) +#loc236 = loc("tmp76"(#loc101)) +#loc237 = loc("tmp78"(#loc102)) +#loc238 = loc("tmp80"(#loc103)) +#loc239 = loc("tmp83"(#loc104)) +#loc240 = loc("tmp83"(#loc105)) +#loc241 = loc("tmp83"(#loc106)) +#loc242 = loc("tmp83"(#loc107)) +#loc243 = loc("tmp83"(#loc108)) +#loc244 = loc("tmp83"(#loc109)) +#loc245 = loc("tmp88"(#loc110)) +#loc246 = loc("tmp89"(#loc111)) +#loc247 = loc("tmp89"(#loc112)) +#loc248 = loc("tmp89"(#loc113)) +#loc249 = loc("tmp91"(#loc114)) +#loc250 = loc("tmp94"(#loc115)) +#loc251 = loc("tmp95"(#loc116)) +#loc252 = loc("tmp82"(#loc117)) +#loc253 = loc("tmp101"(#loc118)) +#loc254 = loc("tmp107"(#loc119)) +#loc255 = loc("tmp104"(#loc120)) +#loc256 = loc("tmp109"(#loc121)) +#loc257 = loc("tmp110"(#loc122)) +#loc258 = loc(callsite(#loc29 at #loc166)) +#loc260 = loc(callsite(#loc29 at #loc168)) +#loc262 = loc(fused[#loc218, #loc204]) +#loc263 = loc(fused[#loc251, #loc252]) +#loc264 = loc(callsite(#loc31 at #loc258)) +#loc265 = loc(callsite(#loc31 at #loc260)) diff --git a/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..393fb5118cb07cd0c2dc1a641c6a9ee3dbedd7e6 --- /dev/null +++ b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir @@ -0,0 +1,457 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc1 = loc(unknown) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc132 = loc("in_out_ptr0"(#loc)) +#loc133 = loc("in_out_ptr1"(#loc)) +#loc134 = loc("in_ptr0"(#loc)) +#loc135 = loc("in_ptr1"(#loc)) +#loc136 = loc("in_ptr2"(#loc)) +#loc137 = loc("in_ptr3"(#loc)) +#loc138 = loc("in_ptr4"(#loc)) +#loc139 = loc("xnumel"(#loc)) +#loc140 = loc("r0_numel"(#loc)) +#loc170 = loc("tmp4"(#loc32)) +#loc172 = loc("tmp10"(#loc35)) +#loc263 = loc(callsite(#loc1 at #loc170)) +#loc265 = loc(callsite(#loc1 at #loc172)) +module { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<1x128xbf16> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<2x128xbf16> loc(#loc1) + %cst_1 = arith.constant dense<4097> : tensor<1x128xi32> loc(#loc1) + %cst_2 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc1) + %cst_3 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc1) + %cst_5 = arith.constant dense<1> : tensor<1x128xi64> loc(#loc1) + %cst_6 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc1) + %cst_7 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc1) + %cst_8 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc1) + %cst_9 = arith.constant dense<4096> : tensor<1x128xi32> loc(#loc1) + %cst_10 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc1) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc1) + %cst_12 = arith.constant dense<32> : tensor<2x1xi32> loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc141) + %xoffset_13 = arith.muli %xoffset, %c2_i32 : i32 loc(#loc142) + %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc143) + %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32> -> tensor<2x1xi32> loc(#loc144) + %xindex_15 = tt.splat %xoffset_13 : i32 -> tensor<2x1xi32> loc(#loc145) + %xindex_16 = arith.addi %xindex_15, %xindex_14 : tensor<2x1xi32> loc(#loc145) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc146) + %r0_base_17 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc147) + %x0 = arith.remsi %xindex_16, %cst_12 : tensor<2x1xi32> loc(#loc148) + %x1 = arith.divsi %xindex_16, %cst_12 : tensor<2x1xi32> loc(#loc149) + %r0_mask = arith.cmpi slt, %r0_base_17, %cst_10 : tensor<1x128xi32> loc(#loc150) + %tmp0 = arith.addi %r0_base_17, %cst_9 : tensor<1x128xi32> loc(#loc151) + %tmp0_18 = arith.muli %x0, %cst_8 : tensor<2x1xi32> loc(#loc152) + %tmp0_19 = tt.broadcast %tmp0 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc153) + %tmp0_20 = tt.broadcast %tmp0_18 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc153) + %tmp0_21 = arith.addi %tmp0_19, %tmp0_20 : tensor<2x128xi32> loc(#loc153) + %tmp0_22 = arith.muli %x1, %cst_7 : tensor<2x1xi32> loc(#loc154) + %tmp0_23 = tt.broadcast %tmp0_22 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc155) + %tmp0_24 = arith.addi %tmp0_21, %tmp0_23 : tensor<2x128xi32> loc(#loc155) + %tmp0_25 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x128x!tt.ptr> loc(#loc156) + %tmp0_26 = tt.addptr %tmp0_25, %tmp0_24 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc156) + %tmp0_27 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc157) + %tmp0_28 = tt.load %tmp0_26, %tmp0_27, %cst_0 evictionPolicy = evict_last : tensor<2x128x!tt.ptr> loc(#loc157) + %tmp0_29 = arith.extf %tmp0_28 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc158) + %tmp6 = tt.broadcast %r0_base_17 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc159) + %tmp6_30 = arith.addi %tmp6, %tmp0_20 : tensor<2x128xi32> loc(#loc159) + %tmp6_31 = arith.addi %tmp6_30, %tmp0_23 : tensor<2x128xi32> loc(#loc160) + %tmp6_32 = tt.addptr %tmp0_25, %tmp6_31 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc161) + %tmp6_33 = tt.load %tmp6_32, %tmp0_27, %cst_0 evictionPolicy = evict_last : tensor<2x128x!tt.ptr> loc(#loc162) + %tmp6_34 = arith.extf %tmp6_33 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc163) + %tmp2 = arith.mulf %tmp0_29, %tmp0_29 : tensor<2x128xf32> loc(#loc164) + %tmp5 = arith.addf %tmp2, %cst_11 : tensor<2x128xf32> loc(#loc165) + %_tmp4 = arith.select %tmp0_27, %tmp5, %cst_11 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc166) + %tmp8 = arith.mulf %tmp6_34, %tmp6_34 : tensor<2x128xf32> loc(#loc167) + %tmp11 = arith.addf %tmp8, %cst_11 : tensor<2x128xf32> loc(#loc168) + %_tmp10 = arith.select %tmp0_27, %tmp11, %cst_11 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc169) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_98: f32 loc(callsite(#loc1 at #loc170)), %tmp4_99: f32 loc(callsite(#loc1 at #loc170))): + %tmp4_100 = arith.addf %tmp4_98, %tmp4_99 : f32 loc(#loc266) + tt.reduce.return %tmp4_100 : f32 loc(#loc262) + }) : (tensor<2x128xf32>) -> tensor<2xf32> loc(#loc262) + %tmp4_35 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<2xf32> -> tensor<2x1xf32> loc(#loc171) + %tmp10 = "tt.reduce"(%_tmp10) <{axis = 1 : i32}> ({ + ^bb0(%tmp10_98: f32 loc(callsite(#loc1 at #loc172)), %tmp10_99: f32 loc(callsite(#loc1 at #loc172))): + %tmp10_100 = arith.addf %tmp10_98, %tmp10_99 : f32 loc(#loc267) + tt.reduce.return %tmp10_100 : f32 loc(#loc264) + }) : (tensor<2x128xf32>) -> tensor<2xf32> loc(#loc264) + %tmp10_36 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<2xf32> -> tensor<2x1xf32> loc(#loc173) + %r0_3 = arith.remsi %r0_base_17, %cst_6 : tensor<1x128xi32> loc(#loc174) + %r0_4 = arith.divsi %r0_base_17, %cst_6 : tensor<1x128xi32> loc(#loc175) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc176) + %tmp58_37 = tt.addptr %tmp58, %r0_base_17 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc176) + %tmp58_38 = tt.load %tmp58_37, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x128x!tt.ptr> loc(#loc177) + %tmp58_39 = arith.extf %tmp58_38 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc178) + %tmp63 = arith.muli %x1, %cst_8 : tensor<2x1xi32> loc(#loc179) + %tmp63_40 = tt.broadcast %tmp63 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc180) + %tmp63_41 = arith.addi %tmp6, %tmp63_40 : tensor<2x128xi32> loc(#loc180) + %tmp63_42 = tt.splat %in_ptr2 : !tt.ptr -> tensor<2x128x!tt.ptr> loc(#loc181) + %tmp63_43 = tt.addptr %tmp63_42, %tmp63_41 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc181) + %tmp63_44 = tt.load %tmp63_43, %tmp0_27, %cst_11 evictionPolicy = evict_last : tensor<2x128x!tt.ptr> loc(#loc182) + %tmp66 = tt.splat %in_ptr3 : !tt.ptr -> tensor<2x128x!tt.ptr> loc(#loc183) + %tmp66_45 = tt.addptr %tmp66, %tmp63_41 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc183) + %tmp66_46 = tt.load %tmp66_45, %tmp0_27, %cst_11 evictionPolicy = evict_last : tensor<2x128x!tt.ptr> loc(#loc184) + %tmp96 = tt.load %tmp0_26, %tmp0_27, %cst_0 evictionPolicy = evict_first : tensor<2x128x!tt.ptr> loc(#loc185) + %tmp96_47 = arith.extf %tmp96 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc186) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc187) + %tmp102_48 = tt.addptr %tmp102, %r0_base_17 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc187) + %tmp102_49 = tt.load %tmp102_48, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x128x!tt.ptr> loc(#loc188) + %tmp102_50 = arith.extf %tmp102_49 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc189) + %tmp16 = arith.extsi %r0_3 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc190) + %tmp16_51 = arith.cmpi slt, %tmp16, %cst_5 : tensor<1x128xi64> loc(#loc190) + %tmp17 = arith.muli %r0_4, %cst_6 : tensor<1x128xi32> loc(#loc191) + %tmp17_52 = arith.addi %tmp17, %cst_4 : tensor<1x128xi32> loc(#loc192) + %tmp17_53 = tt.broadcast %tmp17_52 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc193) + %tmp17_54 = arith.addi %tmp17_53, %tmp0_20 : tensor<2x128xi32> loc(#loc193) + %tmp17_55 = arith.addi %tmp17_54, %tmp0_23 : tensor<2x128xi32> loc(#loc194) + %tmp17_56 = tt.addptr %tmp0_25, %tmp17_55 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc195) + %tmp17_57 = arith.andi %r0_mask, %tmp16_51 : tensor<1x128xi1> loc(#loc196) + %tmp17_58 = tt.broadcast %tmp17_57 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc197) + %tmp17_59 = tt.load %tmp17_56, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<2x128x!tt.ptr> loc(#loc197) + %tmp17_60 = arith.extf %tmp17_59 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc198) + %tmp20 = arith.divf %tmp10_36, %cst_3 : tensor<2x1xf32> loc(#loc199) + %tmp22 = arith.addf %tmp20, %cst_2 : tensor<2x1xf32> loc(#loc200) + %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc201) + %tmp24 = tt.broadcast %tmp23 : tensor<2x1xf32> -> tensor<2x128xf32> loc(#loc202) + %tmp24_61 = arith.mulf %tmp17_60, %tmp24 : tensor<2x128xf32> loc(#loc202) + %tmp25 = tt.addptr %tmp58, %tmp17_52 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc203) + %tmp25_62 = tt.broadcast %tmp25 : tensor<1x128x!tt.ptr> -> tensor<2x128x!tt.ptr> loc(#loc203) + %tmp25_63 = tt.load %tmp25_62, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<2x128x!tt.ptr> loc(#loc204) + %tmp25_64 = arith.extf %tmp25_63 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc205) + %tmp27 = arith.mulf %tmp24_61, %tmp25_64 : tensor<2x128xf32> loc(#loc206) + %tmp29 = arith.subf %cst_11, %tmp27 : tensor<2x128xf32> loc(#loc207) + %tmp31 = tt.broadcast %tmp16_51 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc208) + %tmp31_65 = arith.select %tmp31, %tmp29, %cst_11 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc208) + %tmp32 = arith.cmpi sge, %tmp16, %cst_5 : tensor<1x128xi64> loc(#loc209) + %tmp35 = tt.broadcast %tmp17 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc210) + %tmp35_66 = arith.addi %tmp35, %tmp0_20 : tensor<2x128xi32> loc(#loc210) + %tmp35_67 = arith.addi %tmp35_66, %tmp0_23 : tensor<2x128xi32> loc(#loc211) + %tmp35_68 = tt.addptr %tmp0_25, %tmp35_67 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc212) + %tmp35_69 = arith.andi %r0_mask, %tmp32 : tensor<1x128xi1> loc(#loc213) + %tmp35_70 = tt.broadcast %tmp35_69 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc214) + %tmp35_71 = tt.load %tmp35_68, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<2x128x!tt.ptr> loc(#loc214) + %tmp35_72 = arith.extf %tmp35_71 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc215) + %tmp42 = arith.mulf %tmp35_72, %tmp24 : tensor<2x128xf32> loc(#loc216) + %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc217) + %tmp43_73 = tt.broadcast %tmp43 : tensor<1x128x!tt.ptr> -> tensor<2x128x!tt.ptr> loc(#loc217) + %tmp43_74 = tt.load %tmp43_73, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<2x128x!tt.ptr> loc(#loc218) + %tmp43_75 = arith.extf %tmp43_74 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc219) + %tmp45 = arith.mulf %tmp42, %tmp43_75 : tensor<2x128xf32> loc(#loc220) + %tmp48 = tt.broadcast %tmp32 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc221) + %tmp48_76 = arith.select %tmp48, %tmp45, %cst_11 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc221) + %tmp49 = arith.select %tmp31, %tmp31_65, %tmp48_76 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc222) + %tmp57 = arith.mulf %tmp6_34, %tmp24 : tensor<2x128xf32> loc(#loc223) + %tmp60 = tt.broadcast %tmp58_39 : tensor<1x128xf32> -> tensor<2x128xf32> loc(#loc224) + %tmp60_77 = arith.mulf %tmp57, %tmp60 : tensor<2x128xf32> loc(#loc224) + %tmp64 = arith.mulf %tmp60_77, %tmp63_44 : tensor<2x128xf32> loc(#loc225) + %tmp67 = arith.mulf %tmp49, %tmp66_46 : tensor<2x128xf32> loc(#loc226) + %tmp68 = arith.addf %tmp64, %tmp67 : tensor<2x128xf32> loc(#loc227) + %tmp70 = arith.addi %tmp17, %cst_1 : tensor<1x128xi32> loc(#loc228) + %tmp70_78 = tt.broadcast %tmp70 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc229) + %tmp70_79 = arith.addi %tmp70_78, %tmp0_20 : tensor<2x128xi32> loc(#loc229) + %tmp70_80 = arith.addi %tmp70_79, %tmp0_23 : tensor<2x128xi32> loc(#loc230) + %tmp70_81 = tt.addptr %tmp0_25, %tmp70_80 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc231) + %tmp70_82 = tt.load %tmp70_81, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<2x128x!tt.ptr> loc(#loc232) + %tmp70_83 = arith.extf %tmp70_82 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc233) + %tmp72 = arith.divf %tmp4_35, %cst_3 : tensor<2x1xf32> loc(#loc234) + %tmp73 = arith.addf %tmp72, %cst_2 : tensor<2x1xf32> loc(#loc235) + %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc236) + %tmp75 = tt.broadcast %tmp74 : tensor<2x1xf32> -> tensor<2x128xf32> loc(#loc237) + %tmp75_84 = arith.mulf %tmp70_83, %tmp75 : tensor<2x128xf32> loc(#loc237) + %tmp76 = tt.addptr %tmp102, %tmp17_52 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc238) + %tmp76_85 = tt.broadcast %tmp76 : tensor<1x128x!tt.ptr> -> tensor<2x128x!tt.ptr> loc(#loc238) + %tmp76_86 = tt.load %tmp76_85, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<2x128x!tt.ptr> loc(#loc239) + %tmp76_87 = arith.extf %tmp76_86 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc240) + %tmp78 = arith.mulf %tmp75_84, %tmp76_87 : tensor<2x128xf32> loc(#loc241) + %tmp80 = arith.subf %cst_11, %tmp78 : tensor<2x128xf32> loc(#loc242) + %tmp82 = arith.select %tmp31, %tmp80, %cst_11 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc243) + %tmp83 = arith.addi %tmp17, %cst_9 : tensor<1x128xi32> loc(#loc244) + %tmp83_88 = tt.broadcast %tmp83 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc245) + %tmp83_89 = arith.addi %tmp83_88, %tmp0_20 : tensor<2x128xi32> loc(#loc245) + %tmp83_90 = arith.addi %tmp83_89, %tmp0_23 : tensor<2x128xi32> loc(#loc246) + %tmp83_91 = tt.addptr %tmp0_25, %tmp83_90 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc247) + %tmp83_92 = tt.load %tmp83_91, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<2x128x!tt.ptr> loc(#loc248) + %tmp83_93 = arith.extf %tmp83_92 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc249) + %tmp88 = arith.mulf %tmp83_93, %tmp75 : tensor<2x128xf32> loc(#loc250) + %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc251) + %tmp89_94 = tt.broadcast %tmp89 : tensor<1x128x!tt.ptr> -> tensor<2x128x!tt.ptr> loc(#loc251) + %tmp89_95 = tt.load %tmp89_94, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<2x128x!tt.ptr> loc(#loc252) + %tmp89_96 = arith.extf %tmp89_95 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc253) + %tmp91 = arith.mulf %tmp88, %tmp89_96 : tensor<2x128xf32> loc(#loc254) + %tmp94 = arith.select %tmp48, %tmp91, %cst_11 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc255) + %tmp95 = arith.select %tmp31, %tmp82, %tmp94 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc256) + %tmp101 = arith.mulf %tmp96_47, %tmp75 : tensor<2x128xf32> loc(#loc257) + %tmp104 = tt.broadcast %tmp102_50 : tensor<1x128xf32> -> tensor<2x128xf32> loc(#loc258) + %tmp104_97 = arith.mulf %tmp101, %tmp104 : tensor<2x128xf32> loc(#loc258) + %tmp107 = arith.mulf %tmp104_97, %tmp63_44 : tensor<2x128xf32> loc(#loc259) + %tmp109 = arith.mulf %tmp95, %tmp66_46 : tensor<2x128xf32> loc(#loc260) + %tmp110 = arith.addf %tmp107, %tmp109 : tensor<2x128xf32> loc(#loc261) + %0 = arith.muli %xindex_16, %cst_8 : tensor<2x1xi32> loc(#loc125) + %1 = tt.broadcast %0 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc126) + %2 = arith.addi %tmp6, %1 : tensor<2x128xi32> loc(#loc126) + %3 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<2x128x!tt.ptr> loc(#loc127) + %4 = tt.addptr %3, %2 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc127) + %5 = arith.truncf %tmp68 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc128) + tt.store %4, %5, %tmp0_27 : tensor<2x128x!tt.ptr> loc(#loc128) + %6 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<2x128x!tt.ptr> loc(#loc129) + %7 = tt.addptr %6, %2 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc129) + %8 = arith.truncf %tmp110 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc130) + tt.store %7, %8, %tmp0_27 : tensor<2x128x!tt.ptr> loc(#loc130) + tt.return loc(#loc131) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc31 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc141 = loc("xoffset"(#loc2)) +#loc142 = loc("xoffset"(#loc3)) +#loc143 = loc("xindex"(#loc4)) +#loc144 = loc("xindex"(#loc5)) +#loc145 = loc("xindex"(#loc6)) +#loc146 = loc("r0_base"(#loc7)) +#loc147 = loc("r0_base"(#loc8)) +#loc148 = loc("x0"(#loc9)) +#loc149 = loc("x1"(#loc10)) +#loc150 = loc("r0_mask"(#loc11)) +#loc151 = loc("tmp0"(#loc12)) +#loc152 = loc("tmp0"(#loc13)) +#loc153 = loc("tmp0"(#loc14)) +#loc154 = loc("tmp0"(#loc15)) +#loc155 = loc("tmp0"(#loc16)) +#loc156 = loc("tmp0"(#loc17)) +#loc157 = loc("tmp0"(#loc18)) +#loc158 = loc("tmp0"(#loc19)) +#loc159 = loc("tmp6"(#loc20)) +#loc160 = loc("tmp6"(#loc21)) +#loc161 = loc("tmp6"(#loc22)) +#loc162 = loc("tmp6"(#loc23)) +#loc163 = loc("tmp6"(#loc24)) +#loc164 = loc("tmp2"(#loc25)) +#loc165 = loc("tmp5"(#loc26)) +#loc166 = loc("_tmp4"(#loc27)) +#loc167 = loc("tmp8"(#loc28)) +#loc168 = loc("tmp11"(#loc29)) +#loc169 = loc("_tmp10"(#loc30)) +#loc171 = loc("tmp4"(#loc34)) +#loc173 = loc("tmp10"(#loc36)) +#loc174 = loc("r0_3"(#loc37)) +#loc175 = loc("r0_4"(#loc38)) +#loc176 = loc("tmp58"(#loc39)) +#loc177 = loc("tmp58"(#loc40)) +#loc178 = loc("tmp58"(#loc41)) +#loc179 = loc("tmp63"(#loc42)) +#loc180 = loc("tmp63"(#loc43)) +#loc181 = loc("tmp63"(#loc44)) +#loc182 = loc("tmp63"(#loc45)) +#loc183 = loc("tmp66"(#loc46)) +#loc184 = loc("tmp66"(#loc47)) +#loc185 = loc("tmp96"(#loc48)) +#loc186 = loc("tmp96"(#loc49)) +#loc187 = loc("tmp102"(#loc50)) +#loc188 = loc("tmp102"(#loc51)) +#loc189 = loc("tmp102"(#loc52)) +#loc190 = loc("tmp16"(#loc53)) +#loc191 = loc("tmp17"(#loc54)) +#loc192 = loc("tmp17"(#loc55)) +#loc193 = loc("tmp17"(#loc56)) +#loc194 = loc("tmp17"(#loc57)) +#loc195 = loc("tmp17"(#loc58)) +#loc196 = loc("tmp17"(#loc59)) +#loc197 = loc("tmp17"(#loc60)) +#loc198 = loc("tmp17"(#loc61)) +#loc199 = loc("tmp20"(#loc62)) +#loc200 = loc("tmp22"(#loc63)) +#loc201 = loc("tmp23"(#loc64)) +#loc202 = loc("tmp24"(#loc65)) +#loc203 = loc("tmp25"(#loc66)) +#loc204 = loc("tmp25"(#loc67)) +#loc205 = loc("tmp25"(#loc68)) +#loc206 = loc("tmp27"(#loc69)) +#loc207 = loc("tmp29"(#loc70)) +#loc208 = loc("tmp31"(#loc71)) +#loc209 = loc("tmp32"(#loc72)) +#loc210 = loc("tmp35"(#loc73)) +#loc211 = loc("tmp35"(#loc74)) +#loc212 = loc("tmp35"(#loc75)) +#loc213 = loc("tmp35"(#loc76)) +#loc214 = loc("tmp35"(#loc77)) +#loc215 = loc("tmp35"(#loc78)) +#loc216 = loc("tmp42"(#loc79)) +#loc217 = loc("tmp43"(#loc80)) +#loc218 = loc("tmp43"(#loc81)) +#loc219 = loc("tmp43"(#loc82)) +#loc220 = loc("tmp45"(#loc83)) +#loc221 = loc("tmp48"(#loc84)) +#loc222 = loc("tmp49"(#loc85)) +#loc223 = loc("tmp57"(#loc86)) +#loc224 = loc("tmp60"(#loc87)) +#loc225 = loc("tmp64"(#loc88)) +#loc226 = loc("tmp67"(#loc89)) +#loc227 = loc("tmp68"(#loc90)) +#loc228 = loc("tmp70"(#loc91)) +#loc229 = loc("tmp70"(#loc92)) +#loc230 = loc("tmp70"(#loc93)) +#loc231 = loc("tmp70"(#loc94)) +#loc232 = loc("tmp70"(#loc95)) +#loc233 = loc("tmp70"(#loc96)) +#loc234 = loc("tmp72"(#loc97)) +#loc235 = loc("tmp73"(#loc98)) +#loc236 = loc("tmp74"(#loc99)) +#loc237 = loc("tmp75"(#loc100)) +#loc238 = loc("tmp76"(#loc101)) +#loc239 = loc("tmp76"(#loc102)) +#loc240 = loc("tmp76"(#loc103)) +#loc241 = loc("tmp78"(#loc104)) +#loc242 = loc("tmp80"(#loc105)) +#loc243 = loc("tmp82"(#loc106)) +#loc244 = loc("tmp83"(#loc107)) +#loc245 = loc("tmp83"(#loc108)) +#loc246 = loc("tmp83"(#loc109)) +#loc247 = loc("tmp83"(#loc110)) +#loc248 = loc("tmp83"(#loc111)) +#loc249 = loc("tmp83"(#loc112)) +#loc250 = loc("tmp88"(#loc113)) +#loc251 = loc("tmp89"(#loc114)) +#loc252 = loc("tmp89"(#loc115)) +#loc253 = loc("tmp89"(#loc116)) +#loc254 = loc("tmp91"(#loc117)) +#loc255 = loc("tmp94"(#loc118)) +#loc256 = loc("tmp95"(#loc119)) +#loc257 = loc("tmp101"(#loc120)) +#loc258 = loc("tmp104"(#loc121)) +#loc259 = loc("tmp107"(#loc122)) +#loc260 = loc("tmp109"(#loc123)) +#loc261 = loc("tmp110"(#loc124)) +#loc262 = loc(callsite(#loc31 at #loc170)) +#loc264 = loc(callsite(#loc31 at #loc172)) +#loc266 = loc(callsite(#loc33 at #loc262)) +#loc267 = loc(callsite(#loc33 at #loc264)) diff --git a/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/__grp__triton_poi_fused_cat_mul_silu_split_view_0.json b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/__grp__triton_poi_fused_cat_mul_silu_split_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6384c258fc10549d2cc1b20c3800727ad07d4c8c --- /dev/null +++ b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/__grp__triton_poi_fused_cat_mul_silu_split_view_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_cat_mul_silu_split_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.source", "triton_poi_fused_cat_mul_silu_split_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.ttir", "triton_poi_fused_cat_mul_silu_split_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.ttgir", "triton_poi_fused_cat_mul_silu_split_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.llir", "triton_poi_fused_cat_mul_silu_split_view_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.ptx", "triton_poi_fused_cat_mul_silu_split_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.cubin", "triton_poi_fused_cat_mul_silu_split_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.json"}} \ No newline at end of file diff --git a/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.cubin b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..1891ade5ad01b38b9446b11bd4f77b5bd72a7432 Binary files /dev/null and b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.cubin differ diff --git a/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.json b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fc26b795ff4db5e20eb6bac332a4e6a2b9182a5c --- /dev/null +++ b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.json @@ -0,0 +1 @@ +{"hash": "05bb912de2e2021470855bb0e701ec375c8ca31df7283c3624079c6a67c9578d", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_cat_mul_silu_split_view_0"} \ No newline at end of file diff --git a/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.llir b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..278b73fe5427df4c4bf2ea510e9fa5f944c90135 --- /dev/null +++ b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.llir @@ -0,0 +1,214 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_cat_mul_silu_split_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = shl i32 %7, 10, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = shl nuw nsw i32 %9, 3, !dbg !9 + %11 = and i32 %10, 1016, !dbg !9 + %12 = or disjoint i32 %11, %8, !dbg !10 + %13 = sdiv i32 %12, 16384, !dbg !11 + %14 = mul i32 %13, 16384, !dbg !12 + %.decomposed = sub i32 %12, %14, !dbg !12 + %15 = icmp slt i32 %.decomposed, 4096, !dbg !13 + %16 = shl nsw i32 %13, 12, !dbg !14 + %17 = add nsw i32 %16, %.decomposed, !dbg !15 + %18 = sext i32 %17 to i64, !dbg !16 + %19 = getelementptr bfloat, ptr addrspace(1) %0, i64 %18, !dbg !16 + %20 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !17 + %21 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %19, i64 %20, i1 %15) #3, !dbg !17 + %22 = extractvalue { i32, i32, i32, i32 } %21, 0, !dbg !17 + %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !17 + %24 = extractvalue { i32, i32, i32, i32 } %21, 1, !dbg !17 + %25 = bitcast i32 %24 to <2 x bfloat>, !dbg !17 + %26 = extractvalue { i32, i32, i32, i32 } %21, 2, !dbg !17 + %27 = bitcast i32 %26 to <2 x bfloat>, !dbg !17 + %28 = extractvalue { i32, i32, i32, i32 } %21, 3, !dbg !17 + %29 = bitcast i32 %28 to <2 x bfloat>, !dbg !17 + %30 = icmp sgt i32 %.decomposed, 4095, !dbg !18 + %31 = mul i32 %13, 36864, !dbg !19 + %32 = add nsw i32 %.decomposed, -4096, !dbg !20 + %33 = add i32 %31, %32, !dbg !21 + %34 = sext i32 %33 to i64, !dbg !22 + %35 = getelementptr bfloat, ptr addrspace(1) %1, i64 %34, !dbg !22 + %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !23 + %37 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %35, i64 %36, i1 %30) #3, !dbg !23 + %38 = extractvalue { i32, i32, i32, i32 } %37, 0, !dbg !23 + %39 = bitcast i32 %38 to <2 x bfloat>, !dbg !23 + %40 = extractvalue { i32, i32, i32, i32 } %37, 1, !dbg !23 + %41 = bitcast i32 %40 to <2 x bfloat>, !dbg !23 + %42 = extractvalue { i32, i32, i32, i32 } %37, 2, !dbg !23 + %43 = bitcast i32 %42 to <2 x bfloat>, !dbg !23 + %44 = extractvalue { i32, i32, i32, i32 } %37, 3, !dbg !23 + %45 = bitcast i32 %44 to <2 x bfloat>, !dbg !23 + %46 = add i32 %33, 12288, !dbg !24 + %47 = sext i32 %46 to i64, !dbg !25 + %48 = getelementptr bfloat, ptr addrspace(1) %1, i64 %47, !dbg !25 + %49 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !26 + %50 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %48, i64 %49, i1 %30) #3, !dbg !26 + %51 = extractvalue { i32, i32, i32, i32 } %50, 0, !dbg !26 + %52 = bitcast i32 %51 to <2 x bfloat>, !dbg !26 + %53 = extractvalue { i32, i32, i32, i32 } %50, 1, !dbg !26 + %54 = bitcast i32 %53 to <2 x bfloat>, !dbg !26 + %55 = extractvalue { i32, i32, i32, i32 } %50, 2, !dbg !26 + %56 = bitcast i32 %55 to <2 x bfloat>, !dbg !26 + %57 = extractvalue { i32, i32, i32, i32 } %50, 3, !dbg !26 + %58 = bitcast i32 %57 to <2 x bfloat>, !dbg !26 + %59 = sext i32 %12 to i64, !dbg !27 + %60 = getelementptr bfloat, ptr addrspace(1) %2, i64 %59, !dbg !27 + %61 = fpext <2 x bfloat> %39 to <2 x float>, !dbg !28 + %62 = extractelement <2 x float> %61, i64 0, !dbg !29 + %63 = fsub float 0.000000e+00, %62, !dbg !29 + %64 = extractelement <2 x float> %61, i64 1, !dbg !29 + %65 = fsub float 0.000000e+00, %64, !dbg !29 + %66 = fmul float %63, 0x3FF7154760000000, !dbg !34 + %67 = tail call float @llvm.nvvm.ex2.approx.f(float %66), !dbg !34 + %68 = fmul float %65, 0x3FF7154760000000, !dbg !34 + %69 = tail call float @llvm.nvvm.ex2.approx.f(float %68), !dbg !34 + %70 = fadd float %67, 1.000000e+00, !dbg !35 + %71 = fadd float %69, 1.000000e+00, !dbg !35 + %72 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %70), !dbg !36 + %73 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %71), !dbg !36 + %74 = insertelement <2 x float> poison, float %72, i64 0, !dbg !37 + %75 = insertelement <2 x float> %74, float %73, i64 1, !dbg !37 + %76 = fmul <2 x float> %75, %61, !dbg !37 + %77 = fpext <2 x bfloat> %52 to <2 x float>, !dbg !38 + %78 = fmul <2 x float> %76, %77, !dbg !39 + %79 = fptrunc <2 x float> %78 to <2 x bfloat>, !dbg !40 + %80 = insertelement <2 x i1> poison, i1 %15, i64 0, !dbg !41 + %81 = shufflevector <2 x i1> %80, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !41 + %82 = select <2 x i1> %81, <2 x bfloat> %23, <2 x bfloat> %79, !dbg !41 + %83 = fpext <2 x bfloat> %41 to <2 x float>, !dbg !28 + %84 = extractelement <2 x float> %83, i64 0, !dbg !29 + %85 = fsub float 0.000000e+00, %84, !dbg !29 + %86 = extractelement <2 x float> %83, i64 1, !dbg !29 + %87 = fsub float 0.000000e+00, %86, !dbg !29 + %88 = fmul float %85, 0x3FF7154760000000, !dbg !34 + %89 = tail call float @llvm.nvvm.ex2.approx.f(float %88), !dbg !34 + %90 = fmul float %87, 0x3FF7154760000000, !dbg !34 + %91 = tail call float @llvm.nvvm.ex2.approx.f(float %90), !dbg !34 + %92 = fadd float %89, 1.000000e+00, !dbg !35 + %93 = fadd float %91, 1.000000e+00, !dbg !35 + %94 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %92), !dbg !36 + %95 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %93), !dbg !36 + %96 = insertelement <2 x float> poison, float %94, i64 0, !dbg !37 + %97 = insertelement <2 x float> %96, float %95, i64 1, !dbg !37 + %98 = fmul <2 x float> %97, %83, !dbg !37 + %99 = fpext <2 x bfloat> %54 to <2 x float>, !dbg !38 + %100 = fmul <2 x float> %98, %99, !dbg !39 + %101 = fptrunc <2 x float> %100 to <2 x bfloat>, !dbg !40 + %102 = select <2 x i1> %81, <2 x bfloat> %25, <2 x bfloat> %101, !dbg !41 + %103 = fpext <2 x bfloat> %43 to <2 x float>, !dbg !28 + %104 = extractelement <2 x float> %103, i64 0, !dbg !29 + %105 = fsub float 0.000000e+00, %104, !dbg !29 + %106 = extractelement <2 x float> %103, i64 1, !dbg !29 + %107 = fsub float 0.000000e+00, %106, !dbg !29 + %108 = fmul float %105, 0x3FF7154760000000, !dbg !34 + %109 = tail call float @llvm.nvvm.ex2.approx.f(float %108), !dbg !34 + %110 = fmul float %107, 0x3FF7154760000000, !dbg !34 + %111 = tail call float @llvm.nvvm.ex2.approx.f(float %110), !dbg !34 + %112 = fadd float %109, 1.000000e+00, !dbg !35 + %113 = fadd float %111, 1.000000e+00, !dbg !35 + %114 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %112), !dbg !36 + %115 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %113), !dbg !36 + %116 = insertelement <2 x float> poison, float %114, i64 0, !dbg !37 + %117 = insertelement <2 x float> %116, float %115, i64 1, !dbg !37 + %118 = fmul <2 x float> %117, %103, !dbg !37 + %119 = fpext <2 x bfloat> %56 to <2 x float>, !dbg !38 + %120 = fmul <2 x float> %118, %119, !dbg !39 + %121 = fptrunc <2 x float> %120 to <2 x bfloat>, !dbg !40 + %122 = select <2 x i1> %81, <2 x bfloat> %27, <2 x bfloat> %121, !dbg !41 + %123 = fpext <2 x bfloat> %45 to <2 x float>, !dbg !28 + %124 = extractelement <2 x float> %123, i64 0, !dbg !29 + %125 = fsub float 0.000000e+00, %124, !dbg !29 + %126 = extractelement <2 x float> %123, i64 1, !dbg !29 + %127 = fsub float 0.000000e+00, %126, !dbg !29 + %128 = fmul float %125, 0x3FF7154760000000, !dbg !34 + %129 = tail call float @llvm.nvvm.ex2.approx.f(float %128), !dbg !34 + %130 = fmul float %127, 0x3FF7154760000000, !dbg !34 + %131 = tail call float @llvm.nvvm.ex2.approx.f(float %130), !dbg !34 + %132 = fadd float %129, 1.000000e+00, !dbg !35 + %133 = fadd float %131, 1.000000e+00, !dbg !35 + %134 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %132), !dbg !36 + %135 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %133), !dbg !36 + %136 = insertelement <2 x float> poison, float %134, i64 0, !dbg !37 + %137 = insertelement <2 x float> %136, float %135, i64 1, !dbg !37 + %138 = fmul <2 x float> %137, %123, !dbg !37 + %139 = fpext <2 x bfloat> %58 to <2 x float>, !dbg !38 + %140 = fmul <2 x float> %138, %139, !dbg !39 + %141 = fptrunc <2 x float> %140 to <2 x bfloat>, !dbg !40 + %142 = select <2 x i1> %81, <2 x bfloat> %29, <2 x bfloat> %141, !dbg !41 + %143 = bitcast <2 x bfloat> %82 to i32, !dbg !40 + %144 = bitcast <2 x bfloat> %102 to i32, !dbg !40 + %145 = bitcast <2 x bfloat> %122 to i32, !dbg !40 + %146 = bitcast <2 x bfloat> %142 to i32, !dbg !40 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %143, i32 %144, i32 %145, i32 %146, ptr addrspace(1) %60) #3, !dbg !40 + ret void, !dbg !42 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.f(float) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #3 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_cat_mul_silu_split_view_0", linkageName: "triton_poi_fused_cat_mul_silu_split_view_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 19, scope: !4) +!12 = !DILocation(line: 23, column: 19, scope: !4) +!13 = !DILocation(line: 30, column: 18, scope: !4) +!14 = !DILocation(line: 31, column: 35, scope: !4) +!15 = !DILocation(line: 31, column: 41, scope: !4) +!16 = !DILocation(line: 31, column: 30, scope: !4) +!17 = !DILocation(line: 31, column: 47, scope: !4) +!18 = !DILocation(line: 32, column: 19, scope: !4) +!19 = !DILocation(line: 35, column: 36, scope: !4) +!20 = !DILocation(line: 35, column: 52, scope: !4) +!21 = !DILocation(line: 35, column: 42, scope: !4) +!22 = !DILocation(line: 35, column: 30, scope: !4) +!23 = !DILocation(line: 35, column: 58, scope: !4) +!24 = !DILocation(line: 40, column: 51, scope: !4) +!25 = !DILocation(line: 40, column: 31, scope: !4) +!26 = !DILocation(line: 40, column: 67, scope: !4) +!27 = !DILocation(line: 45, column: 25, scope: !4) +!28 = !DILocation(line: 35, column: 108, scope: !4) +!29 = !DILocation(line: 50, column: 30, scope: !30, inlinedAt: !32) +!30 = distinct !DILexicalBlockFile(scope: !4, file: !31, discriminator: 0) +!31 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!32 = !DILocation(line: 37, column: 23, scope: !33) +!33 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0) +!34 = !DILocation(line: 50, column: 29, scope: !30, inlinedAt: !32) +!35 = !DILocation(line: 50, column: 20, scope: !30, inlinedAt: !32) +!36 = !DILocation(line: 50, column: 16, scope: !30, inlinedAt: !32) +!37 = !DILocation(line: 38, column: 20, scope: !4) +!38 = !DILocation(line: 40, column: 117, scope: !4) +!39 = !DILocation(line: 41, column: 20, scope: !4) +!40 = !DILocation(line: 45, column: 37, scope: !4) +!41 = !DILocation(line: 44, column: 33, scope: !4) +!42 = !DILocation(line: 45, column: 4, scope: !4) diff --git a/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.ptx b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..cfa287e4d55b3317c15267f3e39fa70f6ad45d11 --- /dev/null +++ b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.ptx @@ -0,0 +1,613 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused_cat_mul_silu_split_view_0 // -- Begin function triton_poi_fused_cat_mul_silu_split_view_0 + // @triton_poi_fused_cat_mul_silu_split_view_0 +.visible .entry triton_poi_fused_cat_mul_silu_split_view_0( + .param .u64 .ptr .global .align 1 triton_poi_fused_cat_mul_silu_split_view_0_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_cat_mul_silu_split_view_0_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_cat_mul_silu_split_view_0_param_2, + .param .u32 triton_poi_fused_cat_mul_silu_split_view_0_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused_cat_mul_silu_split_view_0_param_4, + .param .u64 .ptr .global .align 1 triton_poi_fused_cat_mul_silu_split_view_0_param_5 +) +.reqntid 128 +{ + .reg .pred %p<3>; + .reg .b16 %rs<41>; + .reg .b32 %r<109>; + .reg .b64 %rd<11>; + .loc 1 18 0 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:18:0 + +// %bb.0: + ld.param.b64 %rd8, [triton_poi_fused_cat_mul_silu_split_view_0_param_0]; + ld.param.b64 %rd9, [triton_poi_fused_cat_mul_silu_split_view_0_param_1]; +$L__tmp0: + .loc 1 20 28 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:20:28 + mov.u32 %r18, %ctaid.x; + .loc 1 20 33 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:20:33 + shl.b32 %r19, %r18, 10; + ld.param.b64 %rd10, [triton_poi_fused_cat_mul_silu_split_view_0_param_2]; + .loc 1 21 36 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:21:36 + mov.u32 %r20, %tid.x; + shl.b32 %r21, %r20, 3; + and.b32 %r22, %r21, 1016; + .loc 1 21 23 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:21:23 + or.b32 %r23, %r22, %r19; + .loc 1 24 19 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:24:19 + bfe.s32 %r24, %r18, 21, 1; + shr.u32 %r25, %r24, 18; + add.s32 %r26, %r23, %r25; + shr.s32 %r27, %r26, 14; + .loc 1 23 19 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:23:19 + and.b32 %r28, %r26, -16384; + sub.s32 %r29, %r23, %r28; + .loc 1 30 18 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:30:18 + setp.lt.s32 %p1, %r29, 4096; + .loc 1 31 35 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:31:35 + shl.b32 %r30, %r27, 12; + .loc 1 31 41 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:31:41 + add.s32 %r31, %r30, %r29; + .loc 1 31 30 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:31:30 + mad.wide.s32 %rd1, %r31, 2, %rd8; + .loc 1 31 47 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:31:47 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r5, 0; + // begin inline asm + mov.u32 %r1, %r5; + mov.u32 %r2, %r5; + mov.u32 %r3, %r5; + mov.u32 %r4, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2; + // end inline asm + .loc 1 32 19 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:32:19 + setp.gt.s32 %p2, %r29, 4095; + .loc 1 35 52 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:52 + mad.lo.s32 %r32, %r27, 36864, %r29; + .loc 1 35 42 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:42 + add.s32 %r33, %r32, -4096; + .loc 1 35 30 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:30 + mad.wide.s32 %rd3, %r33, 2, %rd9; + .loc 1 35 58 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:58 + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r6, %r5; + mov.u32 %r7, %r5; + mov.u32 %r8, %r5; + mov.u32 %r9, %r5; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd3 + 0 ], %rd4; + // end inline asm + .loc 1 40 51 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:40:51 + add.s32 %r34, %r32, 8192; + .loc 1 40 31 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:40:31 + mad.wide.s32 %rd5, %r34, 2, %rd9; + .loc 1 40 67 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:40:67 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r10, %r5; + mov.u32 %r11, %r5; + mov.u32 %r12, %r5; + mov.u32 %r13, %r5; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd5 + 0 ], %rd6; + // end inline asm + .loc 1 45 25 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:45:25 + mad.wide.s32 %rd7, %r23, 2, %rd10; + .loc 1 35 108 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:108 + mov.b32 {%rs1, %rs2}, %r6; + cvt.f32.bf16 %r35, %rs2; + cvt.f32.bf16 %r36, %rs1; + mov.b32 %r37, 0f00000000; +$L__tmp1: + .loc 2 50 30 // standard.py:50:30 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ] + sub.f32 %r38, %r37, %r36; + sub.f32 %r39, %r37, %r35; + .loc 2 50 29 // standard.py:50:29 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ] + mul.f32 %r40, %r38, 0f3FB8AA3B; + ex2.approx.f32 %r41, %r40; + mul.f32 %r42, %r39, 0f3FB8AA3B; + ex2.approx.f32 %r43, %r42; + .loc 2 50 20 // standard.py:50:20 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ] + add.f32 %r44, %r41, 0f3F800000; + add.f32 %r45, %r43, 0f3F800000; + mov.b32 %r46, 0f3F800000; + .loc 2 50 16 // standard.py:50:16 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ] + div.full.f32 %r47, %r46, %r44; + div.full.f32 %r48, %r46, %r45; +$L__tmp2: + .loc 1 38 20 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:38:20 + mul.f32 %r49, %r47, %r36; + mul.f32 %r50, %r48, %r35; + .loc 1 40 117 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:40:117 + mov.b32 {%rs3, %rs4}, %r10; + cvt.f32.bf16 %r51, %rs3; + cvt.f32.bf16 %r52, %rs4; + .loc 1 41 20 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:41:20 + mul.f32 %r53, %r50, %r52; + mul.f32 %r54, %r49, %r51; + .loc 1 45 37 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:45:37 + cvt.rn.bf16.f32 %rs5, %r54; + cvt.rn.bf16.f32 %rs6, %r53; + .loc 1 44 33 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:44:33 + mov.b32 {%rs7, %rs8}, %r1; + selp.b16 %rs9, %rs8, %rs6, %p1; + selp.b16 %rs10, %rs7, %rs5, %p1; + mov.b32 %r14, {%rs10, %rs9}; + .loc 1 35 108 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:108 + mov.b32 {%rs11, %rs12}, %r7; + cvt.f32.bf16 %r55, %rs12; + cvt.f32.bf16 %r56, %rs11; +$L__tmp3: + .loc 2 50 30 // standard.py:50:30 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ] + sub.f32 %r57, %r37, %r56; + sub.f32 %r58, %r37, %r55; + .loc 2 50 29 // standard.py:50:29 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ] + mul.f32 %r59, %r57, 0f3FB8AA3B; + ex2.approx.f32 %r60, %r59; + mul.f32 %r61, %r58, 0f3FB8AA3B; + ex2.approx.f32 %r62, %r61; + .loc 2 50 20 // standard.py:50:20 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ] + add.f32 %r63, %r60, 0f3F800000; + add.f32 %r64, %r62, 0f3F800000; + .loc 2 50 16 // standard.py:50:16 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ] + div.full.f32 %r65, %r46, %r63; + div.full.f32 %r66, %r46, %r64; +$L__tmp4: + .loc 1 38 20 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:38:20 + mul.f32 %r67, %r65, %r56; + mul.f32 %r68, %r66, %r55; + .loc 1 40 117 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:40:117 + mov.b32 {%rs13, %rs14}, %r11; + cvt.f32.bf16 %r69, %rs13; + cvt.f32.bf16 %r70, %rs14; + .loc 1 41 20 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:41:20 + mul.f32 %r71, %r68, %r70; + mul.f32 %r72, %r67, %r69; + .loc 1 45 37 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:45:37 + cvt.rn.bf16.f32 %rs15, %r72; + cvt.rn.bf16.f32 %rs16, %r71; + .loc 1 44 33 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:44:33 + mov.b32 {%rs17, %rs18}, %r2; + selp.b16 %rs19, %rs18, %rs16, %p1; + selp.b16 %rs20, %rs17, %rs15, %p1; + mov.b32 %r15, {%rs20, %rs19}; + .loc 1 35 108 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:108 + mov.b32 {%rs21, %rs22}, %r8; + cvt.f32.bf16 %r73, %rs22; + cvt.f32.bf16 %r74, %rs21; +$L__tmp5: + .loc 2 50 30 // standard.py:50:30 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ] + sub.f32 %r75, %r37, %r74; + sub.f32 %r76, %r37, %r73; + .loc 2 50 29 // standard.py:50:29 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ] + mul.f32 %r77, %r75, 0f3FB8AA3B; + ex2.approx.f32 %r78, %r77; + mul.f32 %r79, %r76, 0f3FB8AA3B; + ex2.approx.f32 %r80, %r79; + .loc 2 50 20 // standard.py:50:20 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ] + add.f32 %r81, %r78, 0f3F800000; + add.f32 %r82, %r80, 0f3F800000; + .loc 2 50 16 // standard.py:50:16 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ] + div.full.f32 %r83, %r46, %r81; + div.full.f32 %r84, %r46, %r82; +$L__tmp6: + .loc 1 38 20 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:38:20 + mul.f32 %r85, %r83, %r74; + mul.f32 %r86, %r84, %r73; + .loc 1 40 117 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:40:117 + mov.b32 {%rs23, %rs24}, %r12; + cvt.f32.bf16 %r87, %rs23; + cvt.f32.bf16 %r88, %rs24; + .loc 1 41 20 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:41:20 + mul.f32 %r89, %r86, %r88; + mul.f32 %r90, %r85, %r87; + .loc 1 45 37 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:45:37 + cvt.rn.bf16.f32 %rs25, %r90; + cvt.rn.bf16.f32 %rs26, %r89; + .loc 1 44 33 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:44:33 + mov.b32 {%rs27, %rs28}, %r3; + selp.b16 %rs29, %rs28, %rs26, %p1; + selp.b16 %rs30, %rs27, %rs25, %p1; + mov.b32 %r16, {%rs30, %rs29}; + .loc 1 35 108 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:108 + mov.b32 {%rs31, %rs32}, %r9; + cvt.f32.bf16 %r91, %rs32; + cvt.f32.bf16 %r92, %rs31; +$L__tmp7: + .loc 2 50 30 // standard.py:50:30 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ] + sub.f32 %r93, %r37, %r92; + sub.f32 %r94, %r37, %r91; + .loc 2 50 29 // standard.py:50:29 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ] + mul.f32 %r95, %r93, 0f3FB8AA3B; + ex2.approx.f32 %r96, %r95; + mul.f32 %r97, %r94, 0f3FB8AA3B; + ex2.approx.f32 %r98, %r97; + .loc 2 50 20 // standard.py:50:20 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ] + add.f32 %r99, %r96, 0f3F800000; + add.f32 %r100, %r98, 0f3F800000; + .loc 2 50 16 // standard.py:50:16 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ] + div.full.f32 %r101, %r46, %r99; + div.full.f32 %r102, %r46, %r100; +$L__tmp8: + .loc 1 38 20 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:38:20 + mul.f32 %r103, %r101, %r92; + mul.f32 %r104, %r102, %r91; + .loc 1 40 117 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:40:117 + mov.b32 {%rs33, %rs34}, %r13; + cvt.f32.bf16 %r105, %rs33; + cvt.f32.bf16 %r106, %rs34; + .loc 1 41 20 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:41:20 + mul.f32 %r107, %r104, %r106; + mul.f32 %r108, %r103, %r105; + .loc 1 45 37 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:45:37 + cvt.rn.bf16.f32 %rs35, %r108; + cvt.rn.bf16.f32 %rs36, %r107; + .loc 1 44 33 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:44:33 + mov.b32 {%rs37, %rs38}, %r4; + selp.b16 %rs39, %rs38, %rs36, %p1; + selp.b16 %rs40, %rs37, %rs35, %p1; + mov.b32 %r17, {%rs40, %rs39}; + .loc 1 45 37 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:45:37 + // begin inline asm + st.global.v4.b32 [ %rd7 + 0 ], { %r14, %r15, %r16, %r17 }; + // end inline asm + .loc 1 45 4 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:45:4 + ret; +$L__tmp9: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 316 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x135 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 51 +.b8 105 +.b8 109 +.b8 121 +.b8 102 +.b8 105 +.b8 98 +.b8 99 +.b8 113 +.b8 51 +.b8 122 +.b8 119 +.b8 114 +.b8 99 +.b8 53 +.b8 103 +.b8 118 +.b8 102 +.b8 115 +.b8 99 +.b8 118 +.b8 112 +.b8 115 +.b8 97 +.b8 120 +.b8 100 +.b8 122 +.b8 106 +.b8 105 +.b8 106 +.b8 121 +.b8 109 +.b8 114 +.b8 110 +.b8 116 +.b8 50 +.b8 108 +.b8 102 +.b8 97 +.b8 104 +.b8 116 +.b8 114 +.b8 106 +.b8 109 +.b8 114 +.b8 98 +.b8 116 +.b8 108 +.b8 109 +.b8 104 +.b8 101 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 51 +.b8 105 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x2d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 111 +.b8 105 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 116 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 115 +.b8 105 +.b8 108 +.b8 117 +.b8 95 +.b8 115 +.b8 112 +.b8 108 +.b8 105 +.b8 116 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x111:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x126:0x18 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 23 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.source b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.source new file mode 100644 index 0000000000000000000000000000000000000000..5e32453a051384c371add5991fb7aba58655fac7 --- /dev/null +++ b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.source @@ -0,0 +1,212 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":18:0) +#loc43 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":49:0) +#loc50 = loc("in_ptr0"(#loc)) +#loc51 = loc("in_ptr1"(#loc)) +#loc52 = loc("out_ptr0"(#loc)) +#loc53 = loc("xnumel"(#loc)) +#loc93 = loc("x"(#loc43)) +module { + tt.func public @triton_poi_fused_cat_mul_silu_split_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 37748736 : i32 loc(#loc54) + %xoffset = tt.get_program_id x : i32 loc(#loc55) + %xoffset_1 = arith.constant 1024 : i32 loc(#loc56) + %xoffset_2 = arith.constant 1024 : i32 loc(#loc56) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc56) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc57) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc58) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc58) + %xmask = arith.constant true loc(#loc59) + %xmask_6 = arith.constant dense : tensor<1024xi1> loc(#loc59) + %x0 = arith.constant 16384 : i32 loc(#loc60) + %x0_7 = arith.constant 16384 : i32 loc(#loc60) + %x0_8 = arith.constant dense<16384> : tensor<1024xi32> loc(#loc60) + %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<1024xi32> loc(#loc60) + %x1 = arith.constant 16384 : i32 loc(#loc61) + %x1_10 = arith.constant 16384 : i32 loc(#loc61) + %x1_11 = arith.constant dense<16384> : tensor<1024xi32> loc(#loc61) + %x1_12 = arith.divsi %xindex_5, %x1_11 : tensor<1024xi32> loc(#loc61) + %tmp1 = arith.constant 0 : i64 loc(#loc62) + %tmp1_13 = arith.constant dense<0> : tensor<1xi64> loc(#loc62) + %tmp2 = arith.extsi %x0_9 : tensor<1024xi32> to tensor<1024xi64> loc(#loc63) + %tmp2_14 = arith.constant dense<0> : tensor<1024xi64> loc(#loc63) + %tmp2_15 = arith.cmpi sge, %tmp2, %tmp2_14 : tensor<1024xi64> loc(#loc63) + %tmp3 = arith.constant 4096 : i64 loc(#loc64) + %tmp3_16 = arith.constant dense<4096> : tensor<1xi64> loc(#loc64) + %tmp4 = arith.extsi %x0_9 : tensor<1024xi32> to tensor<1024xi64> loc(#loc65) + %tmp4_17 = arith.constant dense<4096> : tensor<1024xi64> loc(#loc65) + %tmp4_18 = arith.cmpi slt, %tmp4, %tmp4_17 : tensor<1024xi64> loc(#loc65) + %tmp5 = arith.constant 4096 : i32 loc(#loc66) + %tmp5_19 = arith.constant 4096 : i32 loc(#loc66) + %tmp5_20 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc66) + %tmp5_21 = arith.muli %tmp5_20, %x1_12 : tensor<1024xi32> loc(#loc66) + %tmp5_22 = arith.addi %tmp5_21, %x0_9 : tensor<1024xi32> loc(#loc67) + %tmp5_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc68) + %tmp5_24 = tt.addptr %tmp5_23, %tmp5_22 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc68) + %tmp5_25 = arith.constant 0.000000e+00 : f32 loc(#loc69) + %tmp5_26 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc69) + %tmp5_27 = arith.truncf %tmp5_26 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc69) + %tmp5_28 = tt.load %tmp5_24, %tmp4_18, %tmp5_27 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc69) + %tmp5_29 = arith.extf %tmp5_28 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc70) + %tmp6 = arith.extsi %x0_9 : tensor<1024xi32> to tensor<1024xi64> loc(#loc71) + %tmp6_30 = arith.constant dense<4096> : tensor<1024xi64> loc(#loc71) + %tmp6_31 = arith.cmpi sge, %tmp6, %tmp6_30 : tensor<1024xi64> loc(#loc71) + %tmp7 = arith.constant 16384 : i64 loc(#loc72) + %tmp7_32 = arith.constant dense<16384> : tensor<1xi64> loc(#loc72) + %tmp8 = arith.extsi %x0_9 : tensor<1024xi32> to tensor<1024xi64> loc(#loc73) + %tmp8_33 = arith.constant dense<16384> : tensor<1024xi64> loc(#loc73) + %tmp8_34 = arith.cmpi slt, %tmp8, %tmp8_33 : tensor<1024xi64> loc(#loc73) + %tmp9 = arith.constant 36864 : i32 loc(#loc74) + %tmp9_35 = arith.constant 36864 : i32 loc(#loc74) + %tmp9_36 = arith.constant dense<36864> : tensor<1024xi32> loc(#loc74) + %tmp9_37 = arith.muli %tmp9_36, %x1_12 : tensor<1024xi32> loc(#loc74) + %tmp9_38 = arith.constant -4096 : i32 loc(#loc75) + %tmp9_39 = arith.constant -4096 : i32 loc(#loc75) + %tmp9_40 = arith.constant dense<-4096> : tensor<1024xi32> loc(#loc75) + %tmp9_41 = arith.addi %tmp9_40, %x0_9 : tensor<1024xi32> loc(#loc75) + %tmp9_42 = arith.addi %tmp9_37, %tmp9_41 : tensor<1024xi32> loc(#loc76) + %tmp9_43 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc77) + %tmp9_44 = tt.addptr %tmp9_43, %tmp9_42 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc77) + %tmp9_45 = arith.constant 0.000000e+00 : f32 loc(#loc78) + %tmp9_46 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc78) + %tmp9_47 = arith.truncf %tmp9_46 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc78) + %tmp9_48 = tt.load %tmp9_44, %tmp6_31, %tmp9_47 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc78) + %tmp9_49 = arith.extf %tmp9_48 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc79) + %tmp11 = tt.call @triton.language.standard.sigmoid__fp32S1024S__(%tmp9_49) : (tensor<1024xf32>) -> tensor<1024xf32> loc(#loc80) + %tmp12 = arith.mulf %tmp9_49, %tmp11 : tensor<1024xf32> loc(#loc81) + %tmp14 = arith.constant 36864 : i32 loc(#loc82) + %tmp14_50 = arith.constant 36864 : i32 loc(#loc82) + %tmp14_51 = arith.constant dense<36864> : tensor<1024xi32> loc(#loc82) + %tmp14_52 = arith.muli %tmp14_51, %x1_12 : tensor<1024xi32> loc(#loc82) + %tmp14_53 = arith.constant 12288 : i32 loc(#loc83) + %tmp14_54 = arith.constant 12288 : i32 loc(#loc83) + %tmp14_55 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc83) + %tmp14_56 = arith.addi %tmp14_55, %tmp14_52 : tensor<1024xi32> loc(#loc83) + %tmp14_57 = arith.constant -4096 : i32 loc(#loc84) + %tmp14_58 = arith.constant -4096 : i32 loc(#loc84) + %tmp14_59 = arith.constant dense<-4096> : tensor<1024xi32> loc(#loc84) + %tmp14_60 = arith.addi %tmp14_59, %x0_9 : tensor<1024xi32> loc(#loc84) + %tmp14_61 = arith.addi %tmp14_56, %tmp14_60 : tensor<1024xi32> loc(#loc85) + %tmp14_62 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc86) + %tmp14_63 = tt.addptr %tmp14_62, %tmp14_61 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc86) + %tmp14_64 = arith.constant 0.000000e+00 : f32 loc(#loc87) + %tmp14_65 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc87) + %tmp14_66 = arith.truncf %tmp14_65 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc87) + %tmp14_67 = tt.load %tmp14_63, %tmp6_31, %tmp14_66 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc87) + %tmp14_68 = arith.extf %tmp14_67 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc88) + %tmp15 = arith.mulf %tmp12, %tmp14_68 : tensor<1024xf32> loc(#loc89) + %tmp16 = arith.constant 0.000000e+00 : f32 loc(#loc90) + %tmp16_69 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc90) + %tmp17 = arith.select %tmp6_31, %tmp15, %tmp16_69 : tensor<1024xi1>, tensor<1024xf32> loc(#loc91) + %tmp18 = arith.select %tmp4_18, %tmp5_29, %tmp17 : tensor<1024xi1>, tensor<1024xf32> loc(#loc92) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc40) + %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc40) + %2 = arith.truncf %tmp18 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc41) + tt.store %1, %2 : tensor<1024x!tt.ptr> loc(#loc41) + tt.return loc(#loc42) + } loc(#loc) + tt.func private @triton.language.standard.sigmoid__fp32S1024S__(%x: tensor<1024xf32> loc("x"(#loc43))) -> tensor<1024xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc44) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc44) + %0 = arith.subf %cst_0, %x : tensor<1024xf32> loc(#loc44) + %1 = math.exp %0 : tensor<1024xf32> loc(#loc45) + %c1_i32 = arith.constant 1 : i32 loc(#loc46) + %cst_1 = arith.constant 1.000000e+00 : f32 loc(#loc46) + %cst_2 = arith.constant dense<1.000000e+00> : tensor<1024xf32> loc(#loc46) + %2 = arith.addf %cst_2, %1 : tensor<1024xf32> loc(#loc46) + %c1_i32_3 = arith.constant 1 : i32 loc(#loc47) + %cst_4 = arith.constant 1.000000e+00 : f32 loc(#loc47) + %cst_5 = arith.constant dense<1.000000e+00> : tensor<1024xf32> loc(#loc47) + %3 = arith.divf %cst_5, %2 : tensor<1024xf32> loc(#loc47) + tt.return %3 : tensor<1024xf32> loc(#loc48) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1024xf32> loc(#loc49) + tt.return %4 : tensor<1024xf32> loc(#loc49) + } loc(#loc43) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":22:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":23:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":24:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":27:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":28:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":29:30) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":30:18) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:35) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:41) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:30) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:47) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:97) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":32:19) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":33:31) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":34:18) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:36) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:52) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:42) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:30) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:58) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:108) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":37:23) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":38:20) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:45) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:39) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:61) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:51) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:31) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:67) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:117) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":41:20) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":42:38) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":43:34) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":44:33) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:25) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:37) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:4) +#loc44 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30) +#loc45 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29) +#loc46 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20) +#loc47 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16) +#loc48 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:11) +#loc49 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:4) +#loc54 = loc("xnumel"(#loc1)) +#loc55 = loc("xoffset"(#loc2)) +#loc56 = loc("xoffset"(#loc3)) +#loc57 = loc("xindex"(#loc4)) +#loc58 = loc("xindex"(#loc5)) +#loc59 = loc("xmask"(#loc6)) +#loc60 = loc("x0"(#loc7)) +#loc61 = loc("x1"(#loc8)) +#loc62 = loc("tmp1"(#loc9)) +#loc63 = loc("tmp2"(#loc10)) +#loc64 = loc("tmp3"(#loc11)) +#loc65 = loc("tmp4"(#loc12)) +#loc66 = loc("tmp5"(#loc13)) +#loc67 = loc("tmp5"(#loc14)) +#loc68 = loc("tmp5"(#loc15)) +#loc69 = loc("tmp5"(#loc16)) +#loc70 = loc("tmp5"(#loc17)) +#loc71 = loc("tmp6"(#loc18)) +#loc72 = loc("tmp7"(#loc19)) +#loc73 = loc("tmp8"(#loc20)) +#loc74 = loc("tmp9"(#loc21)) +#loc75 = loc("tmp9"(#loc22)) +#loc76 = loc("tmp9"(#loc23)) +#loc77 = loc("tmp9"(#loc24)) +#loc78 = loc("tmp9"(#loc25)) +#loc79 = loc("tmp9"(#loc26)) +#loc80 = loc("tmp11"(#loc27)) +#loc81 = loc("tmp12"(#loc28)) +#loc82 = loc("tmp14"(#loc29)) +#loc83 = loc("tmp14"(#loc30)) +#loc84 = loc("tmp14"(#loc31)) +#loc85 = loc("tmp14"(#loc32)) +#loc86 = loc("tmp14"(#loc33)) +#loc87 = loc("tmp14"(#loc34)) +#loc88 = loc("tmp14"(#loc35)) +#loc89 = loc("tmp15"(#loc36)) +#loc90 = loc("tmp16"(#loc37)) +#loc91 = loc("tmp17"(#loc38)) +#loc92 = loc("tmp18"(#loc39)) diff --git a/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.ttgir b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..6adc45493dd4f0873add3cce8bf2b6c4fa884f88 --- /dev/null +++ b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.ttgir @@ -0,0 +1,131 @@ +#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":18:0) +#loc38 = loc("in_ptr0"(#loc)) +#loc39 = loc("in_ptr1"(#loc)) +#loc40 = loc("out_ptr0"(#loc)) +#loc41 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_cat_mul_silu_split_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<12288> : tensor<1024xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<-4096> : tensor<1024xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<36864> : tensor<1024xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<4096> : tensor<1024xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<4096> : tensor<1024xi64, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<16384> : tensor<1024xi32, #blocked> loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %cst_5 = arith.constant dense<0.000000e+00> : tensor<1024xbf16, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<1.000000e+00> : tensor<1024xf32, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc42) + %xoffset_8 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc43) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc44) + %xindex_9 = tt.splat %xoffset_8 : i32 -> tensor<1024xi32, #blocked> loc(#loc45) + %xindex_10 = arith.addi %xindex_9, %xindex : tensor<1024xi32, #blocked> loc(#loc45) + %x0 = arith.remsi %xindex_10, %cst_4 : tensor<1024xi32, #blocked> loc(#loc46) + %x1 = arith.divsi %xindex_10, %cst_4 : tensor<1024xi32, #blocked> loc(#loc47) + %tmp4 = arith.extsi %x0 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked> loc(#loc48) + %tmp4_11 = arith.cmpi slt, %tmp4, %cst_3 : tensor<1024xi64, #blocked> loc(#loc48) + %tmp5 = arith.muli %x1, %cst_2 : tensor<1024xi32, #blocked> loc(#loc49) + %tmp5_12 = arith.addi %tmp5, %x0 : tensor<1024xi32, #blocked> loc(#loc50) + %tmp5_13 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc51) + %tmp5_14 = tt.addptr %tmp5_13, %tmp5_12 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc51) + %tmp5_15 = tt.load %tmp5_14, %tmp4_11, %cst_5 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc52) + %tmp5_16 = arith.extf %tmp5_15 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc53) + %tmp6 = arith.cmpi sge, %tmp4, %cst_3 : tensor<1024xi64, #blocked> loc(#loc54) + %tmp9 = arith.muli %x1, %cst_1 : tensor<1024xi32, #blocked> loc(#loc55) + %tmp9_17 = arith.addi %x0, %cst_0 : tensor<1024xi32, #blocked> loc(#loc56) + %tmp9_18 = arith.addi %tmp9, %tmp9_17 : tensor<1024xi32, #blocked> loc(#loc57) + %tmp9_19 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc58) + %tmp9_20 = tt.addptr %tmp9_19, %tmp9_18 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc58) + %tmp9_21 = tt.load %tmp9_20, %tmp6, %cst_5 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc59) + %tmp9_22 = arith.extf %tmp9_21 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc60) + %tmp11 = arith.subf %cst_7, %tmp9_22 : tensor<1024xf32, #blocked> loc(#loc71) + %tmp11_23 = math.exp %tmp11 : tensor<1024xf32, #blocked> loc(#loc72) + %tmp11_24 = arith.addf %tmp11_23, %cst_6 : tensor<1024xf32, #blocked> loc(#loc73) + %tmp11_25 = arith.divf %cst_6, %tmp11_24 : tensor<1024xf32, #blocked> loc(#loc74) + %tmp12 = arith.mulf %tmp9_22, %tmp11_25 : tensor<1024xf32, #blocked> loc(#loc62) + %tmp14 = arith.addi %tmp9, %cst : tensor<1024xi32, #blocked> loc(#loc63) + %tmp14_26 = arith.addi %tmp14, %tmp9_17 : tensor<1024xi32, #blocked> loc(#loc64) + %tmp14_27 = tt.addptr %tmp9_19, %tmp14_26 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc65) + %tmp14_28 = tt.load %tmp14_27, %tmp6, %cst_5 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc66) + %tmp14_29 = arith.extf %tmp14_28 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc67) + %tmp15 = arith.mulf %tmp12, %tmp14_29 : tensor<1024xf32, #blocked> loc(#loc68) + %tmp17 = arith.select %tmp6, %tmp15, %cst_7 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked> loc(#loc69) + %tmp18 = arith.select %tmp4_11, %tmp5_16, %tmp17 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked> loc(#loc70) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc35) + %1 = tt.addptr %0, %xindex_10 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc35) + %2 = arith.truncf %tmp18 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> loc(#loc36) + tt.store %1, %2 : tensor<1024x!tt.ptr, #blocked> loc(#loc36) + tt.return loc(#loc37) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":23:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":24:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":30:18) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:35) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:41) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:30) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:47) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:97) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":32:19) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:36) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:52) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:42) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:30) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:58) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:108) +#loc21 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":37:23) +#loc23 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":38:20) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:39) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:51) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:31) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:67) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:117) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":41:20) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":43:34) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":44:33) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:25) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:37) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:4) +#loc42 = loc("xoffset"(#loc2)) +#loc43 = loc("xoffset"(#loc3)) +#loc44 = loc("xindex"(#loc4)) +#loc45 = loc("xindex"(#loc5)) +#loc46 = loc("x0"(#loc6)) +#loc47 = loc("x1"(#loc7)) +#loc48 = loc("tmp4"(#loc8)) +#loc49 = loc("tmp5"(#loc9)) +#loc50 = loc("tmp5"(#loc10)) +#loc51 = loc("tmp5"(#loc11)) +#loc52 = loc("tmp5"(#loc12)) +#loc53 = loc("tmp5"(#loc13)) +#loc54 = loc("tmp6"(#loc14)) +#loc55 = loc("tmp9"(#loc15)) +#loc56 = loc("tmp9"(#loc16)) +#loc57 = loc("tmp9"(#loc17)) +#loc58 = loc("tmp9"(#loc18)) +#loc59 = loc("tmp9"(#loc19)) +#loc60 = loc("tmp9"(#loc20)) +#loc61 = loc("tmp11"(#loc22)) +#loc62 = loc("tmp12"(#loc26)) +#loc63 = loc("tmp14"(#loc27)) +#loc64 = loc("tmp14"(#loc28)) +#loc65 = loc("tmp14"(#loc29)) +#loc66 = loc("tmp14"(#loc30)) +#loc67 = loc("tmp14"(#loc31)) +#loc68 = loc("tmp15"(#loc32)) +#loc69 = loc("tmp17"(#loc33)) +#loc70 = loc("tmp18"(#loc34)) +#loc71 = loc(callsite(#loc21 at #loc61)) +#loc72 = loc(callsite(#loc23 at #loc61)) +#loc73 = loc(callsite(#loc24 at #loc61)) +#loc74 = loc(callsite(#loc25 at #loc61)) diff --git a/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.ttir b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..c67c3914290bff4ff21152033f06dee0c0668cda --- /dev/null +++ b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.ttir @@ -0,0 +1,131 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":18:0) +#loc38 = loc("in_ptr0"(#loc)) +#loc39 = loc("in_ptr1"(#loc)) +#loc40 = loc("out_ptr0"(#loc)) +#loc41 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_cat_mul_silu_split_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %tmp11 = arith.constant dense<1.000000e+00> : tensor<1024xf32> loc(#loc71) + %cst = arith.constant dense<0.000000e+00> : tensor<1024xbf16> loc(#loc1) + %tmp14 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc43) + %cst_0 = arith.constant dense<-4096> : tensor<1024xi32> loc(#loc1) + %cst_1 = arith.constant dense<36864> : tensor<1024xi32> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc1) + %tmp5 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc44) + %cst_3 = arith.constant dense<4096> : tensor<1024xi64> loc(#loc1) + %cst_4 = arith.constant dense<16384> : tensor<1024xi32> loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc45) + %xoffset_5 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc46) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc47) + %xindex_6 = tt.splat %xoffset_5 : i32 -> tensor<1024xi32> loc(#loc48) + %xindex_7 = arith.addi %xindex_6, %xindex : tensor<1024xi32> loc(#loc48) + %x0 = arith.remsi %xindex_7, %cst_4 : tensor<1024xi32> loc(#loc49) + %x1 = arith.divsi %xindex_7, %cst_4 : tensor<1024xi32> loc(#loc50) + %tmp4 = arith.extsi %x0 : tensor<1024xi32> to tensor<1024xi64> loc(#loc51) + %tmp4_8 = arith.cmpi slt, %tmp4, %cst_3 : tensor<1024xi64> loc(#loc51) + %tmp5_9 = arith.muli %x1, %tmp5 : tensor<1024xi32> loc(#loc44) + %tmp5_10 = arith.addi %tmp5_9, %x0 : tensor<1024xi32> loc(#loc52) + %tmp5_11 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc53) + %tmp5_12 = tt.addptr %tmp5_11, %tmp5_10 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc53) + %tmp5_13 = tt.load %tmp5_12, %tmp4_8, %cst evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc54) + %tmp5_14 = arith.extf %tmp5_13 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc55) + %tmp6 = arith.cmpi sge, %tmp4, %cst_3 : tensor<1024xi64> loc(#loc56) + %tmp9 = arith.muli %x1, %cst_1 : tensor<1024xi32> loc(#loc57) + %tmp9_15 = arith.addi %x0, %cst_0 : tensor<1024xi32> loc(#loc58) + %tmp9_16 = arith.addi %tmp9, %tmp9_15 : tensor<1024xi32> loc(#loc59) + %tmp9_17 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc60) + %tmp9_18 = tt.addptr %tmp9_17, %tmp9_16 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc60) + %tmp9_19 = tt.load %tmp9_18, %tmp6, %cst evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc61) + %tmp9_20 = arith.extf %tmp9_19 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc62) + %tmp11_21 = arith.subf %cst_2, %tmp9_20 : tensor<1024xf32> loc(#loc72) + %tmp11_22 = math.exp %tmp11_21 : tensor<1024xf32> loc(#loc73) + %tmp11_23 = arith.addf %tmp11_22, %tmp11 : tensor<1024xf32> loc(#loc74) + %tmp11_24 = arith.divf %tmp11, %tmp11_23 : tensor<1024xf32> loc(#loc75) + %tmp12 = arith.mulf %tmp9_20, %tmp11_24 : tensor<1024xf32> loc(#loc63) + %tmp14_25 = arith.addi %tmp9, %tmp14 : tensor<1024xi32> loc(#loc43) + %tmp14_26 = arith.addi %tmp14_25, %tmp9_15 : tensor<1024xi32> loc(#loc64) + %tmp14_27 = tt.addptr %tmp9_17, %tmp14_26 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc65) + %tmp14_28 = tt.load %tmp14_27, %tmp6, %cst evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc66) + %tmp14_29 = arith.extf %tmp14_28 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc67) + %tmp15 = arith.mulf %tmp12, %tmp14_29 : tensor<1024xf32> loc(#loc68) + %tmp17 = arith.select %tmp6, %tmp15, %cst_2 : tensor<1024xi1>, tensor<1024xf32> loc(#loc69) + %tmp18 = arith.select %tmp4_8, %tmp5_14, %tmp17 : tensor<1024xi1>, tensor<1024xf32> loc(#loc70) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc35) + %1 = tt.addptr %0, %xindex_7 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc35) + %2 = arith.truncf %tmp18 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc36) + tt.store %1, %2 : tensor<1024x!tt.ptr> loc(#loc36) + tt.return loc(#loc37) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":37:23) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:39) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:35) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":20:28) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":20:33) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":21:36) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":21:23) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":23:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":24:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":30:18) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:41) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:30) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:47) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:97) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":32:19) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:36) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:52) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:42) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:30) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:58) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:108) +#loc23 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20) +#loc26 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":38:20) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:51) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:31) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:67) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:117) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":41:20) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":43:34) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":44:33) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:25) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:37) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:4) +#loc42 = loc("tmp11"(#loc2)) +#loc43 = loc("tmp14"(#loc3)) +#loc44 = loc("tmp5"(#loc4)) +#loc45 = loc("xoffset"(#loc5)) +#loc46 = loc("xoffset"(#loc6)) +#loc47 = loc("xindex"(#loc7)) +#loc48 = loc("xindex"(#loc8)) +#loc49 = loc("x0"(#loc9)) +#loc50 = loc("x1"(#loc10)) +#loc51 = loc("tmp4"(#loc11)) +#loc52 = loc("tmp5"(#loc12)) +#loc53 = loc("tmp5"(#loc13)) +#loc54 = loc("tmp5"(#loc14)) +#loc55 = loc("tmp5"(#loc15)) +#loc56 = loc("tmp6"(#loc16)) +#loc57 = loc("tmp9"(#loc17)) +#loc58 = loc("tmp9"(#loc18)) +#loc59 = loc("tmp9"(#loc19)) +#loc60 = loc("tmp9"(#loc20)) +#loc61 = loc("tmp9"(#loc21)) +#loc62 = loc("tmp9"(#loc22)) +#loc63 = loc("tmp12"(#loc27)) +#loc64 = loc("tmp14"(#loc28)) +#loc65 = loc("tmp14"(#loc29)) +#loc66 = loc("tmp14"(#loc30)) +#loc67 = loc("tmp14"(#loc31)) +#loc68 = loc("tmp15"(#loc32)) +#loc69 = loc("tmp17"(#loc33)) +#loc70 = loc("tmp18"(#loc34)) +#loc71 = loc(callsite(#loc1 at #loc42)) +#loc72 = loc(callsite(#loc23 at #loc42)) +#loc73 = loc(callsite(#loc24 at #loc42)) +#loc74 = loc(callsite(#loc25 at #loc42)) +#loc75 = loc(callsite(#loc26 at #loc42)) diff --git a/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4f196ce56abae6a12888f0787e904f9939d4e900 --- /dev/null +++ b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json"}} \ No newline at end of file diff --git a/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..d112e25531c1d164834cc36957ee3a3323e6e5f9 Binary files /dev/null and b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin differ diff --git a/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0adb75b03d2d4461e712081b76e7a8566d2ff5c4 --- /dev/null +++ b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json @@ -0,0 +1 @@ +{"hash": "062591a4393987cbf2bec755aca3eaaed3b76d84f31e49d4c05bc4c1e6fb1950", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 1024, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0"} \ No newline at end of file diff --git a/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..fcea99d2c79476dd2b0b0964e54b8fb42dff0453 --- /dev/null +++ b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir @@ -0,0 +1,664 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %13 = shl i32 %12, 2, !dbg !9 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %15 = and i32 %14, 96, !dbg !10 + %16 = lshr exact i32 %15, 5, !dbg !10 + %17 = or disjoint i32 %16, %13, !dbg !11 + %18 = shl nuw nsw i32 %14, 1, !dbg !12 + %19 = and i32 %18, 62, !dbg !12 + %20 = sdiv i32 %17, 32, !dbg !13 + %21 = shl i32 %17, 7 + %22 = shl i32 %20, 15 + %23 = add i32 %22, %21 + %24 = add i32 %23, 4096 + %25 = zext nneg i32 %19 to i64, !dbg !14 + %26 = or disjoint i32 %24, %19, !dbg !15 + %27 = sext i32 %26 to i64, !dbg !16 + %28 = getelementptr bfloat, ptr addrspace(1) %2, i64 %27, !dbg !16 + %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17 + %30 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %28, i64 %29, i1 true) #6, !dbg !17 + %31 = bitcast i32 %30 to <2 x bfloat>, !dbg !17 + %32 = extractelement <2 x bfloat> %31, i64 0, !dbg !17 + %33 = extractelement <2 x bfloat> %31, i64 1, !dbg !17 + %34 = fpext bfloat %32 to float, !dbg !18 + %35 = fpext bfloat %33 to float, !dbg !18 + %36 = or disjoint i32 %23, %19, !dbg !19 + %37 = sext i32 %36 to i64, !dbg !20 + %38 = getelementptr bfloat, ptr addrspace(1) %2, i64 %37, !dbg !20 + %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !21 + %40 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %38, i64 %39, i1 true) #6, !dbg !21 + %41 = bitcast i32 %40 to <2 x bfloat>, !dbg !21 + %42 = extractelement <2 x bfloat> %41, i64 0, !dbg !21 + %43 = extractelement <2 x bfloat> %41, i64 1, !dbg !21 + %44 = fpext bfloat %42 to float, !dbg !22 + %45 = fpext bfloat %43 to float, !dbg !22 + %46 = fmul float %34, %34, !dbg !23 + %47 = fmul float %35, %35, !dbg !23 + %48 = fmul float %44, %44, !dbg !24 + %49 = fmul float %45, %45, !dbg !24 + %50 = or disjoint i32 %19, 64, !dbg !25 + %51 = or disjoint i32 %24, %50, !dbg !15 + %52 = sext i32 %51 to i64, !dbg !16 + %53 = getelementptr bfloat, ptr addrspace(1) %2, i64 %52, !dbg !16 + %54 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17 + %55 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %53, i64 %54, i1 true) #6, !dbg !17 + %56 = bitcast i32 %55 to <2 x bfloat>, !dbg !17 + %57 = extractelement <2 x bfloat> %56, i64 0, !dbg !17 + %58 = extractelement <2 x bfloat> %56, i64 1, !dbg !17 + %59 = fpext bfloat %57 to float, !dbg !18 + %60 = fpext bfloat %58 to float, !dbg !18 + %61 = or disjoint i32 %23, %50, !dbg !19 + %62 = sext i32 %61 to i64, !dbg !20 + %63 = getelementptr bfloat, ptr addrspace(1) %2, i64 %62, !dbg !20 + %64 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !21 + %65 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %63, i64 %64, i1 true) #6, !dbg !21 + %66 = bitcast i32 %65 to <2 x bfloat>, !dbg !21 + %67 = extractelement <2 x bfloat> %66, i64 0, !dbg !21 + %68 = extractelement <2 x bfloat> %66, i64 1, !dbg !21 + %69 = fpext bfloat %67 to float, !dbg !22 + %70 = fpext bfloat %68 to float, !dbg !22 + %71 = fmul float %59, %59, !dbg !23 + %72 = fmul float %60, %60, !dbg !23 + %73 = fadd float %46, %71, !dbg !26 + %74 = fadd float %47, %72, !dbg !26 + %75 = fmul float %69, %69, !dbg !24 + %76 = fmul float %70, %70, !dbg !24 + %77 = fadd float %48, %75, !dbg !27 + %78 = fadd float %49, %76, !dbg !27 + %79 = and i32 %14, 3, !dbg !10 + %80 = or disjoint i32 %13, %79, !dbg !11 + %81 = and i32 %14, 124, !dbg !12 + %82 = lshr exact i32 %81, 2, !dbg !12 + %83 = sdiv i32 %80, 32, !dbg !13 + %84 = fadd float %73, %74, !dbg !28 + %85 = bitcast float %84 to i32, !dbg !31 + %86 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %85, i32 16, i32 31), !dbg !31 + %87 = bitcast i32 %86 to float, !dbg !31 + %88 = fadd float %84, %87, !dbg !28 + %89 = bitcast float %88 to i32, !dbg !31 + %90 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %89, i32 8, i32 31), !dbg !31 + %91 = bitcast i32 %90 to float, !dbg !31 + %92 = fadd float %88, %91, !dbg !28 + %93 = bitcast float %92 to i32, !dbg !31 + %94 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %93, i32 4, i32 31), !dbg !31 + %95 = bitcast i32 %94 to float, !dbg !31 + %96 = fadd float %92, %95, !dbg !28 + %97 = bitcast float %96 to i32, !dbg !31 + %98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %97, i32 2, i32 31), !dbg !31 + %99 = bitcast i32 %98 to float, !dbg !31 + %100 = fadd float %96, %99, !dbg !28 + %101 = bitcast float %100 to i32, !dbg !31 + %102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 1, i32 31), !dbg !31 + %103 = bitcast i32 %102 to float, !dbg !31 + %104 = fadd float %100, %103, !dbg !28 + %105 = fadd float %77, %78, !dbg !34 + %106 = bitcast float %105 to i32, !dbg !35 + %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 16, i32 31), !dbg !35 + %108 = bitcast i32 %107 to float, !dbg !35 + %109 = fadd float %105, %108, !dbg !34 + %110 = bitcast float %109 to i32, !dbg !35 + %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 8, i32 31), !dbg !35 + %112 = bitcast i32 %111 to float, !dbg !35 + %113 = fadd float %109, %112, !dbg !34 + %114 = bitcast float %113 to i32, !dbg !35 + %115 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %114, i32 4, i32 31), !dbg !35 + %116 = bitcast i32 %115 to float, !dbg !35 + %117 = fadd float %113, %116, !dbg !34 + %118 = bitcast float %117 to i32, !dbg !35 + %119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %118, i32 2, i32 31), !dbg !35 + %120 = bitcast i32 %119 to float, !dbg !35 + %121 = fadd float %117, %120, !dbg !34 + %122 = bitcast float %121 to i32, !dbg !35 + %123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 1, i32 31), !dbg !35 + %124 = bitcast i32 %123 to float, !dbg !35 + %125 = fadd float %121, %124, !dbg !34 + %126 = shl i32 %20, 7, !dbg !37 + %127 = tail call float @llvm.nvvm.div.full(float %125, float 1.280000e+02), !dbg !38 + %128 = fadd float %127, 0x3EB0C6F7A0000000, !dbg !39 + %129 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40 + %.not.i = icmp eq i32 %129, 0, !dbg !40 + br i1 %.not.i, label %132, label %130, !dbg !40 + +130: ; preds = %11 + %131 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %128), !dbg !40 + br label %__nv_rsqrtf.exit, !dbg !40 + +132: ; preds = %11 + %133 = tail call float @llvm.nvvm.rsqrt.approx.f(float %128), !dbg !40 + br label %__nv_rsqrtf.exit, !dbg !40 + +__nv_rsqrtf.exit: ; preds = %130, %132 + %.0.i = phi float [ %131, %130 ], [ %133, %132 ], !dbg !40 + %134 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40 + %.not.i3 = icmp eq i32 %134, 0, !dbg !40 + br i1 %.not.i3, label %137, label %135, !dbg !40 + +135: ; preds = %__nv_rsqrtf.exit + %136 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %128), !dbg !40 + br label %__nv_rsqrtf.exit5, !dbg !40 + +137: ; preds = %__nv_rsqrtf.exit + %138 = tail call float @llvm.nvvm.rsqrt.approx.f(float %128), !dbg !40 + br label %__nv_rsqrtf.exit5, !dbg !40 + +__nv_rsqrtf.exit5: ; preds = %135, %137 + %.0.i4 = phi float [ %136, %135 ], [ %138, %137 ], !dbg !40 + %139 = lshr exact i32 %15, 3, !dbg !41 + %140 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %139, !dbg !41 + store float %.0.i, ptr addrspace(3) %140, align 4, !dbg !41 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !41 + %141 = shl nuw nsw i32 %79, 2, !dbg !41 + %142 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %141, !dbg !41 + %143 = load float, ptr addrspace(3) %142, align 4, !dbg !41 + %144 = tail call float @llvm.nvvm.div.full(float %104, float 1.280000e+02), !dbg !42 + %145 = fadd float %144, 0x3EB0C6F7A0000000, !dbg !43 + %146 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i6 = icmp eq i32 %146, 0, !dbg !44 + br i1 %.not.i6, label %149, label %147, !dbg !44 + +147: ; preds = %__nv_rsqrtf.exit5 + %148 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %145), !dbg !44 + br label %__nv_rsqrtf.exit8, !dbg !44 + +149: ; preds = %__nv_rsqrtf.exit5 + %150 = tail call float @llvm.nvvm.rsqrt.approx.f(float %145), !dbg !44 + br label %__nv_rsqrtf.exit8, !dbg !44 + +__nv_rsqrtf.exit8: ; preds = %147, %149 + %.0.i7 = phi float [ %148, %147 ], [ %150, %149 ], !dbg !44 + %151 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i9 = icmp eq i32 %151, 0, !dbg !44 + br i1 %.not.i9, label %154, label %152, !dbg !44 + +152: ; preds = %__nv_rsqrtf.exit8 + %153 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %145), !dbg !44 + br label %__nv_rsqrtf.exit11, !dbg !44 + +154: ; preds = %__nv_rsqrtf.exit8 + %155 = tail call float @llvm.nvvm.rsqrt.approx.f(float %145), !dbg !44 + br label %__nv_rsqrtf.exit11, !dbg !44 + +__nv_rsqrtf.exit11: ; preds = %152, %154 + %.0.i10 = phi float [ %153, %152 ], [ %155, %154 ], !dbg !44 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45 + store float %.0.i7, ptr addrspace(3) %140, align 4, !dbg !45 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45 + %156 = load float, ptr addrspace(3) %142, align 4, !dbg !45 + %157 = shl i32 %17, 7, !dbg !46 + %158 = and i32 %82, 1 + %.masked = and i32 %82, 30 + %159 = shl nuw nsw i32 %14, 3 + %160 = and i32 %159, 120 + %161 = shl nuw nsw i32 %15, 2 + %162 = lshr i32 %14, 2 + %163 = and i32 %162, 4 + %164 = or disjoint i32 %160, %161 + %165 = xor i32 %164, %15 + %166 = or disjoint i32 %165, %163 + %167 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %166 + %168 = xor i32 %166, 516 + %169 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %168 + %170 = shl nuw nsw i32 %14, 7 + %171 = and i32 %170, 896 + %172 = shl nuw nsw i32 %79, 5 + %173 = xor i32 %172, %81 + %174 = or disjoint i32 %173, %171 + %175 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %174 + %176 = xor i32 %174, 4 + %177 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %176 + %178 = icmp eq i32 %158, 0 + %179 = shl i32 %80, 7 + %180 = shl i32 %83, 15 + %181 = add i32 %180, %179 + %182 = icmp ne i32 %158, 0 + %183 = add i32 %181, 4097 + %184 = add i32 %181, 4096 + %185 = shl nuw nsw i32 %79, 7 + %186 = and i32 %14, 28 + %187 = lshr i32 %14, 4 + %188 = and i32 %187, 2 + %189 = lshr i32 %14, 1 + %190 = and i32 %189, 32 + %191 = or disjoint i32 %185, %188 + %192 = or disjoint i32 %172, %186 + %193 = xor i32 %192, %190 + %194 = or disjoint i32 %193, %191 + %195 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %194 + %196 = xor i32 %194, 64 + %197 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %196 + %198 = shl nuw nsw i32 %79, 3 + %199 = shl nuw nsw i32 %14, 2 + %200 = and i32 %199, 480 + %201 = and i32 %189, 2 + %202 = or disjoint i32 %198, %200 + %203 = xor i32 %202, %15 + %204 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %201 + %205 = getelementptr inbounds nuw i8, ptr addrspace(3) %204, i32 %203 + %206 = getelementptr inbounds nuw i8, ptr addrspace(3) %205, i32 4 + %207 = zext nneg i32 %.masked to i64, !dbg !47 + %208 = sext i32 %126 to i64, !dbg !47 + %209 = sext i32 %157 to i64, !dbg !47 + br label %210, !dbg !47 + +210: ; preds = %__nv_rsqrtf.exit11, %210 + %211 = phi i1 [ true, %__nv_rsqrtf.exit11 ], [ false, %210 ] + %indvars.iv = phi i64 [ 0, %__nv_rsqrtf.exit11 ], [ 64, %210 ] + %212 = or disjoint i64 %indvars.iv, %25, !dbg !48 + %213 = or disjoint i64 %indvars.iv, %207, !dbg !49 + %214 = or disjoint i64 %213, 32, !dbg !49 + %215 = trunc nuw nsw i64 %212 to i32, !dbg !50 + %216 = or disjoint i32 %23, %215, !dbg !50 + %217 = sext i32 %216 to i64, !dbg !51 + %218 = getelementptr bfloat, ptr addrspace(1) %2, i64 %217, !dbg !51 + %219 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !52 + %220 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %218, i64 %219, i1 true) #6, !dbg !52 + %221 = bitcast i32 %220 to <2 x bfloat>, !dbg !52 + %222 = extractelement <2 x bfloat> %221, i64 0, !dbg !52 + %223 = extractelement <2 x bfloat> %221, i64 1, !dbg !52 + %224 = fpext bfloat %222 to float, !dbg !53 + %225 = fpext bfloat %223 to float, !dbg !53 + %226 = getelementptr bfloat, ptr addrspace(1) %3, i64 %212, !dbg !54 + %227 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !55 + %228 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %226, i64 %227, i1 true) #6, !dbg !55 + %229 = bitcast i32 %228 to <2 x bfloat>, !dbg !55 + %230 = extractelement <2 x bfloat> %229, i64 0, !dbg !55 + %231 = extractelement <2 x bfloat> %229, i64 1, !dbg !55 + %232 = fpext bfloat %230 to float, !dbg !56 + %233 = fpext bfloat %231 to float, !dbg !56 + %234 = or disjoint i64 %212, %208, !dbg !57 + %235 = getelementptr float, ptr addrspace(1) %4, i64 %234, !dbg !58 + %236 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !59 + %237 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %235, i64 %236, i1 true) #6, !dbg !59 + %238 = extractvalue { i32, i32 } %237, 0, !dbg !59 + %239 = extractvalue { i32, i32 } %237, 1, !dbg !59 + %240 = bitcast i32 %238 to float, !dbg !59 + %241 = bitcast i32 %239 to float, !dbg !59 + %242 = getelementptr float, ptr addrspace(1) %5, i64 %234, !dbg !60 + %243 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !61 + %244 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %242, i64 %243, i1 true) #6, !dbg !61 + %245 = extractvalue { i32, i32 } %244, 0, !dbg !61 + %246 = extractvalue { i32, i32 } %244, 1, !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %247 = insertelement <1 x i32> poison, i32 %245, i64 0, !dbg !61 + store <1 x i32> %247, ptr addrspace(3) %167, align 4, !dbg !61 + %248 = insertelement <1 x i32> poison, i32 %246, i64 0, !dbg !61 + store <1 x i32> %248, ptr addrspace(3) %169, align 4, !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %249 = load float, ptr addrspace(3) %175, align 4, !dbg !61 + %250 = load float, ptr addrspace(3) %177, align 4, !dbg !61 + %251 = or disjoint i32 %24, %215, !dbg !62 + %252 = sext i32 %251 to i64, !dbg !63 + %253 = getelementptr bfloat, ptr addrspace(1) %2, i64 %252, !dbg !63 + %254 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !64 + %255 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %253, i64 %254, i1 true) #6, !dbg !64 + %256 = bitcast i32 %255 to <2 x bfloat>, !dbg !64 + %257 = extractelement <2 x bfloat> %256, i64 0, !dbg !64 + %258 = extractelement <2 x bfloat> %256, i64 1, !dbg !64 + %259 = fpext bfloat %257 to float, !dbg !65 + %260 = fpext bfloat %258 to float, !dbg !65 + %261 = getelementptr bfloat, ptr addrspace(1) %6, i64 %212, !dbg !66 + %262 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !67 + %263 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %261, i64 %262, i1 true) #6, !dbg !67 + %264 = bitcast i32 %263 to <2 x bfloat>, !dbg !67 + %265 = extractelement <2 x bfloat> %264, i64 0, !dbg !67 + %266 = extractelement <2 x bfloat> %264, i64 1, !dbg !67 + %267 = fpext bfloat %265 to float, !dbg !68 + %268 = fpext bfloat %266 to float, !dbg !68 + %269 = or disjoint i64 %213, 1, !dbg !69 + %270 = or disjoint i64 %213, 33, !dbg !69 + %271 = trunc nuw nsw i64 %269 to i32, !dbg !70 + %272 = or disjoint i32 %181, %271, !dbg !70 + %273 = trunc nuw nsw i64 %270 to i32, !dbg !70 + %274 = or disjoint i32 %181, %273, !dbg !70 + %275 = sext i32 %272 to i64, !dbg !71 + %276 = getelementptr bfloat, ptr addrspace(1) %2, i64 %275, !dbg !71 + %277 = sext i32 %274 to i64, !dbg !71 + %278 = getelementptr bfloat, ptr addrspace(1) %2, i64 %277, !dbg !71 + %279 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72 + %280 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %276, i64 %279, i1 %178) #6, !dbg !72 + %281 = bitcast i16 %280 to bfloat, !dbg !72 + %282 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72 + %283 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %278, i64 %282, i1 %178) #6, !dbg !72 + %284 = bitcast i16 %283 to bfloat, !dbg !72 + %285 = fpext bfloat %281 to float, !dbg !73 + %286 = fpext bfloat %284 to float, !dbg !73 + %287 = fmul float %143, %285, !dbg !41 + %288 = fmul float %143, %286, !dbg !41 + %289 = getelementptr bfloat, ptr addrspace(1) %3, i64 %269, !dbg !74 + %290 = getelementptr bfloat, ptr addrspace(1) %3, i64 %270, !dbg !74 + %291 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75 + %292 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %289, i64 %291, i1 %178) #6, !dbg !75 + %293 = bitcast i16 %292 to bfloat, !dbg !75 + %294 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75 + %295 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %290, i64 %294, i1 %178) #6, !dbg !75 + %296 = bitcast i16 %295 to bfloat, !dbg !75 + %297 = fpext bfloat %293 to float, !dbg !76 + %298 = fpext bfloat %296 to float, !dbg !76 + %299 = fmul float %287, %297, !dbg !77 + %300 = fmul float %288, %298, !dbg !77 + %301 = fsub float 0.000000e+00, %299, !dbg !78 + %302 = fsub float 0.000000e+00, %300, !dbg !78 + %303 = trunc nuw nsw i64 %213 to i32, !dbg !79 + %304 = or disjoint i32 %181, %303, !dbg !79 + %305 = trunc nuw nsw i64 %214 to i32, !dbg !79 + %306 = or disjoint i32 %181, %305, !dbg !79 + %307 = sext i32 %304 to i64, !dbg !80 + %308 = getelementptr bfloat, ptr addrspace(1) %2, i64 %307, !dbg !80 + %309 = sext i32 %306 to i64, !dbg !80 + %310 = getelementptr bfloat, ptr addrspace(1) %2, i64 %309, !dbg !80 + %311 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81 + %312 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %308, i64 %311, i1 %182) #6, !dbg !81 + %313 = bitcast i16 %312 to bfloat, !dbg !81 + %314 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81 + %315 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %310, i64 %314, i1 %182) #6, !dbg !81 + %316 = bitcast i16 %315 to bfloat, !dbg !81 + %317 = fpext bfloat %313 to float, !dbg !82 + %318 = fpext bfloat %316 to float, !dbg !82 + %319 = fmul float %143, %317, !dbg !83 + %320 = fmul float %143, %318, !dbg !83 + %321 = getelementptr bfloat, ptr addrspace(1) %3, i64 %213, !dbg !84 + %322 = getelementptr bfloat, ptr addrspace(1) %3, i64 %214, !dbg !84 + %323 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85 + %324 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %321, i64 %323, i1 %182) #6, !dbg !85 + %325 = bitcast i16 %324 to bfloat, !dbg !85 + %326 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85 + %327 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %322, i64 %326, i1 %182) #6, !dbg !85 + %328 = bitcast i16 %327 to bfloat, !dbg !85 + %329 = fpext bfloat %325 to float, !dbg !86 + %330 = fpext bfloat %328 to float, !dbg !86 + %331 = fmul float %319, %329, !dbg !87 + %332 = fmul float %320, %330, !dbg !87 + %333 = select i1 %178, float %301, float %331, !dbg !88 + %334 = select i1 %178, float %302, float %332, !dbg !88 + %335 = fmul float %.0.i4, %224, !dbg !89 + %336 = fmul float %.0.i4, %225, !dbg !89 + %337 = fmul float %335, %232, !dbg !90 + %338 = fmul float %336, %233, !dbg !90 + %339 = fmul float %337, %240, !dbg !91 + %340 = fmul float %338, %241, !dbg !91 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !91 + store float %339, ptr addrspace(3) %167, align 4, !dbg !91 + store float %340, ptr addrspace(3) %169, align 4, !dbg !91 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !91 + %341 = load float, ptr addrspace(3) %175, align 4, !dbg !91 + %342 = load float, ptr addrspace(3) %177, align 4, !dbg !91 + %343 = fmul float %249, %333, !dbg !92 + %344 = fmul float %250, %334, !dbg !92 + %345 = fadd float %343, %341, !dbg !93 + %346 = fadd float %344, %342, !dbg !93 + %347 = or disjoint i32 %183, %303, !dbg !94 + %348 = or disjoint i32 %183, %305, !dbg !94 + %349 = sext i32 %347 to i64, !dbg !95 + %350 = getelementptr bfloat, ptr addrspace(1) %2, i64 %349, !dbg !95 + %351 = sext i32 %348 to i64, !dbg !95 + %352 = getelementptr bfloat, ptr addrspace(1) %2, i64 %351, !dbg !95 + %353 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96 + %354 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %350, i64 %353, i1 %178) #6, !dbg !96 + %355 = bitcast i16 %354 to bfloat, !dbg !96 + %356 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96 + %357 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %352, i64 %356, i1 %178) #6, !dbg !96 + %358 = bitcast i16 %357 to bfloat, !dbg !96 + %359 = fpext bfloat %355 to float, !dbg !97 + %360 = fpext bfloat %358 to float, !dbg !97 + %361 = fmul float %156, %359, !dbg !45 + %362 = fmul float %156, %360, !dbg !45 + %363 = getelementptr bfloat, ptr addrspace(1) %6, i64 %269, !dbg !98 + %364 = getelementptr bfloat, ptr addrspace(1) %6, i64 %270, !dbg !98 + %365 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99 + %366 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %363, i64 %365, i1 %178) #6, !dbg !99 + %367 = bitcast i16 %366 to bfloat, !dbg !99 + %368 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99 + %369 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %364, i64 %368, i1 %178) #6, !dbg !99 + %370 = bitcast i16 %369 to bfloat, !dbg !99 + %371 = fpext bfloat %367 to float, !dbg !100 + %372 = fpext bfloat %370 to float, !dbg !100 + %373 = fmul float %361, %371, !dbg !101 + %374 = fmul float %362, %372, !dbg !101 + %375 = fsub float 0.000000e+00, %373, !dbg !102 + %376 = fsub float 0.000000e+00, %374, !dbg !102 + %377 = or disjoint i32 %184, %303, !dbg !103 + %378 = or disjoint i32 %184, %305, !dbg !103 + %379 = sext i32 %377 to i64, !dbg !104 + %380 = getelementptr bfloat, ptr addrspace(1) %2, i64 %379, !dbg !104 + %381 = sext i32 %378 to i64, !dbg !104 + %382 = getelementptr bfloat, ptr addrspace(1) %2, i64 %381, !dbg !104 + %383 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105 + %384 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %380, i64 %383, i1 %182) #6, !dbg !105 + %385 = bitcast i16 %384 to bfloat, !dbg !105 + %386 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105 + %387 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %382, i64 %386, i1 %182) #6, !dbg !105 + %388 = bitcast i16 %387 to bfloat, !dbg !105 + %389 = fpext bfloat %385 to float, !dbg !106 + %390 = fpext bfloat %388 to float, !dbg !106 + %391 = fmul float %156, %389, !dbg !107 + %392 = fmul float %156, %390, !dbg !107 + %393 = getelementptr bfloat, ptr addrspace(1) %6, i64 %213, !dbg !108 + %394 = getelementptr bfloat, ptr addrspace(1) %6, i64 %214, !dbg !108 + %395 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109 + %396 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %393, i64 %395, i1 %182) #6, !dbg !109 + %397 = bitcast i16 %396 to bfloat, !dbg !109 + %398 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109 + %399 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %394, i64 %398, i1 %182) #6, !dbg !109 + %400 = bitcast i16 %399 to bfloat, !dbg !109 + %401 = fpext bfloat %397 to float, !dbg !110 + %402 = fpext bfloat %400 to float, !dbg !110 + %403 = fmul float %391, %401, !dbg !111 + %404 = fmul float %392, %402, !dbg !111 + %405 = select i1 %178, float %375, float %403, !dbg !88 + %406 = select i1 %178, float %376, float %404, !dbg !88 + %407 = fmul float %.0.i10, %259, !dbg !112 + %408 = fmul float %.0.i10, %260, !dbg !112 + %409 = fmul float %407, %267, !dbg !113 + %410 = fmul float %408, %268, !dbg !113 + %411 = fmul float %409, %240, !dbg !114 + %412 = fmul float %410, %241, !dbg !114 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !114 + store float %411, ptr addrspace(3) %167, align 4, !dbg !114 + store float %412, ptr addrspace(3) %169, align 4, !dbg !114 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !114 + %413 = load float, ptr addrspace(3) %175, align 4, !dbg !114 + %414 = load float, ptr addrspace(3) %177, align 4, !dbg !114 + %415 = fmul float %249, %405, !dbg !115 + %416 = fmul float %250, %406, !dbg !115 + %417 = fadd float %415, %413, !dbg !116 + %418 = fadd float %416, %414, !dbg !116 + %419 = or disjoint i64 %212, %209, !dbg !117 + %420 = getelementptr bfloat, ptr addrspace(1) %0, i64 %419, !dbg !118 + %421 = fptrunc float %345 to bfloat, !dbg !119 + %422 = fptrunc float %346 to bfloat, !dbg !119 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119 + store bfloat %421, ptr addrspace(3) %195, align 2, !dbg !119 + store bfloat %422, ptr addrspace(3) %197, align 2, !dbg !119 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119 + %423 = load bfloat, ptr addrspace(3) %205, align 2, !dbg !119 + %424 = load bfloat, ptr addrspace(3) %206, align 2, !dbg !119 + %425 = insertelement <2 x bfloat> poison, bfloat %423, i64 0, !dbg !119 + %426 = insertelement <2 x bfloat> %425, bfloat %424, i64 1, !dbg !119 + %427 = bitcast <2 x bfloat> %426 to i32, !dbg !119 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %427, ptr addrspace(1) %420, i1 true) #6, !dbg !119 + %428 = getelementptr bfloat, ptr addrspace(1) %1, i64 %419, !dbg !120 + %429 = fptrunc float %417 to bfloat, !dbg !121 + %430 = fptrunc float %418 to bfloat, !dbg !121 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !121 + store bfloat %429, ptr addrspace(3) %195, align 2, !dbg !121 + store bfloat %430, ptr addrspace(3) %197, align 2, !dbg !121 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !121 + %431 = load bfloat, ptr addrspace(3) %205, align 2, !dbg !121 + %432 = load bfloat, ptr addrspace(3) %206, align 2, !dbg !121 + %433 = insertelement <2 x bfloat> poison, bfloat %431, i64 0, !dbg !121 + %434 = insertelement <2 x bfloat> %433, bfloat %432, i64 1, !dbg !121 + %435 = bitcast <2 x bfloat> %434 to i32, !dbg !121 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %435, ptr addrspace(1) %428, i1 true) #6, !dbg !121 + br i1 %211, label %210, label %436, !dbg !47 + +436: ; preds = %210 + ret void, !dbg !122 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", linkageName: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 23, column: 33, scope: !5) +!10 = !DILocation(line: 24, column: 44, scope: !5) +!11 = !DILocation(line: 24, column: 23, scope: !5) +!12 = !DILocation(line: 26, column: 37, scope: !5) +!13 = !DILocation(line: 29, column: 19, scope: !5) +!14 = !DILocation(line: 33, column: 43, scope: !5) +!15 = !DILocation(line: 39, column: 57, scope: !5) +!16 = !DILocation(line: 39, column: 34, scope: !5) +!17 = !DILocation(line: 39, column: 68, scope: !5) +!18 = !DILocation(line: 39, column: 121, scope: !5) +!19 = !DILocation(line: 40, column: 50, scope: !5) +!20 = !DILocation(line: 40, column: 34, scope: !5) +!21 = !DILocation(line: 40, column: 61, scope: !5) +!22 = !DILocation(line: 40, column: 114, scope: !5) +!23 = !DILocation(line: 42, column: 22, scope: !5) +!24 = !DILocation(line: 47, column: 22, scope: !5) +!25 = !DILocation(line: 34, column: 31, scope: !5) +!26 = !DILocation(line: 44, column: 23, scope: !5) +!27 = !DILocation(line: 49, column: 25, scope: !5) +!28 = !DILocation(line: 263, column: 15, scope: !29, inlinedAt: !31) +!29 = distinct !DILexicalBlockFile(scope: !5, file: !30, discriminator: 0) +!30 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!31 = !DILocation(line: 293, column: 36, scope: !29, inlinedAt: !32) +!32 = !DILocation(line: 51, column: 25, scope: !33) +!33 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!34 = !DILocation(line: 263, column: 15, scope: !29, inlinedAt: !35) +!35 = !DILocation(line: 293, column: 36, scope: !29, inlinedAt: !36) +!36 = !DILocation(line: 52, column: 27, scope: !33) +!37 = !DILocation(line: 63, column: 46, scope: !5) +!38 = !DILocation(line: 75, column: 25, scope: !5) +!39 = !DILocation(line: 77, column: 24, scope: !5) +!40 = !DILocation(line: 78, column: 32, scope: !5) +!41 = !DILocation(line: 79, column: 24, scope: !5) +!42 = !DILocation(line: 123, column: 24, scope: !5) +!43 = !DILocation(line: 124, column: 24, scope: !5) +!44 = !DILocation(line: 125, column: 32, scope: !5) +!45 = !DILocation(line: 126, column: 24, scope: !5) +!46 = !DILocation(line: 161, column: 43, scope: !5) +!47 = !DILocation(line: 53, column: 43, scope: !5) +!48 = !DILocation(line: 54, column: 31, scope: !5) +!49 = !DILocation(line: 72, column: 41, scope: !5) +!50 = !DILocation(line: 61, column: 51, scope: !5) +!51 = !DILocation(line: 61, column: 35, scope: !5) +!52 = !DILocation(line: 61, column: 62, scope: !5) +!53 = !DILocation(line: 61, column: 115, scope: !5) +!54 = !DILocation(line: 62, column: 35, scope: !5) +!55 = !DILocation(line: 62, column: 42, scope: !5) +!56 = !DILocation(line: 62, column: 95, scope: !5) +!57 = !DILocation(line: 63, column: 42, scope: !5) +!58 = !DILocation(line: 63, column: 35, scope: !5) +!59 = !DILocation(line: 63, column: 51, scope: !5) +!60 = !DILocation(line: 64, column: 35, scope: !5) +!61 = !DILocation(line: 64, column: 51, scope: !5) +!62 = !DILocation(line: 65, column: 58, scope: !5) +!63 = !DILocation(line: 65, column: 35, scope: !5) +!64 = !DILocation(line: 65, column: 69, scope: !5) +!65 = !DILocation(line: 65, column: 123, scope: !5) +!66 = !DILocation(line: 66, column: 36, scope: !5) +!67 = !DILocation(line: 66, column: 43, scope: !5) +!68 = !DILocation(line: 66, column: 96, scope: !5) +!69 = !DILocation(line: 72, column: 39, scope: !5) +!70 = !DILocation(line: 72, column: 57, scope: !5) +!71 = !DILocation(line: 72, column: 35, scope: !5) +!72 = !DILocation(line: 72, column: 68, scope: !5) +!73 = !DILocation(line: 72, column: 129, scope: !5) +!74 = !DILocation(line: 80, column: 35, scope: !5) +!75 = !DILocation(line: 80, column: 85, scope: !5) +!76 = !DILocation(line: 80, column: 146, scope: !5) +!77 = !DILocation(line: 82, column: 24, scope: !5) +!78 = !DILocation(line: 84, column: 17, scope: !5) +!79 = !DILocation(line: 90, column: 53, scope: !5) +!80 = !DILocation(line: 90, column: 35, scope: !5) +!81 = !DILocation(line: 90, column: 64, scope: !5) +!82 = !DILocation(line: 90, column: 125, scope: !5) +!83 = !DILocation(line: 97, column: 24, scope: !5) +!84 = !DILocation(line: 98, column: 35, scope: !5) +!85 = !DILocation(line: 98, column: 81, scope: !5) +!86 = !DILocation(line: 98, column: 142, scope: !5) +!87 = !DILocation(line: 100, column: 24, scope: !5) +!88 = !DILocation(line: 0, scope: !5) +!89 = !DILocation(line: 111, column: 24, scope: !5) +!90 = !DILocation(line: 113, column: 24, scope: !5) +!91 = !DILocation(line: 116, column: 24, scope: !5) +!92 = !DILocation(line: 118, column: 24, scope: !5) +!93 = !DILocation(line: 119, column: 24, scope: !5) +!94 = !DILocation(line: 121, column: 60, scope: !5) +!95 = !DILocation(line: 121, column: 35, scope: !5) +!96 = !DILocation(line: 121, column: 71, scope: !5) +!97 = !DILocation(line: 121, column: 132, scope: !5) +!98 = !DILocation(line: 127, column: 35, scope: !5) +!99 = !DILocation(line: 127, column: 85, scope: !5) +!100 = !DILocation(line: 127, column: 146, scope: !5) +!101 = !DILocation(line: 129, column: 24, scope: !5) +!102 = !DILocation(line: 131, column: 17, scope: !5) +!103 = !DILocation(line: 134, column: 60, scope: !5) +!104 = !DILocation(line: 134, column: 35, scope: !5) +!105 = !DILocation(line: 134, column: 71, scope: !5) +!106 = !DILocation(line: 134, column: 132, scope: !5) +!107 = !DILocation(line: 139, column: 24, scope: !5) +!108 = !DILocation(line: 140, column: 35, scope: !5) +!109 = !DILocation(line: 140, column: 81, scope: !5) +!110 = !DILocation(line: 140, column: 142, scope: !5) +!111 = !DILocation(line: 142, column: 24, scope: !5) +!112 = !DILocation(line: 151, column: 25, scope: !5) +!113 = !DILocation(line: 153, column: 26, scope: !5) +!114 = !DILocation(line: 156, column: 26, scope: !5) +!115 = !DILocation(line: 158, column: 26, scope: !5) +!116 = !DILocation(line: 159, column: 26, scope: !5) +!117 = !DILocation(line: 161, column: 39, scope: !5) +!118 = !DILocation(line: 161, column: 32, scope: !5) +!119 = !DILocation(line: 161, column: 55, scope: !5) +!120 = !DILocation(line: 162, column: 32, scope: !5) +!121 = !DILocation(line: 162, column: 56, scope: !5) +!122 = !DILocation(line: 53, column: 4, scope: !5) diff --git a/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..8ba494e4a77bc50970fb7c8a5d907621a088bb1e --- /dev/null +++ b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx @@ -0,0 +1,1188 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 // -- Begin function triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 +.visible .entry triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0( + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6, + .param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_7, + .param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_8, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_9, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_10 +) +.reqntid 128 +{ + .reg .pred %p<6>; + .reg .b16 %rs<42>; + .reg .b32 %r<217>; + .reg .b64 %rd<96>; + .loc 1 18 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0 + +// %bb.0: // %__nv_rsqrtf.exit + ld.param.b64 %rd11, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6]; + ld.param.b64 %rd10, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5]; + ld.param.b64 %rd9, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4]; + ld.param.b64 %rd8, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3]; + ld.param.b64 %rd7, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2]; + ld.param.b64 %rd6, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1]; + ld.param.b64 %rd5, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0]; +$L__tmp0: + .loc 1 23 28 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:28 + mov.u32 %r23, %ctaid.x; + .loc 1 23 33 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:33 + shl.b32 %r24, %r23, 2; + .loc 1 24 44 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44 + mov.u32 %r25, %tid.x; + and.b32 %r26, %r25, 96; + bfe.u32 %r27, %r25, 5, 2; + .loc 1 24 23 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23 + or.b32 %r28, %r27, %r24; + .loc 1 26 37 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37 + shl.b32 %r29, %r25, 1; + and.b32 %r30, %r29, 62; + .loc 1 29 19 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19 + bfe.s32 %r31, %r23, 29, 1; + shr.u32 %r32, %r31, 27; + add.s32 %r33, %r28, %r32; + shr.s32 %r34, %r33, 5; + shl.b32 %r35, %r28, 7; + shl.b32 %r36, %r34, 15; + add.s32 %r1, %r36, %r35; + add.s32 %r2, %r1, 4096; + .loc 1 33 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:33:43 + cvt.u64.u32 %rd1, %r30; + .loc 1 39 57 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:57 + or.b32 %r37, %r2, %r30; + .loc 1 39 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34 + mad.wide.s32 %rd12, %r37, 2, %rd7; + .loc 1 39 68 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68 + // begin inline asm + mov.u64 %rd13, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd13, 1.0; + // end inline asm + mov.b32 %r19, 0; + mov.pred %p2, -1; + // begin inline asm + mov.u32 %r18, %r19; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r18 }, [ %rd12 + 0 ], %rd13; + // end inline asm + mov.b32 {%rs1, %rs2}, %r18; + .loc 1 39 121 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:121 + cvt.f32.bf16 %r38, %rs1; + cvt.f32.bf16 %r39, %rs2; + .loc 1 40 50 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:50 + or.b32 %r40, %r1, %r30; + .loc 1 40 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34 + mad.wide.s32 %rd14, %r40, 2, %rd7; + .loc 1 40 61 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61 + // begin inline asm + mov.u64 %rd15, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd15, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r20, %r19; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r20 }, [ %rd14 + 0 ], %rd15; + // end inline asm + mov.b32 {%rs3, %rs4}, %r20; + .loc 1 40 114 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114 + cvt.f32.bf16 %r41, %rs3; + cvt.f32.bf16 %r42, %rs4; + .loc 1 39 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34 + cvt.s64.s32 %rd20, %r2; + or.b64 %rd21, %rd20, %rd1; + shl.b64 %rd22, %rd21, 1; + add.s64 %rd23, %rd7, %rd22; + add.s64 %rd16, %rd23, 128; + .loc 1 39 68 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68 + // begin inline asm + mov.u64 %rd17, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r21, %r19; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r21 }, [ %rd16 + 0 ], %rd17; + // end inline asm + mov.b32 {%rs5, %rs6}, %r21; + .loc 1 39 121 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:121 + cvt.f32.bf16 %r43, %rs5; + cvt.f32.bf16 %r44, %rs6; + .loc 1 40 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34 + cvt.s64.s32 %rd24, %r1; + or.b64 %rd25, %rd24, %rd1; + shl.b64 %rd26, %rd25, 1; + add.s64 %rd27, %rd7, %rd26; + add.s64 %rd18, %rd27, 128; + .loc 1 40 61 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61 + // begin inline asm + mov.u64 %rd19, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd19, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r22, %r19; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r22 }, [ %rd18 + 0 ], %rd19; + // end inline asm + mov.b32 {%rs7, %rs8}, %r22; + .loc 1 40 114 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114 + cvt.f32.bf16 %r45, %rs7; + cvt.f32.bf16 %r46, %rs8; + .loc 1 42 22 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:42:22 + mul.f32 %r47, %r43, %r43; + mul.f32 %r48, %r44, %r44; + .loc 1 44 23 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:44:23 + fma.rn.f32 %r49, %r38, %r38, %r47; + fma.rn.f32 %r50, %r39, %r39, %r48; + .loc 1 47 22 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:47:22 + mul.f32 %r51, %r45, %r45; + mul.f32 %r52, %r46, %r46; + .loc 1 49 25 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:49:25 + fma.rn.f32 %r53, %r41, %r41, %r51; + fma.rn.f32 %r54, %r42, %r42, %r52; + .loc 1 24 44 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44 + and.b32 %r55, %r25, 3; + .loc 1 24 23 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23 + or.b32 %r56, %r24, %r55; + .loc 1 26 37 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37 + and.b32 %r57, %r25, 124; + bfe.u32 %r58, %r25, 2, 5; + .loc 1 29 19 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19 + add.s32 %r59, %r56, %r32; +$L__tmp1: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r60, %r49, %r50; +$L__tmp2: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r61, %r60, 16, 31, -1; +$L__tmp3: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r62, %r60, %r61; +$L__tmp4: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r63, %r62, 8, 31, -1; +$L__tmp5: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r64, %r62, %r63; +$L__tmp6: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r65, %r64, 4, 31, -1; +$L__tmp7: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r66, %r64, %r65; +$L__tmp8: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r67, %r66, 2, 31, -1; +$L__tmp9: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r68, %r66, %r67; +$L__tmp10: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r69, %r68, 1, 31, -1; +$L__tmp11: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r70, %r68, %r69; +$L__tmp12: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r71, %r53, %r54; +$L__tmp13: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r72, %r71, 16, 31, -1; +$L__tmp14: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r73, %r71, %r72; +$L__tmp15: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r74, %r73, 8, 31, -1; +$L__tmp16: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r75, %r73, %r74; +$L__tmp17: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r76, %r75, 4, 31, -1; +$L__tmp18: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r77, %r75, %r76; +$L__tmp19: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r78, %r77, 2, 31, -1; +$L__tmp20: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r79, %r77, %r78; +$L__tmp21: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r80, %r79, 1, 31, -1; +$L__tmp22: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r81, %r79, %r80; +$L__tmp23: + .loc 1 63 46 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:46 + shl.b32 %r82, %r34, 7; + mov.b32 %r83, 0f43000000; + .loc 1 75 25 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:75:25 + div.full.f32 %r84, %r81, %r83; + .loc 1 77 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:77:24 + add.f32 %r85, %r84, 0f358637BD; + .loc 1 78 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:78:32 + rsqrt.approx.ftz.f32 %r3, %r85; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + shr.u32 %r86, %r26, 3; + mov.b32 %r87, global_smem; + add.s32 %r88, %r87, %r86; + st.shared.b32 [%r88], %r3; + bar.sync 0; + shl.b32 %r89, %r55, 2; + add.s32 %r90, %r87, %r89; + ld.shared.b32 %r4, [%r90]; + .loc 1 123 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:123:24 + div.full.f32 %r91, %r70, %r83; + .loc 1 124 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:124:24 + add.f32 %r92, %r91, 0f358637BD; + .loc 1 125 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:125:32 + rsqrt.approx.ftz.f32 %r5, %r92; + .loc 1 126 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24 + bar.sync 0; + st.shared.b32 [%r88], %r5; + bar.sync 0; + ld.shared.b32 %r6, [%r90]; + bfe.u32 %r7, %r57, 2, 1; + and.b32 %r93, %r58, 30; + shl.b32 %r94, %r25, 3; + and.b32 %r95, %r94, 120; + shl.b32 %r96, %r26, 2; + shr.u32 %r97, %r25, 2; + and.b32 %r98, %r97, 4; + or.b32 %r99, %r95, %r96; + xor.b32 %r100, %r99, %r26; + or.b32 %r101, %r100, %r98; + add.s32 %r8, %r87, %r101; + xor.b32 %r102, %r101, 4; + add.s32 %r9, %r87, %r102; + shl.b32 %r103, %r25, 7; + and.b32 %r104, %r103, 896; + shl.b32 %r105, %r55, 5; + xor.b32 %r106, %r105, %r57; + or.b32 %r107, %r106, %r104; + add.s32 %r10, %r87, %r107; + xor.b32 %r108, %r107, 4; + add.s32 %r11, %r87, %r108; + shl.b32 %r109, %r56, 7; + shl.b32 %r110, %r59, 10; + and.b32 %r111, %r110, -32768; + add.s32 %r12, %r111, %r109; + add.s32 %r13, %r12, 4097; + add.s32 %r14, %r12, 4096; + shl.b32 %r112, %r55, 7; + and.b32 %r113, %r25, 28; + shr.u32 %r114, %r25, 4; + and.b32 %r115, %r114, 2; + shr.u32 %r116, %r25, 1; + and.b32 %r117, %r116, 32; + or.b32 %r118, %r112, %r115; + or.b32 %r119, %r105, %r113; + xor.b32 %r120, %r119, %r117; + or.b32 %r121, %r120, %r118; + add.s32 %r15, %r87, %r121; + xor.b32 %r122, %r121, 64; + add.s32 %r16, %r87, %r122; + shl.b32 %r123, %r55, 3; + shl.b32 %r124, %r25, 2; + and.b32 %r125, %r124, 480; + and.b32 %r126, %r116, 2; + or.b32 %r127, %r123, %r125; + xor.b32 %r128, %r127, %r26; + add.s32 %r129, %r87, %r126; + add.s32 %r17, %r129, %r128; + .loc 1 53 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43 + cvt.u64.u32 %rd2, %r93; + cvt.s64.s32 %rd3, %r82; + cvt.s64.s32 %rd4, %r35; + mov.b64 %rd95, 0; + mov.pred %p5, %p2; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 0 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0:43 + mov.pred %p1, %p5; + setp.ne.b32 %p4, %r7, 0; + setp.eq.b32 %p3, %r7, 0; + .loc 1 54 31 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:54:31 + or.b64 %rd74, %rd95, %rd1; + .loc 1 72 41 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:41 + or.b64 %rd75, %rd95, %rd2; + .loc 1 61 51 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:51 + cvt.u32.u64 %r140, %rd74; + or.b32 %r141, %r1, %r140; + .loc 1 61 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:35 + mad.wide.s32 %rd29, %r141, 2, %rd7; + .loc 1 61 62 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:62 + // begin inline asm + mov.u64 %rd28, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd28, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r130, %r19; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r130 }, [ %rd29 + 0 ], %rd28; + // end inline asm + mov.b32 {%rs26, %rs27}, %r130; + .loc 1 61 115 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:115 + cvt.f32.bf16 %r142, %rs26; + cvt.f32.bf16 %r143, %rs27; + .loc 1 62 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:35 + shl.b64 %rd76, %rd74, 1; + add.s64 %rd31, %rd8, %rd76; + .loc 1 62 42 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:42 + // begin inline asm + mov.u64 %rd30, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd30, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r131, %r19; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r131 }, [ %rd31 + 0 ], %rd30; + // end inline asm + mov.b32 {%rs28, %rs29}, %r131; + .loc 1 62 95 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:95 + cvt.f32.bf16 %r144, %rs28; + cvt.f32.bf16 %r145, %rs29; + .loc 1 63 42 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:42 + or.b64 %rd77, %rd74, %rd3; + .loc 1 63 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:35 + shl.b64 %rd78, %rd77, 2; + add.s64 %rd33, %rd9, %rd78; + .loc 1 63 51 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:51 + // begin inline asm + mov.u64 %rd32, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd32, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r132, %r19; + mov.u32 %r133, %r19; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r132, %r133 }, [ %rd33 + 0 ], %rd32; + // end inline asm + .loc 1 64 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:35 + add.s64 %rd35, %rd10, %rd78; + .loc 1 64 51 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:51 + // begin inline asm + mov.u64 %rd34, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd34, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r134, %r19; + mov.u32 %r135, %r19; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r134, %r135 }, [ %rd35 + 0 ], %rd34; + // end inline asm + bar.sync 0; + st.shared.b32 [%r8], %r134; + st.shared.b32 [%r9+512], %r135; + bar.sync 0; + ld.shared.b32 %r146, [%r10]; + ld.shared.b32 %r147, [%r11]; + .loc 1 65 58 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:58 + or.b32 %r148, %r2, %r140; + .loc 1 65 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:35 + mad.wide.s32 %rd37, %r148, 2, %rd7; + .loc 1 65 69 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69 + // begin inline asm + mov.u64 %rd36, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd36, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r136, %r19; + @%p2 ld.global.L1::evict_first.L2::cache_hint.b32 { %r136 }, [ %rd37 + 0 ], %rd36; + // end inline asm + mov.b32 {%rs30, %rs31}, %r136; + .loc 1 65 123 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:123 + cvt.f32.bf16 %r149, %rs30; + cvt.f32.bf16 %r150, %rs31; + .loc 1 66 36 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:36 + add.s64 %rd39, %rd11, %rd76; + .loc 1 66 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:43 + // begin inline asm + mov.u64 %rd38, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd38, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r137, %r19; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r137 }, [ %rd39 + 0 ], %rd38; + // end inline asm + mov.b32 {%rs32, %rs33}, %r137; + .loc 1 66 96 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:96 + cvt.f32.bf16 %r151, %rs32; + cvt.f32.bf16 %r152, %rs33; + .loc 1 72 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35 + cvt.s64.s32 %rd79, %r12; + .loc 1 72 57 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:57 + cvt.u32.u64 %r153, %rd75; + .loc 1 72 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35 + cvt.s64.s32 %rd80, %rd75; + add.s64 %rd81, %rd79, %rd80; + shl.b64 %rd82, %rd81, 1; + add.s64 %rd83, %rd7, %rd82; + add.s64 %rd41, %rd83, 2; + add.s64 %rd43, %rd83, 66; + .loc 1 72 68 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:68 + // begin inline asm + mov.u64 %rd40, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd40, 1.0; + // end inline asm + mov.b16 %rs10, 0; + // begin inline asm + mov.u16 %rs9, %rs10; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd41 + 0 ], %rd40; + // end inline asm + // begin inline asm + mov.u64 %rd42, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd42, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs11, %rs10; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd43 + 0 ], %rd42; + // end inline asm + .loc 1 72 129 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129 + cvt.f32.bf16 %r154, %rs9; + cvt.f32.bf16 %r155, %rs11; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + mul.f32 %r156, %r4, %r154; + mul.f32 %r157, %r4, %r155; + .loc 1 80 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:35 + shl.b64 %rd84, %rd75, 1; + add.s64 %rd53, %rd8, %rd84; + add.s64 %rd45, %rd53, 2; + add.s64 %rd47, %rd53, 66; + .loc 1 80 85 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:85 + // begin inline asm + mov.u64 %rd44, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd44, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs12, %rs10; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd45 + 0 ], %rd44; + // end inline asm + // begin inline asm + mov.u64 %rd46, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd46, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs13, %rs10; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd47 + 0 ], %rd46; + // end inline asm + .loc 1 80 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146 + cvt.f32.bf16 %r158, %rs12; + cvt.f32.bf16 %r159, %rs13; + .loc 1 84 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17 + neg.f32 %r160, %r156; + fma.rn.f32 %r161, %r160, %r158, 0f00000000; + neg.f32 %r162, %r157; + fma.rn.f32 %r163, %r162, %r159, 0f00000000; + .loc 1 90 53 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:53 + or.b32 %r164, %r12, %r153; + .loc 1 90 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:35 + mad.wide.s32 %rd49, %r164, 2, %rd7; + add.s64 %rd51, %rd83, 64; + .loc 1 90 64 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:64 + // begin inline asm + mov.u64 %rd48, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd48, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs14, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs14 }, [ %rd49 + 0 ], %rd48; + // end inline asm + // begin inline asm + mov.u64 %rd50, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd50, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs15, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd51 + 0 ], %rd50; + // end inline asm + .loc 1 90 125 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125 + cvt.f32.bf16 %r165, %rs14; + cvt.f32.bf16 %r166, %rs15; + .loc 1 97 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24 + mul.f32 %r167, %r4, %r165; + mul.f32 %r168, %r4, %r166; + .loc 1 98 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:35 + add.s64 %rd55, %rd53, 64; + .loc 1 98 81 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:81 + // begin inline asm + mov.u64 %rd52, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd52, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs16, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs16 }, [ %rd53 + 0 ], %rd52; + // end inline asm + // begin inline asm + mov.u64 %rd54, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd54, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs17, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd55 + 0 ], %rd54; + // end inline asm + .loc 1 98 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142 + cvt.f32.bf16 %r169, %rs16; + cvt.f32.bf16 %r170, %rs17; + .loc 1 100 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24 + mul.f32 %r171, %r167, %r169; + mul.f32 %r172, %r168, %r170; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r173, %r161, %r171, %p3; + selp.f32 %r174, %r163, %r172, %p3; + .loc 1 111 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:111:24 + mul.f32 %r175, %r3, %r142; + mul.f32 %r176, %r3, %r143; + .loc 1 113 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:113:24 + mul.f32 %r177, %r175, %r144; + mul.f32 %r178, %r176, %r145; + .loc 1 116 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:116:24 + mul.f32 %r179, %r177, %r132; + mul.f32 %r180, %r178, %r133; + bar.sync 0; + st.shared.b32 [%r8], %r179; + st.shared.b32 [%r9+512], %r180; + bar.sync 0; + ld.shared.b32 %r181, [%r10]; + ld.shared.b32 %r182, [%r11]; + .loc 1 119 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24 + fma.rn.f32 %r183, %r146, %r173, %r181; + fma.rn.f32 %r184, %r147, %r174, %r182; + .loc 1 121 60 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:60 + or.b32 %r185, %r13, %r153; + .loc 1 121 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:35 + mad.wide.s32 %rd57, %r185, 2, %rd7; + cvt.s64.s32 %rd85, %r13; + add.s64 %rd86, %rd85, %rd80; + shl.b64 %rd87, %rd86, 1; + add.s64 %rd88, %rd7, %rd87; + add.s64 %rd59, %rd88, 64; + .loc 1 121 71 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:71 + // begin inline asm + mov.u64 %rd56, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd56, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs18, %rs10; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs18 }, [ %rd57 + 0 ], %rd56; + // end inline asm + // begin inline asm + mov.u64 %rd58, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd58, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs19, %rs10; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd59 + 0 ], %rd58; + // end inline asm + .loc 1 121 132 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:132 + cvt.f32.bf16 %r186, %rs18; + cvt.f32.bf16 %r187, %rs19; + .loc 1 126 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24 + mul.f32 %r188, %r6, %r186; + mul.f32 %r189, %r6, %r187; + .loc 1 127 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:35 + add.s64 %rd69, %rd11, %rd84; + add.s64 %rd61, %rd69, 2; + add.s64 %rd63, %rd69, 66; + .loc 1 127 85 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:85 + // begin inline asm + mov.u64 %rd60, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd60, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs20, %rs10; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs20 }, [ %rd61 + 0 ], %rd60; + // end inline asm + // begin inline asm + mov.u64 %rd62, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd62, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs21, %rs10; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs21 }, [ %rd63 + 0 ], %rd62; + // end inline asm + .loc 1 127 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:146 + cvt.f32.bf16 %r190, %rs20; + cvt.f32.bf16 %r191, %rs21; + .loc 1 131 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:131:17 + neg.f32 %r192, %r188; + fma.rn.f32 %r193, %r192, %r190, 0f00000000; + neg.f32 %r194, %r189; + fma.rn.f32 %r195, %r194, %r191, 0f00000000; + .loc 1 134 60 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:60 + or.b32 %r196, %r14, %r153; + .loc 1 134 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:35 + mad.wide.s32 %rd65, %r196, 2, %rd7; + cvt.s64.s32 %rd89, %r14; + add.s64 %rd90, %rd89, %rd80; + shl.b64 %rd91, %rd90, 1; + add.s64 %rd92, %rd7, %rd91; + add.s64 %rd67, %rd92, 64; + .loc 1 134 71 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:71 + // begin inline asm + mov.u64 %rd64, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd64, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs22, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs22 }, [ %rd65 + 0 ], %rd64; + // end inline asm + // begin inline asm + mov.u64 %rd66, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd66, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs23, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs23 }, [ %rd67 + 0 ], %rd66; + // end inline asm + .loc 1 134 132 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:132 + cvt.f32.bf16 %r197, %rs22; + cvt.f32.bf16 %r198, %rs23; + .loc 1 139 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:139:24 + mul.f32 %r199, %r6, %r197; + mul.f32 %r200, %r6, %r198; + .loc 1 140 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:35 + add.s64 %rd71, %rd69, 64; + .loc 1 140 81 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:81 + // begin inline asm + mov.u64 %rd68, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd68, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs24, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs24 }, [ %rd69 + 0 ], %rd68; + // end inline asm + // begin inline asm + mov.u64 %rd70, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd70, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs25, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs25 }, [ %rd71 + 0 ], %rd70; + // end inline asm + .loc 1 140 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:142 + cvt.f32.bf16 %r201, %rs24; + cvt.f32.bf16 %r202, %rs25; + .loc 1 142 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:142:24 + mul.f32 %r203, %r199, %r201; + mul.f32 %r204, %r200, %r202; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r205, %r193, %r203, %p3; + selp.f32 %r206, %r195, %r204, %p3; + .loc 1 151 25 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:151:25 + mul.f32 %r207, %r5, %r149; + mul.f32 %r208, %r5, %r150; + .loc 1 153 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:153:26 + mul.f32 %r209, %r207, %r151; + mul.f32 %r210, %r208, %r152; + .loc 1 156 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26 + mul.f32 %r211, %r209, %r132; + mul.f32 %r212, %r210, %r133; + bar.sync 0; + st.shared.b32 [%r8], %r211; + st.shared.b32 [%r9+512], %r212; + bar.sync 0; + ld.shared.b32 %r213, [%r10]; + ld.shared.b32 %r214, [%r11]; + .loc 1 159 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:159:26 + fma.rn.f32 %r215, %r146, %r205, %r213; + fma.rn.f32 %r216, %r147, %r206, %r214; + .loc 1 161 39 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:39 + or.b64 %rd93, %rd74, %rd4; + .loc 1 161 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:32 + shl.b64 %rd94, %rd93, 1; + add.s64 %rd72, %rd5, %rd94; + .loc 1 161 55 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:55 + cvt.rn.bf16.f32 %rs34, %r183; + cvt.rn.bf16.f32 %rs35, %r184; + bar.sync 0; + st.shared.b16 [%r15], %rs34; + st.shared.b16 [%r16], %rs35; + bar.sync 0; + ld.shared.b16 %rs36, [%r17]; + ld.shared.b16 %rs37, [%r17+4]; + mov.b32 %r138, {%rs36, %rs37}; + // begin inline asm + @%p2 st.global.b32 [ %rd72 + 0 ], { %r138 }; + // end inline asm + .loc 1 162 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:32 + add.s64 %rd73, %rd6, %rd94; + .loc 1 162 56 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:56 + cvt.rn.bf16.f32 %rs38, %r215; + cvt.rn.bf16.f32 %rs39, %r216; + bar.sync 0; + st.shared.b16 [%r15], %rs38; + st.shared.b16 [%r16], %rs39; + bar.sync 0; + ld.shared.b16 %rs40, [%r17]; + ld.shared.b16 %rs41, [%r17+4]; + mov.b32 %r139, {%rs40, %rs41}; + // begin inline asm + @%p2 st.global.b32 [ %rd73 + 0 ], { %r139 }; + // end inline asm + mov.b64 %rd95, 64; + mov.pred %p5, 0; + .loc 1 53 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43 + @%p1 bra $L__BB0_1; +// %bb.2: + .loc 1 53 4 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:4 + ret; +$L__tmp24: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 456 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x1c1 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 98 +.b8 118 +.b8 113 +.b8 104 +.b8 106 +.b8 116 +.b8 121 +.b8 103 +.b8 55 +.b8 102 +.b8 118 +.b8 120 +.b8 122 +.b8 119 +.b8 116 +.b8 98 +.b8 116 +.b8 116 +.b8 52 +.b8 118 +.b8 114 +.b8 100 +.b8 107 +.b8 98 +.b8 110 +.b8 98 +.b8 54 +.b8 110 +.b8 51 +.b8 50 +.b8 102 +.b8 110 +.b8 114 +.b8 105 +.b8 106 +.b8 106 +.b8 112 +.b8 108 +.b8 51 +.b8 118 +.b8 118 +.b8 52 +.b8 99 +.b8 102 +.b8 113 +.b8 100 +.b8 52 +.b8 109 +.b8 122 +.b8 110 +.b8 114 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 98 +.b8 118 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x6d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 114 +.b8 109 +.b8 115 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 100 +.b8 100 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 110 +.b8 101 +.b8 103 +.b8 95 +.b8 115 +.b8 112 +.b8 108 +.b8 105 +.b8 116 +.b8 95 +.b8 115 +.b8 112 +.b8 108 +.b8 105 +.b8 116 +.b8 95 +.b8 119 +.b8 105 +.b8 116 +.b8 104 +.b8 95 +.b8 115 +.b8 105 +.b8 122 +.b8 101 +.b8 115 +.b8 95 +.b8 115 +.b8 116 +.b8 97 +.b8 99 +.b8 107 +.b8 95 +.b8 117 +.b8 110 +.b8 98 +.b8 105 +.b8 110 +.b8 100 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x151:0x7a DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x166:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 51 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x17e:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 4 // Abbrev [4] 0x198:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp12 // DW_AT_low_pc +.b64 $L__tmp23 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 52 // DW_AT_call_line +.b8 27 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x1b0:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp12 // DW_AT_low_pc +.b64 $L__tmp23 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source new file mode 100644 index 0000000000000000000000000000000000000000..379b8a0b14f9ca7ecd9c4707e3e09d300cd2a4d8 --- /dev/null +++ b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source @@ -0,0 +1,972 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc213 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0) +#loc215 = loc(unknown) +#loc218 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0) +#loc222 = loc("in_out_ptr0"(#loc)) +#loc223 = loc("in_out_ptr1"(#loc)) +#loc224 = loc("in_ptr0"(#loc)) +#loc225 = loc("in_ptr1"(#loc)) +#loc226 = loc("in_ptr2"(#loc)) +#loc227 = loc("in_ptr3"(#loc)) +#loc228 = loc("in_ptr4"(#loc)) +#loc229 = loc("xnumel"(#loc)) +#loc230 = loc("r0_numel"(#loc)) +#loc432 = loc("input"(#loc213)) +#loc433 = loc("a"(#loc218)) +#loc434 = loc("b"(#loc218)) +module { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 73728 : i32 loc(#loc231) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc232) + %xoffset = tt.get_program_id x : i32 loc(#loc233) + %xoffset_2 = arith.constant 4 : i32 loc(#loc234) + %xoffset_3 = arith.constant 4 : i32 loc(#loc234) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc234) + %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc235) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc236) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<4x1xi32> loc(#loc237) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<4x1xi32> loc(#loc237) + %xmask = arith.constant true loc(#loc238) + %xmask_8 = arith.constant dense : tensor<4x64xi1> loc(#loc238) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc239) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc240) + %x0 = arith.constant 32 : i32 loc(#loc241) + %x0_10 = arith.constant 32 : i32 loc(#loc241) + %x0_11 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc241) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<4x1xi32> loc(#loc241) + %x1 = arith.constant 32 : i32 loc(#loc242) + %x1_13 = arith.constant 32 : i32 loc(#loc242) + %x1_14 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc242) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<4x1xi32> loc(#loc242) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc243) + %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc243) + %_tmp10 = arith.constant 0.000000e+00 : f32 loc(#loc244) + %_tmp10_17 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc244) + %c0_i32 = arith.constant 0 : i32 loc(#loc15) + %c64_i32 = arith.constant 64 : i32 loc(#loc15) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15) + %2 = arith.bitcast %c64_i32 : i32 to i32 loc(#loc15) + %3 = ub.poison : i32 loc(#loc15) + %_tmp10_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_16, %_tmp10_24 = %_tmp10_17) -> (tensor<4x64xf32>, tensor<4x64xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc246) + %r0_index_25 = arith.addi %r0_index, %r0_base_9 : tensor<1x64xi32> loc(#loc246) + %r0_mask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc247) + %r0_mask_26 = arith.cmpi slt, %r0_index_25, %r0_mask : tensor<1x64xi32> loc(#loc247) + %tmp0 = arith.constant 4096 : i32 loc(#loc248) + %tmp0_27 = arith.constant 4096 : i32 loc(#loc248) + %tmp0_28 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc248) + %tmp0_29 = arith.addi %tmp0_28, %r0_index_25 : tensor<1x64xi32> loc(#loc248) + %tmp0_30 = arith.constant 128 : i32 loc(#loc249) + %tmp0_31 = arith.constant 128 : i32 loc(#loc249) + %tmp0_32 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc249) + %tmp0_33 = arith.muli %tmp0_32, %x0_12 : tensor<4x1xi32> loc(#loc249) + %tmp0_34 = tt.broadcast %tmp0_29 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc250) + %tmp0_35 = tt.broadcast %tmp0_33 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc250) + %tmp0_36 = arith.addi %tmp0_34, %tmp0_35 : tensor<4x64xi32> loc(#loc250) + %tmp0_37 = arith.constant 36864 : i32 loc(#loc251) + %tmp0_38 = arith.constant 36864 : i32 loc(#loc251) + %tmp0_39 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc251) + %tmp0_40 = arith.muli %tmp0_39, %x1_15 : tensor<4x1xi32> loc(#loc251) + %tmp0_41 = tt.broadcast %tmp0_40 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc252) + %tmp0_42 = arith.addi %tmp0_36, %tmp0_41 : tensor<4x64xi32> loc(#loc252) + %tmp0_43 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x64x!tt.ptr> loc(#loc253) + %tmp0_44 = tt.addptr %tmp0_43, %tmp0_42 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc253) + %tmp0_45 = arith.constant 0.000000e+00 : f32 loc(#loc254) + %tmp0_46 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc254) + %tmp0_47 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc254) + %tmp0_48 = arith.truncf %tmp0_47 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc254) + %tmp0_49 = tt.load %tmp0_44, %tmp0_46, %tmp0_48 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc254) + %tmp0_50 = arith.extf %tmp0_49 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc255) + %tmp6 = arith.constant 128 : i32 loc(#loc256) + %tmp6_51 = arith.constant 128 : i32 loc(#loc256) + %tmp6_52 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc256) + %tmp6_53 = arith.muli %tmp6_52, %x0_12 : tensor<4x1xi32> loc(#loc256) + %tmp6_54 = tt.broadcast %r0_index_25 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc257) + %tmp6_55 = tt.broadcast %tmp6_53 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc257) + %tmp6_56 = arith.addi %tmp6_54, %tmp6_55 : tensor<4x64xi32> loc(#loc257) + %tmp6_57 = arith.constant 36864 : i32 loc(#loc258) + %tmp6_58 = arith.constant 36864 : i32 loc(#loc258) + %tmp6_59 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc258) + %tmp6_60 = arith.muli %tmp6_59, %x1_15 : tensor<4x1xi32> loc(#loc258) + %tmp6_61 = tt.broadcast %tmp6_60 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc259) + %tmp6_62 = arith.addi %tmp6_56, %tmp6_61 : tensor<4x64xi32> loc(#loc259) + %tmp6_63 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x64x!tt.ptr> loc(#loc260) + %tmp6_64 = tt.addptr %tmp6_63, %tmp6_62 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc260) + %tmp6_65 = arith.constant 0.000000e+00 : f32 loc(#loc261) + %tmp6_66 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc261) + %tmp6_67 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc261) + %tmp6_68 = arith.truncf %tmp6_67 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc261) + %tmp6_69 = tt.load %tmp6_64, %tmp6_66, %tmp6_68 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc261) + %tmp6_70 = arith.extf %tmp6_69 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc262) + %tmp2 = arith.mulf %tmp0_50, %tmp0_50 : tensor<4x64xf32> loc(#loc263) + %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<4x64xf32> loc(#loc264) + %_tmp4_71 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc265) + %_tmp4_72 = arith.select %_tmp4_71, %tmp5, %_tmp4_23 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc265) + %tmp8 = arith.mulf %tmp6_70, %tmp6_70 : tensor<4x64xf32> loc(#loc266) + %tmp11 = arith.addf %_tmp10_24, %tmp8 : tensor<4x64xf32> loc(#loc267) + %_tmp10_73 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc268) + %_tmp10_74 = arith.select %_tmp10_73, %tmp11, %_tmp10_24 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc268) + scf.yield %_tmp4_72, %_tmp10_74 : tensor<4x64xf32>, tensor<4x64xf32> loc(#loc39) + } loc(#loc435) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S4_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#0) : (tensor<4x64xf32>) -> tensor<4xf32> loc(#loc269) + %tmp4_19 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc270) + %tmp10 = tt.call @"triton.language.standard.sum__fp32S4_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#1) : (tensor<4x64xf32>) -> tensor<4xf32> loc(#loc271) + %tmp10_20 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc272) + %c0_i32_21 = arith.constant 0 : i32 loc(#loc44) + %c64_i32_22 = arith.constant 64 : i32 loc(#loc44) + %4 = arith.bitcast %c0_i32_21 : i32 to i32 loc(#loc44) + %5 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc44) + %6 = arith.bitcast %c64_i32_22 : i32 to i32 loc(#loc44) + %7 = ub.poison : i32 loc(#loc44) + scf.for %r0_offset = %4 to %5 step %6 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc273) + %r0_index_23 = arith.addi %r0_index, %r0_base_9 : tensor<1x64xi32> loc(#loc273) + %r0_mask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc274) + %r0_mask_24 = arith.cmpi slt, %r0_index_23, %r0_mask : tensor<1x64xi32> loc(#loc274) + %r0_3 = arith.constant 2 : i32 loc(#loc275) + %r0_3_25 = arith.constant 2 : i32 loc(#loc275) + %r0_3_26 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc275) + %r0_3_27 = arith.remsi %r0_index_23, %r0_3_26 : tensor<1x64xi32> loc(#loc275) + %r0_4 = arith.constant 2 : i32 loc(#loc276) + %r0_4_28 = arith.constant 2 : i32 loc(#loc276) + %r0_4_29 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc276) + %r0_4_30 = arith.divsi %r0_index_23, %r0_4_29 : tensor<1x64xi32> loc(#loc276) + %tmp50 = arith.constant 128 : i32 loc(#loc277) + %tmp50_31 = arith.constant 128 : i32 loc(#loc277) + %tmp50_32 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc277) + %tmp50_33 = arith.muli %tmp50_32, %x0_12 : tensor<4x1xi32> loc(#loc277) + %tmp50_34 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc278) + %tmp50_35 = tt.broadcast %tmp50_33 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc278) + %tmp50_36 = arith.addi %tmp50_34, %tmp50_35 : tensor<4x64xi32> loc(#loc278) + %tmp50_37 = arith.constant 36864 : i32 loc(#loc279) + %tmp50_38 = arith.constant 36864 : i32 loc(#loc279) + %tmp50_39 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc279) + %tmp50_40 = arith.muli %tmp50_39, %x1_15 : tensor<4x1xi32> loc(#loc279) + %tmp50_41 = tt.broadcast %tmp50_40 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc280) + %tmp50_42 = arith.addi %tmp50_36, %tmp50_41 : tensor<4x64xi32> loc(#loc280) + %tmp50_43 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x64x!tt.ptr> loc(#loc281) + %tmp50_44 = tt.addptr %tmp50_43, %tmp50_42 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc281) + %tmp50_45 = arith.constant 0.000000e+00 : f32 loc(#loc282) + %tmp50_46 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc282) + %tmp50_47 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc282) + %tmp50_48 = arith.truncf %tmp50_47 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc282) + %tmp50_49 = tt.load %tmp50_44, %tmp50_46, %tmp50_48 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc282) + %tmp50_50 = arith.extf %tmp50_49 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc283) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc284) + %tmp58_51 = tt.addptr %tmp58, %r0_index_23 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc284) + %tmp58_52 = arith.constant 0.000000e+00 : f32 loc(#loc285) + %tmp58_53 = arith.constant dense<0.000000e+00> : tensor<1x64xf32> loc(#loc285) + %tmp58_54 = arith.truncf %tmp58_53 : tensor<1x64xf32> to tensor<1x64xbf16> loc(#loc285) + %tmp58_55 = tt.load %tmp58_51, %r0_mask_24, %tmp58_54 evictionPolicy = evict_last : tensor<1x64x!tt.ptr> loc(#loc285) + %tmp58_56 = arith.extf %tmp58_55 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc286) + %tmp63 = arith.constant 128 : i32 loc(#loc287) + %tmp63_57 = arith.constant 128 : i32 loc(#loc287) + %tmp63_58 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc287) + %tmp63_59 = arith.muli %tmp63_58, %x1_15 : tensor<4x1xi32> loc(#loc287) + %tmp63_60 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc288) + %tmp63_61 = tt.broadcast %tmp63_59 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc288) + %tmp63_62 = arith.addi %tmp63_60, %tmp63_61 : tensor<4x64xi32> loc(#loc288) + %tmp63_63 = tt.splat %in_ptr2 : !tt.ptr -> tensor<4x64x!tt.ptr> loc(#loc289) + %tmp63_64 = tt.addptr %tmp63_63, %tmp63_62 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc289) + %tmp63_65 = arith.constant 0.000000e+00 : f32 loc(#loc290) + %tmp63_66 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc290) + %tmp63_67 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc290) + %tmp63_68 = tt.load %tmp63_64, %tmp63_66, %tmp63_67 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc290) + %tmp66 = arith.constant 128 : i32 loc(#loc291) + %tmp66_69 = arith.constant 128 : i32 loc(#loc291) + %tmp66_70 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc291) + %tmp66_71 = arith.muli %tmp66_70, %x1_15 : tensor<4x1xi32> loc(#loc291) + %tmp66_72 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc292) + %tmp66_73 = tt.broadcast %tmp66_71 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc292) + %tmp66_74 = arith.addi %tmp66_72, %tmp66_73 : tensor<4x64xi32> loc(#loc292) + %tmp66_75 = tt.splat %in_ptr3 : !tt.ptr -> tensor<4x64x!tt.ptr> loc(#loc293) + %tmp66_76 = tt.addptr %tmp66_75, %tmp66_74 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc293) + %tmp66_77 = arith.constant 0.000000e+00 : f32 loc(#loc294) + %tmp66_78 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc294) + %tmp66_79 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc294) + %tmp66_80 = tt.load %tmp66_76, %tmp66_78, %tmp66_79 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc294) + %tmp96 = arith.constant 4096 : i32 loc(#loc295) + %tmp96_81 = arith.constant 4096 : i32 loc(#loc295) + %tmp96_82 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc295) + %tmp96_83 = arith.addi %tmp96_82, %r0_index_23 : tensor<1x64xi32> loc(#loc295) + %tmp96_84 = arith.constant 128 : i32 loc(#loc296) + %tmp96_85 = arith.constant 128 : i32 loc(#loc296) + %tmp96_86 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc296) + %tmp96_87 = arith.muli %tmp96_86, %x0_12 : tensor<4x1xi32> loc(#loc296) + %tmp96_88 = tt.broadcast %tmp96_83 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc297) + %tmp96_89 = tt.broadcast %tmp96_87 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc297) + %tmp96_90 = arith.addi %tmp96_88, %tmp96_89 : tensor<4x64xi32> loc(#loc297) + %tmp96_91 = arith.constant 36864 : i32 loc(#loc298) + %tmp96_92 = arith.constant 36864 : i32 loc(#loc298) + %tmp96_93 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc298) + %tmp96_94 = arith.muli %tmp96_93, %x1_15 : tensor<4x1xi32> loc(#loc298) + %tmp96_95 = tt.broadcast %tmp96_94 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc299) + %tmp96_96 = arith.addi %tmp96_90, %tmp96_95 : tensor<4x64xi32> loc(#loc299) + %tmp96_97 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x64x!tt.ptr> loc(#loc300) + %tmp96_98 = tt.addptr %tmp96_97, %tmp96_96 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc300) + %tmp96_99 = arith.constant 0.000000e+00 : f32 loc(#loc301) + %tmp96_100 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc301) + %tmp96_101 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc301) + %tmp96_102 = arith.truncf %tmp96_101 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc301) + %tmp96_103 = tt.load %tmp96_98, %tmp96_100, %tmp96_102 evictionPolicy = evict_first : tensor<4x64x!tt.ptr> loc(#loc301) + %tmp96_104 = arith.extf %tmp96_103 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc302) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc303) + %tmp102_105 = tt.addptr %tmp102, %r0_index_23 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc303) + %tmp102_106 = arith.constant 0.000000e+00 : f32 loc(#loc304) + %tmp102_107 = arith.constant dense<0.000000e+00> : tensor<1x64xf32> loc(#loc304) + %tmp102_108 = arith.truncf %tmp102_107 : tensor<1x64xf32> to tensor<1x64xbf16> loc(#loc304) + %tmp102_109 = tt.load %tmp102_105, %r0_mask_24, %tmp102_108 evictionPolicy = evict_last : tensor<1x64x!tt.ptr> loc(#loc304) + %tmp102_110 = arith.extf %tmp102_109 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc305) + %tmp13 = arith.constant 0 : i64 loc(#loc306) + %tmp13_111 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc306) + %tmp14 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc307) + %tmp14_112 = arith.constant dense<0> : tensor<1x64xi64> loc(#loc307) + %tmp14_113 = arith.cmpi sge, %tmp14, %tmp14_112 : tensor<1x64xi64> loc(#loc307) + %tmp15 = arith.constant 1 : i64 loc(#loc308) + %tmp15_114 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc308) + %tmp16 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc309) + %tmp16_115 = arith.constant dense<1> : tensor<1x64xi64> loc(#loc309) + %tmp16_116 = arith.cmpi slt, %tmp16, %tmp16_115 : tensor<1x64xi64> loc(#loc309) + %tmp17 = arith.constant 2 : i32 loc(#loc310) + %tmp17_117 = arith.constant 2 : i32 loc(#loc310) + %tmp17_118 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc310) + %tmp17_119 = arith.muli %tmp17_118, %r0_4_30 : tensor<1x64xi32> loc(#loc310) + %tmp17_120 = arith.constant 1 : i32 loc(#loc311) + %tmp17_121 = arith.constant 1 : i32 loc(#loc311) + %tmp17_122 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc311) + %tmp17_123 = arith.addi %tmp17_122, %tmp17_119 : tensor<1x64xi32> loc(#loc311) + %tmp17_124 = arith.constant 128 : i32 loc(#loc312) + %tmp17_125 = arith.constant 128 : i32 loc(#loc312) + %tmp17_126 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc312) + %tmp17_127 = arith.muli %tmp17_126, %x0_12 : tensor<4x1xi32> loc(#loc312) + %tmp17_128 = tt.broadcast %tmp17_123 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc313) + %tmp17_129 = tt.broadcast %tmp17_127 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc313) + %tmp17_130 = arith.addi %tmp17_128, %tmp17_129 : tensor<4x64xi32> loc(#loc313) + %tmp17_131 = arith.constant 36864 : i32 loc(#loc314) + %tmp17_132 = arith.constant 36864 : i32 loc(#loc314) + %tmp17_133 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc314) + %tmp17_134 = arith.muli %tmp17_133, %x1_15 : tensor<4x1xi32> loc(#loc314) + %tmp17_135 = tt.broadcast %tmp17_134 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc315) + %tmp17_136 = arith.addi %tmp17_130, %tmp17_135 : tensor<4x64xi32> loc(#loc315) + %tmp17_137 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x64x!tt.ptr> loc(#loc316) + %tmp17_138 = tt.addptr %tmp17_137, %tmp17_136 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc316) + %tmp17_139 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc317) + %tmp17_140 = arith.constant 0.000000e+00 : f32 loc(#loc318) + %tmp17_141 = tt.broadcast %tmp17_139 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc318) + %tmp17_142 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc318) + %tmp17_143 = arith.truncf %tmp17_142 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc318) + %tmp17_144 = tt.load %tmp17_138, %tmp17_141, %tmp17_143 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc318) + %tmp17_145 = arith.extf %tmp17_144 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc319) + %tmp19 = arith.constant 1.280000e+02 : f32 loc(#loc320) + %tmp20 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc321) + %tmp20_146 = arith.divf %tmp10_20, %tmp20 : tensor<4x1xf32> loc(#loc321) + %tmp21 = arith.constant 9.99999997E-7 : f32 loc(#loc322) + %tmp22 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc323) + %tmp22_147 = arith.addf %tmp20_146, %tmp22 : tensor<4x1xf32> loc(#loc323) + %tmp23 = tt.extern_elementwise %tmp22_147 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc324) + %tmp24 = tt.broadcast %tmp23 : tensor<4x1xf32> -> tensor<4x64xf32> loc(#loc325) + %tmp24_148 = arith.mulf %tmp17_145, %tmp24 : tensor<4x64xf32> loc(#loc325) + %tmp25 = arith.constant 2 : i32 loc(#loc326) + %tmp25_149 = arith.constant 2 : i32 loc(#loc326) + %tmp25_150 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc326) + %tmp25_151 = arith.muli %tmp25_150, %r0_4_30 : tensor<1x64xi32> loc(#loc326) + %tmp25_152 = arith.constant 1 : i32 loc(#loc327) + %tmp25_153 = arith.constant 1 : i32 loc(#loc327) + %tmp25_154 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc327) + %tmp25_155 = arith.addi %tmp25_154, %tmp25_151 : tensor<1x64xi32> loc(#loc327) + %tmp25_156 = tt.broadcast %tmp25_155 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc328) + %tmp25_157 = tt.splat %in_ptr1 : !tt.ptr -> tensor<4x64x!tt.ptr> loc(#loc329) + %tmp25_158 = tt.addptr %tmp25_157, %tmp25_156 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc329) + %tmp25_159 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc330) + %tmp25_160 = arith.constant 0.000000e+00 : f32 loc(#loc331) + %tmp25_161 = tt.broadcast %tmp25_159 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc331) + %tmp25_162 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc331) + %tmp25_163 = arith.truncf %tmp25_162 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc331) + %tmp25_164 = tt.load %tmp25_158, %tmp25_161, %tmp25_163 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc331) + %tmp25_165 = arith.extf %tmp25_164 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc332) + %tmp27 = arith.mulf %tmp24_148, %tmp25_165 : tensor<4x64xf32> loc(#loc333) + %tmp29 = arith.constant 0.000000e+00 : f32 loc(#loc334) + %tmp29_166 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc334) + %tmp29_167 = arith.subf %tmp29_166, %tmp27 : tensor<4x64xf32> loc(#loc334) + %tmp30 = arith.constant 0.000000e+00 : f32 loc(#loc335) + %tmp30_168 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc335) + %tmp31 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc336) + %tmp31_169 = arith.select %tmp31, %tmp29_167, %tmp30_168 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc336) + %tmp32 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc337) + %tmp32_170 = arith.constant dense<1> : tensor<1x64xi64> loc(#loc337) + %tmp32_171 = arith.cmpi sge, %tmp32, %tmp32_170 : tensor<1x64xi64> loc(#loc337) + %tmp33 = arith.constant 2 : i64 loc(#loc338) + %tmp33_172 = arith.constant dense<2> : tensor<1x1xi64> loc(#loc338) + %tmp34 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc339) + %tmp34_173 = arith.constant dense<2> : tensor<1x64xi64> loc(#loc339) + %tmp34_174 = arith.cmpi slt, %tmp34, %tmp34_173 : tensor<1x64xi64> loc(#loc339) + %tmp35 = arith.constant 2 : i32 loc(#loc340) + %tmp35_175 = arith.constant 2 : i32 loc(#loc340) + %tmp35_176 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc340) + %tmp35_177 = arith.muli %tmp35_176, %r0_4_30 : tensor<1x64xi32> loc(#loc340) + %tmp35_178 = arith.constant 128 : i32 loc(#loc341) + %tmp35_179 = arith.constant 128 : i32 loc(#loc341) + %tmp35_180 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc341) + %tmp35_181 = arith.muli %tmp35_180, %x0_12 : tensor<4x1xi32> loc(#loc341) + %tmp35_182 = tt.broadcast %tmp35_177 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc342) + %tmp35_183 = tt.broadcast %tmp35_181 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc342) + %tmp35_184 = arith.addi %tmp35_182, %tmp35_183 : tensor<4x64xi32> loc(#loc342) + %tmp35_185 = arith.constant 36864 : i32 loc(#loc343) + %tmp35_186 = arith.constant 36864 : i32 loc(#loc343) + %tmp35_187 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc343) + %tmp35_188 = arith.muli %tmp35_187, %x1_15 : tensor<4x1xi32> loc(#loc343) + %tmp35_189 = tt.broadcast %tmp35_188 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc344) + %tmp35_190 = arith.addi %tmp35_184, %tmp35_189 : tensor<4x64xi32> loc(#loc344) + %tmp35_191 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x64x!tt.ptr> loc(#loc345) + %tmp35_192 = tt.addptr %tmp35_191, %tmp35_190 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc345) + %tmp35_193 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc346) + %tmp35_194 = arith.constant 0.000000e+00 : f32 loc(#loc347) + %tmp35_195 = tt.broadcast %tmp35_193 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc347) + %tmp35_196 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc347) + %tmp35_197 = arith.truncf %tmp35_196 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc347) + %tmp35_198 = tt.load %tmp35_192, %tmp35_195, %tmp35_197 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc347) + %tmp35_199 = arith.extf %tmp35_198 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc348) + %tmp37 = arith.constant 1.280000e+02 : f32 loc(#loc349) + %tmp38 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc350) + %tmp38_200 = arith.divf %tmp10_20, %tmp38 : tensor<4x1xf32> loc(#loc350) + %tmp39 = arith.constant 9.99999997E-7 : f32 loc(#loc351) + %tmp40 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc352) + %tmp40_201 = arith.addf %tmp38_200, %tmp40 : tensor<4x1xf32> loc(#loc352) + %tmp41 = tt.extern_elementwise %tmp40_201 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc353) + %tmp42 = tt.broadcast %tmp41 : tensor<4x1xf32> -> tensor<4x64xf32> loc(#loc354) + %tmp42_202 = arith.mulf %tmp35_199, %tmp42 : tensor<4x64xf32> loc(#loc354) + %tmp43 = arith.constant 2 : i32 loc(#loc355) + %tmp43_203 = arith.constant 2 : i32 loc(#loc355) + %tmp43_204 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc355) + %tmp43_205 = arith.muli %tmp43_204, %r0_4_30 : tensor<1x64xi32> loc(#loc355) + %tmp43_206 = tt.broadcast %tmp43_205 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc356) + %tmp43_207 = tt.splat %in_ptr1 : !tt.ptr -> tensor<4x64x!tt.ptr> loc(#loc357) + %tmp43_208 = tt.addptr %tmp43_207, %tmp43_206 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc357) + %tmp43_209 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc358) + %tmp43_210 = arith.constant 0.000000e+00 : f32 loc(#loc359) + %tmp43_211 = tt.broadcast %tmp43_209 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc359) + %tmp43_212 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc359) + %tmp43_213 = arith.truncf %tmp43_212 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc359) + %tmp43_214 = tt.load %tmp43_208, %tmp43_211, %tmp43_213 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc359) + %tmp43_215 = arith.extf %tmp43_214 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc360) + %tmp45 = arith.mulf %tmp42_202, %tmp43_215 : tensor<4x64xf32> loc(#loc361) + %tmp47 = arith.constant 0.000000e+00 : f32 loc(#loc362) + %tmp47_216 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc362) + %tmp48 = tt.broadcast %tmp32_171 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc363) + %tmp48_217 = arith.select %tmp48, %tmp45, %tmp47_216 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc363) + %tmp49 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc364) + %tmp49_218 = arith.select %tmp49, %tmp31_169, %tmp48_217 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc364) + %tmp52 = arith.constant 1.280000e+02 : f32 loc(#loc365) + %tmp53 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc366) + %tmp53_219 = arith.divf %tmp10_20, %tmp53 : tensor<4x1xf32> loc(#loc366) + %tmp54 = arith.constant 9.99999997E-7 : f32 loc(#loc367) + %tmp55 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc368) + %tmp55_220 = arith.addf %tmp53_219, %tmp55 : tensor<4x1xf32> loc(#loc368) + %tmp56 = tt.extern_elementwise %tmp55_220 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc369) + %tmp57 = tt.broadcast %tmp56 : tensor<4x1xf32> -> tensor<4x64xf32> loc(#loc370) + %tmp57_221 = arith.mulf %tmp50_50, %tmp57 : tensor<4x64xf32> loc(#loc370) + %tmp60 = tt.broadcast %tmp58_56 : tensor<1x64xf32> -> tensor<4x64xf32> loc(#loc371) + %tmp60_222 = arith.mulf %tmp57_221, %tmp60 : tensor<4x64xf32> loc(#loc371) + %tmp64 = arith.mulf %tmp60_222, %tmp63_68 : tensor<4x64xf32> loc(#loc372) + %tmp67 = arith.mulf %tmp49_218, %tmp66_80 : tensor<4x64xf32> loc(#loc373) + %tmp68 = arith.addf %tmp64, %tmp67 : tensor<4x64xf32> loc(#loc374) + %tmp70 = arith.constant 2 : i32 loc(#loc375) + %tmp70_223 = arith.constant 2 : i32 loc(#loc375) + %tmp70_224 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc375) + %tmp70_225 = arith.muli %tmp70_224, %r0_4_30 : tensor<1x64xi32> loc(#loc375) + %tmp70_226 = arith.constant 4097 : i32 loc(#loc376) + %tmp70_227 = arith.constant 4097 : i32 loc(#loc376) + %tmp70_228 = arith.constant dense<4097> : tensor<1x64xi32> loc(#loc376) + %tmp70_229 = arith.addi %tmp70_228, %tmp70_225 : tensor<1x64xi32> loc(#loc376) + %tmp70_230 = arith.constant 128 : i32 loc(#loc377) + %tmp70_231 = arith.constant 128 : i32 loc(#loc377) + %tmp70_232 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc377) + %tmp70_233 = arith.muli %tmp70_232, %x0_12 : tensor<4x1xi32> loc(#loc377) + %tmp70_234 = tt.broadcast %tmp70_229 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc378) + %tmp70_235 = tt.broadcast %tmp70_233 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc378) + %tmp70_236 = arith.addi %tmp70_234, %tmp70_235 : tensor<4x64xi32> loc(#loc378) + %tmp70_237 = arith.constant 36864 : i32 loc(#loc379) + %tmp70_238 = arith.constant 36864 : i32 loc(#loc379) + %tmp70_239 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc379) + %tmp70_240 = arith.muli %tmp70_239, %x1_15 : tensor<4x1xi32> loc(#loc379) + %tmp70_241 = tt.broadcast %tmp70_240 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc380) + %tmp70_242 = arith.addi %tmp70_236, %tmp70_241 : tensor<4x64xi32> loc(#loc380) + %tmp70_243 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x64x!tt.ptr> loc(#loc381) + %tmp70_244 = tt.addptr %tmp70_243, %tmp70_242 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc381) + %tmp70_245 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc382) + %tmp70_246 = arith.constant 0.000000e+00 : f32 loc(#loc383) + %tmp70_247 = tt.broadcast %tmp70_245 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc383) + %tmp70_248 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc383) + %tmp70_249 = arith.truncf %tmp70_248 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc383) + %tmp70_250 = tt.load %tmp70_244, %tmp70_247, %tmp70_249 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc383) + %tmp70_251 = arith.extf %tmp70_250 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc384) + %tmp72 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc385) + %tmp72_252 = arith.divf %tmp4_19, %tmp72 : tensor<4x1xf32> loc(#loc385) + %tmp73 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc386) + %tmp73_253 = arith.addf %tmp72_252, %tmp73 : tensor<4x1xf32> loc(#loc386) + %tmp74 = tt.extern_elementwise %tmp73_253 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc387) + %tmp75 = tt.broadcast %tmp74 : tensor<4x1xf32> -> tensor<4x64xf32> loc(#loc388) + %tmp75_254 = arith.mulf %tmp70_251, %tmp75 : tensor<4x64xf32> loc(#loc388) + %tmp76 = arith.constant 2 : i32 loc(#loc389) + %tmp76_255 = arith.constant 2 : i32 loc(#loc389) + %tmp76_256 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc389) + %tmp76_257 = arith.muli %tmp76_256, %r0_4_30 : tensor<1x64xi32> loc(#loc389) + %tmp76_258 = arith.constant 1 : i32 loc(#loc390) + %tmp76_259 = arith.constant 1 : i32 loc(#loc390) + %tmp76_260 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc390) + %tmp76_261 = arith.addi %tmp76_260, %tmp76_257 : tensor<1x64xi32> loc(#loc390) + %tmp76_262 = tt.broadcast %tmp76_261 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc391) + %tmp76_263 = tt.splat %in_ptr4 : !tt.ptr -> tensor<4x64x!tt.ptr> loc(#loc392) + %tmp76_264 = tt.addptr %tmp76_263, %tmp76_262 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc392) + %tmp76_265 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc393) + %tmp76_266 = arith.constant 0.000000e+00 : f32 loc(#loc394) + %tmp76_267 = tt.broadcast %tmp76_265 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc394) + %tmp76_268 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc394) + %tmp76_269 = arith.truncf %tmp76_268 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc394) + %tmp76_270 = tt.load %tmp76_264, %tmp76_267, %tmp76_269 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc394) + %tmp76_271 = arith.extf %tmp76_270 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc395) + %tmp78 = arith.mulf %tmp75_254, %tmp76_271 : tensor<4x64xf32> loc(#loc396) + %tmp80 = arith.constant 0.000000e+00 : f32 loc(#loc397) + %tmp80_272 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc397) + %tmp80_273 = arith.subf %tmp80_272, %tmp78 : tensor<4x64xf32> loc(#loc397) + %tmp81 = arith.constant 0.000000e+00 : f32 loc(#loc398) + %tmp81_274 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc398) + %tmp82 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc399) + %tmp82_275 = arith.select %tmp82, %tmp80_273, %tmp81_274 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc399) + %tmp83 = arith.constant 2 : i32 loc(#loc400) + %tmp83_276 = arith.constant 2 : i32 loc(#loc400) + %tmp83_277 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc400) + %tmp83_278 = arith.muli %tmp83_277, %r0_4_30 : tensor<1x64xi32> loc(#loc400) + %tmp83_279 = arith.constant 4096 : i32 loc(#loc401) + %tmp83_280 = arith.constant 4096 : i32 loc(#loc401) + %tmp83_281 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc401) + %tmp83_282 = arith.addi %tmp83_281, %tmp83_278 : tensor<1x64xi32> loc(#loc401) + %tmp83_283 = arith.constant 128 : i32 loc(#loc402) + %tmp83_284 = arith.constant 128 : i32 loc(#loc402) + %tmp83_285 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc402) + %tmp83_286 = arith.muli %tmp83_285, %x0_12 : tensor<4x1xi32> loc(#loc402) + %tmp83_287 = tt.broadcast %tmp83_282 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc403) + %tmp83_288 = tt.broadcast %tmp83_286 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc403) + %tmp83_289 = arith.addi %tmp83_287, %tmp83_288 : tensor<4x64xi32> loc(#loc403) + %tmp83_290 = arith.constant 36864 : i32 loc(#loc404) + %tmp83_291 = arith.constant 36864 : i32 loc(#loc404) + %tmp83_292 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc404) + %tmp83_293 = arith.muli %tmp83_292, %x1_15 : tensor<4x1xi32> loc(#loc404) + %tmp83_294 = tt.broadcast %tmp83_293 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc405) + %tmp83_295 = arith.addi %tmp83_289, %tmp83_294 : tensor<4x64xi32> loc(#loc405) + %tmp83_296 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x64x!tt.ptr> loc(#loc406) + %tmp83_297 = tt.addptr %tmp83_296, %tmp83_295 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc406) + %tmp83_298 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc407) + %tmp83_299 = arith.constant 0.000000e+00 : f32 loc(#loc408) + %tmp83_300 = tt.broadcast %tmp83_298 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc408) + %tmp83_301 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc408) + %tmp83_302 = arith.truncf %tmp83_301 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc408) + %tmp83_303 = tt.load %tmp83_297, %tmp83_300, %tmp83_302 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc408) + %tmp83_304 = arith.extf %tmp83_303 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc409) + %tmp85 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc410) + %tmp85_305 = arith.divf %tmp4_19, %tmp85 : tensor<4x1xf32> loc(#loc410) + %tmp86 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc411) + %tmp86_306 = arith.addf %tmp85_305, %tmp86 : tensor<4x1xf32> loc(#loc411) + %tmp87 = tt.extern_elementwise %tmp86_306 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc412) + %tmp88 = tt.broadcast %tmp87 : tensor<4x1xf32> -> tensor<4x64xf32> loc(#loc413) + %tmp88_307 = arith.mulf %tmp83_304, %tmp88 : tensor<4x64xf32> loc(#loc413) + %tmp89 = arith.constant 2 : i32 loc(#loc414) + %tmp89_308 = arith.constant 2 : i32 loc(#loc414) + %tmp89_309 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc414) + %tmp89_310 = arith.muli %tmp89_309, %r0_4_30 : tensor<1x64xi32> loc(#loc414) + %tmp89_311 = tt.broadcast %tmp89_310 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc415) + %tmp89_312 = tt.splat %in_ptr4 : !tt.ptr -> tensor<4x64x!tt.ptr> loc(#loc416) + %tmp89_313 = tt.addptr %tmp89_312, %tmp89_311 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc416) + %tmp89_314 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc417) + %tmp89_315 = arith.constant 0.000000e+00 : f32 loc(#loc418) + %tmp89_316 = tt.broadcast %tmp89_314 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc418) + %tmp89_317 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc418) + %tmp89_318 = arith.truncf %tmp89_317 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc418) + %tmp89_319 = tt.load %tmp89_313, %tmp89_316, %tmp89_318 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc418) + %tmp89_320 = arith.extf %tmp89_319 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc419) + %tmp91 = arith.mulf %tmp88_307, %tmp89_320 : tensor<4x64xf32> loc(#loc420) + %tmp93 = arith.constant 0.000000e+00 : f32 loc(#loc421) + %tmp93_321 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc421) + %tmp94 = tt.broadcast %tmp32_171 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc422) + %tmp94_322 = arith.select %tmp94, %tmp91, %tmp93_321 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc422) + %tmp95 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc423) + %tmp95_323 = arith.select %tmp95, %tmp82_275, %tmp94_322 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc423) + %tmp98 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc424) + %tmp98_324 = arith.divf %tmp4_19, %tmp98 : tensor<4x1xf32> loc(#loc424) + %tmp99 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc425) + %tmp99_325 = arith.addf %tmp98_324, %tmp99 : tensor<4x1xf32> loc(#loc425) + %tmp100 = tt.extern_elementwise %tmp99_325 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc426) + %tmp101 = tt.broadcast %tmp100 : tensor<4x1xf32> -> tensor<4x64xf32> loc(#loc427) + %tmp101_326 = arith.mulf %tmp96_104, %tmp101 : tensor<4x64xf32> loc(#loc427) + %tmp104 = tt.broadcast %tmp102_110 : tensor<1x64xf32> -> tensor<4x64xf32> loc(#loc428) + %tmp104_327 = arith.mulf %tmp101_326, %tmp104 : tensor<4x64xf32> loc(#loc428) + %tmp107 = arith.mulf %tmp104_327, %tmp63_68 : tensor<4x64xf32> loc(#loc429) + %tmp109 = arith.mulf %tmp95_323, %tmp66_80 : tensor<4x64xf32> loc(#loc430) + %tmp110 = arith.addf %tmp107, %tmp109 : tensor<4x64xf32> loc(#loc431) + %c128_i32 = arith.constant 128 : i32 loc(#loc204) + %c128_i32_328 = arith.constant 128 : i32 loc(#loc204) + %cst = arith.constant dense<128> : tensor<4x1xi32> loc(#loc204) + %8 = arith.muli %cst, %xindex_7 : tensor<4x1xi32> loc(#loc204) + %9 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc205) + %10 = tt.broadcast %8 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc205) + %11 = arith.addi %9, %10 : tensor<4x64xi32> loc(#loc205) + %12 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<4x64x!tt.ptr> loc(#loc206) + %13 = tt.addptr %12, %11 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc206) + %14 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc207) + %15 = arith.truncf %tmp68 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc207) + tt.store %13, %15, %14 : tensor<4x64x!tt.ptr> loc(#loc207) + %c128_i32_329 = arith.constant 128 : i32 loc(#loc208) + %c128_i32_330 = arith.constant 128 : i32 loc(#loc208) + %cst_331 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc208) + %16 = arith.muli %cst_331, %xindex_7 : tensor<4x1xi32> loc(#loc208) + %17 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc209) + %18 = tt.broadcast %16 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc209) + %19 = arith.addi %17, %18 : tensor<4x64xi32> loc(#loc209) + %20 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<4x64x!tt.ptr> loc(#loc210) + %21 = tt.addptr %20, %19 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc210) + %22 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc211) + %23 = arith.truncf %tmp110 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc211) + tt.store %21, %23, %22 : tensor<4x64x!tt.ptr> loc(#loc211) + } loc(#loc44) + tt.return loc(#loc212) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S4_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<4x64xf32> loc("input"(#loc213))) -> tensor<4xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc214) + tt.reduce.return %2 : f32 loc(#loc214) + }) : (tensor<4x64xf32>) -> tensor<4xf32> loc(#loc214) + tt.return %0 : tensor<4xf32> loc(#loc216) + ^bb1: // no predecessors + %1 = ub.poison : tensor<4xf32> loc(#loc217) + tt.return %1 : tensor<4xf32> loc(#loc217) + } loc(#loc213) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc218)), %b: f32 loc("b"(#loc218))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc219) + tt.return %0 : f32 loc(#loc220) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc221) + tt.return %1 : f32 loc(#loc221) + } loc(#loc218) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":25:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":30:43) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":32:44) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:45) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:56) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:46) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:42) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:53) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:64) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":68:35) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":69:25) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":70:35) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:52) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:63) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":74:16) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":76:16) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:57) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:55) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:63) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:95) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":85:42) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":88:35) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":89:24) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:37) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:48) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:59) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":92:16) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":93:25) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":94:16) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":95:24) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":96:32) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:53) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:59) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:91) +#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":102:42) +#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":106:16) +#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":107:25) +#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":108:16) +#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":109:24) +#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":110:32) +#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:44) +#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc149 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:55) +#loc150 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc151 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:66) +#loc152 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc153 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc154 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:81) +#loc155 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc156 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc157 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc158 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc159 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc160 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc161 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:57) +#loc162 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:55) +#loc163 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:63) +#loc164 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc165 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:95) +#loc166 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc167 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc168 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc169 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc170 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":132:42) +#loc171 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc172 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:44) +#loc173 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc174 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:55) +#loc175 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc176 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:66) +#loc177 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc178 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc179 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:81) +#loc180 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc181 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc182 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":136:24) +#loc183 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":137:24) +#loc184 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":138:32) +#loc185 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc186 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:53) +#loc187 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:59) +#loc188 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc189 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:91) +#loc190 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc191 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc192 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc193 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":144:42) +#loc194 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc195 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc196 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":148:24) +#loc197 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":149:24) +#loc198 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":150:33) +#loc199 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc200 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc201 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc202 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc203 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc204 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc205 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc206 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc207 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc208 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:43) +#loc209 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:39) +#loc210 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc211 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc212 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc214 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc216 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11) +#loc217 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4) +#loc219 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc220 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11) +#loc221 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4) +#loc231 = loc("xnumel"(#loc1)) +#loc232 = loc("r0_numel"(#loc2)) +#loc233 = loc("xoffset"(#loc3)) +#loc234 = loc("xoffset"(#loc4)) +#loc235 = loc("xindex"(#loc5)) +#loc236 = loc("xindex"(#loc6)) +#loc237 = loc("xindex"(#loc7)) +#loc238 = loc("xmask"(#loc8)) +#loc239 = loc("r0_base"(#loc9)) +#loc240 = loc("r0_base"(#loc10)) +#loc241 = loc("x0"(#loc11)) +#loc242 = loc("x1"(#loc12)) +#loc243 = loc("_tmp4"(#loc13)) +#loc244 = loc("_tmp10"(#loc14)) +#loc245 = loc("_tmp4"(#loc15)) +#loc246 = loc("r0_index"(#loc16)) +#loc247 = loc("r0_mask"(#loc17)) +#loc248 = loc("tmp0"(#loc18)) +#loc249 = loc("tmp0"(#loc19)) +#loc250 = loc("tmp0"(#loc20)) +#loc251 = loc("tmp0"(#loc21)) +#loc252 = loc("tmp0"(#loc22)) +#loc253 = loc("tmp0"(#loc23)) +#loc254 = loc("tmp0"(#loc24)) +#loc255 = loc("tmp0"(#loc25)) +#loc256 = loc("tmp6"(#loc26)) +#loc257 = loc("tmp6"(#loc27)) +#loc258 = loc("tmp6"(#loc28)) +#loc259 = loc("tmp6"(#loc29)) +#loc260 = loc("tmp6"(#loc30)) +#loc261 = loc("tmp6"(#loc31)) +#loc262 = loc("tmp6"(#loc32)) +#loc263 = loc("tmp2"(#loc33)) +#loc264 = loc("tmp5"(#loc34)) +#loc265 = loc("_tmp4"(#loc35)) +#loc266 = loc("tmp8"(#loc36)) +#loc267 = loc("tmp11"(#loc37)) +#loc268 = loc("_tmp10"(#loc38)) +#loc269 = loc("tmp4"(#loc40)) +#loc270 = loc("tmp4"(#loc41)) +#loc271 = loc("tmp10"(#loc42)) +#loc272 = loc("tmp10"(#loc43)) +#loc273 = loc("r0_index"(#loc45)) +#loc274 = loc("r0_mask"(#loc46)) +#loc275 = loc("r0_3"(#loc47)) +#loc276 = loc("r0_4"(#loc48)) +#loc277 = loc("tmp50"(#loc49)) +#loc278 = loc("tmp50"(#loc50)) +#loc279 = loc("tmp50"(#loc51)) +#loc280 = loc("tmp50"(#loc52)) +#loc281 = loc("tmp50"(#loc53)) +#loc282 = loc("tmp50"(#loc54)) +#loc283 = loc("tmp50"(#loc55)) +#loc284 = loc("tmp58"(#loc56)) +#loc285 = loc("tmp58"(#loc57)) +#loc286 = loc("tmp58"(#loc58)) +#loc287 = loc("tmp63"(#loc59)) +#loc288 = loc("tmp63"(#loc60)) +#loc289 = loc("tmp63"(#loc61)) +#loc290 = loc("tmp63"(#loc62)) +#loc291 = loc("tmp66"(#loc63)) +#loc292 = loc("tmp66"(#loc64)) +#loc293 = loc("tmp66"(#loc65)) +#loc294 = loc("tmp66"(#loc66)) +#loc295 = loc("tmp96"(#loc67)) +#loc296 = loc("tmp96"(#loc68)) +#loc297 = loc("tmp96"(#loc69)) +#loc298 = loc("tmp96"(#loc70)) +#loc299 = loc("tmp96"(#loc71)) +#loc300 = loc("tmp96"(#loc72)) +#loc301 = loc("tmp96"(#loc73)) +#loc302 = loc("tmp96"(#loc74)) +#loc303 = loc("tmp102"(#loc75)) +#loc304 = loc("tmp102"(#loc76)) +#loc305 = loc("tmp102"(#loc77)) +#loc306 = loc("tmp13"(#loc78)) +#loc307 = loc("tmp14"(#loc79)) +#loc308 = loc("tmp15"(#loc80)) +#loc309 = loc("tmp16"(#loc81)) +#loc310 = loc("tmp17"(#loc82)) +#loc311 = loc("tmp17"(#loc83)) +#loc312 = loc("tmp17"(#loc84)) +#loc313 = loc("tmp17"(#loc85)) +#loc314 = loc("tmp17"(#loc86)) +#loc315 = loc("tmp17"(#loc87)) +#loc316 = loc("tmp17"(#loc88)) +#loc317 = loc("tmp17"(#loc89)) +#loc318 = loc("tmp17"(#loc90)) +#loc319 = loc("tmp17"(#loc91)) +#loc320 = loc("tmp19"(#loc92)) +#loc321 = loc("tmp20"(#loc93)) +#loc322 = loc("tmp21"(#loc94)) +#loc323 = loc("tmp22"(#loc95)) +#loc324 = loc("tmp23"(#loc96)) +#loc325 = loc("tmp24"(#loc97)) +#loc326 = loc("tmp25"(#loc98)) +#loc327 = loc("tmp25"(#loc99)) +#loc328 = loc("tmp25"(#loc100)) +#loc329 = loc("tmp25"(#loc101)) +#loc330 = loc("tmp25"(#loc102)) +#loc331 = loc("tmp25"(#loc103)) +#loc332 = loc("tmp25"(#loc104)) +#loc333 = loc("tmp27"(#loc105)) +#loc334 = loc("tmp29"(#loc106)) +#loc335 = loc("tmp30"(#loc107)) +#loc336 = loc("tmp31"(#loc108)) +#loc337 = loc("tmp32"(#loc109)) +#loc338 = loc("tmp33"(#loc110)) +#loc339 = loc("tmp34"(#loc111)) +#loc340 = loc("tmp35"(#loc112)) +#loc341 = loc("tmp35"(#loc113)) +#loc342 = loc("tmp35"(#loc114)) +#loc343 = loc("tmp35"(#loc115)) +#loc344 = loc("tmp35"(#loc116)) +#loc345 = loc("tmp35"(#loc117)) +#loc346 = loc("tmp35"(#loc118)) +#loc347 = loc("tmp35"(#loc119)) +#loc348 = loc("tmp35"(#loc120)) +#loc349 = loc("tmp37"(#loc121)) +#loc350 = loc("tmp38"(#loc122)) +#loc351 = loc("tmp39"(#loc123)) +#loc352 = loc("tmp40"(#loc124)) +#loc353 = loc("tmp41"(#loc125)) +#loc354 = loc("tmp42"(#loc126)) +#loc355 = loc("tmp43"(#loc127)) +#loc356 = loc("tmp43"(#loc128)) +#loc357 = loc("tmp43"(#loc129)) +#loc358 = loc("tmp43"(#loc130)) +#loc359 = loc("tmp43"(#loc131)) +#loc360 = loc("tmp43"(#loc132)) +#loc361 = loc("tmp45"(#loc133)) +#loc362 = loc("tmp47"(#loc134)) +#loc363 = loc("tmp48"(#loc135)) +#loc364 = loc("tmp49"(#loc136)) +#loc365 = loc("tmp52"(#loc137)) +#loc366 = loc("tmp53"(#loc138)) +#loc367 = loc("tmp54"(#loc139)) +#loc368 = loc("tmp55"(#loc140)) +#loc369 = loc("tmp56"(#loc141)) +#loc370 = loc("tmp57"(#loc142)) +#loc371 = loc("tmp60"(#loc143)) +#loc372 = loc("tmp64"(#loc144)) +#loc373 = loc("tmp67"(#loc145)) +#loc374 = loc("tmp68"(#loc146)) +#loc375 = loc("tmp70"(#loc147)) +#loc376 = loc("tmp70"(#loc148)) +#loc377 = loc("tmp70"(#loc149)) +#loc378 = loc("tmp70"(#loc150)) +#loc379 = loc("tmp70"(#loc151)) +#loc380 = loc("tmp70"(#loc152)) +#loc381 = loc("tmp70"(#loc153)) +#loc382 = loc("tmp70"(#loc154)) +#loc383 = loc("tmp70"(#loc155)) +#loc384 = loc("tmp70"(#loc156)) +#loc385 = loc("tmp72"(#loc157)) +#loc386 = loc("tmp73"(#loc158)) +#loc387 = loc("tmp74"(#loc159)) +#loc388 = loc("tmp75"(#loc160)) +#loc389 = loc("tmp76"(#loc161)) +#loc390 = loc("tmp76"(#loc162)) +#loc391 = loc("tmp76"(#loc163)) +#loc392 = loc("tmp76"(#loc164)) +#loc393 = loc("tmp76"(#loc165)) +#loc394 = loc("tmp76"(#loc166)) +#loc395 = loc("tmp76"(#loc167)) +#loc396 = loc("tmp78"(#loc168)) +#loc397 = loc("tmp80"(#loc169)) +#loc398 = loc("tmp81"(#loc170)) +#loc399 = loc("tmp82"(#loc171)) +#loc400 = loc("tmp83"(#loc172)) +#loc401 = loc("tmp83"(#loc173)) +#loc402 = loc("tmp83"(#loc174)) +#loc403 = loc("tmp83"(#loc175)) +#loc404 = loc("tmp83"(#loc176)) +#loc405 = loc("tmp83"(#loc177)) +#loc406 = loc("tmp83"(#loc178)) +#loc407 = loc("tmp83"(#loc179)) +#loc408 = loc("tmp83"(#loc180)) +#loc409 = loc("tmp83"(#loc181)) +#loc410 = loc("tmp85"(#loc182)) +#loc411 = loc("tmp86"(#loc183)) +#loc412 = loc("tmp87"(#loc184)) +#loc413 = loc("tmp88"(#loc185)) +#loc414 = loc("tmp89"(#loc186)) +#loc415 = loc("tmp89"(#loc187)) +#loc416 = loc("tmp89"(#loc188)) +#loc417 = loc("tmp89"(#loc189)) +#loc418 = loc("tmp89"(#loc190)) +#loc419 = loc("tmp89"(#loc191)) +#loc420 = loc("tmp91"(#loc192)) +#loc421 = loc("tmp93"(#loc193)) +#loc422 = loc("tmp94"(#loc194)) +#loc423 = loc("tmp95"(#loc195)) +#loc424 = loc("tmp98"(#loc196)) +#loc425 = loc("tmp99"(#loc197)) +#loc426 = loc("tmp100"(#loc198)) +#loc427 = loc("tmp101"(#loc199)) +#loc428 = loc("tmp104"(#loc200)) +#loc429 = loc("tmp107"(#loc201)) +#loc430 = loc("tmp109"(#loc202)) +#loc431 = loc("tmp110"(#loc203)) +#loc435 = loc("_tmp10"(#loc245)) diff --git a/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..8115df19745abae3957c66ab1adb50400f930743 --- /dev/null +++ b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir @@ -0,0 +1,547 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc1 = loc(unknown) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc147 = loc("in_out_ptr0"(#loc)) +#loc148 = loc("in_out_ptr1"(#loc)) +#loc149 = loc("in_ptr0"(#loc)) +#loc150 = loc("in_ptr1"(#loc)) +#loc151 = loc("in_ptr2"(#loc)) +#loc152 = loc("in_ptr3"(#loc)) +#loc153 = loc("in_ptr4"(#loc)) +#loc154 = loc("xnumel"(#loc)) +#loc155 = loc("r0_numel"(#loc)) +#loc185 = loc("tmp4"(#loc33)) +#loc187 = loc("tmp10"(#loc36)) +#loc292 = loc(callsite(#loc1 at #loc185)) +#loc294 = loc(callsite(#loc1 at #loc187)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4097> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x64xbf16, #blocked1> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<1> : tensor<1x64xi64, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<2> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<36864> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<36864> : tensor<4x1xi32, #blocked1> loc(#loc1) + %cst_6 = arith.constant dense<128> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<128> : tensor<4x1xi32, #blocked1> loc(#loc1) + %cst_8 = arith.constant dense<4096> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_9 = arith.constant dense<4096> : tensor<1x64xi32, #blocked1> loc(#loc1) + %cst_10 = arith.constant dense<128> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_11 = arith.constant dense<128> : tensor<1x64xi32, #blocked1> loc(#loc1) + %cst_12 = arith.constant dense<32> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_13 = arith.constant dense<32> : tensor<4x1xi32, #blocked1> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %cst_14 = arith.constant dense<0.000000e+00> : tensor<4x64xbf16, #blocked1> loc(#loc1) + %cst_15 = arith.constant dense<0.000000e+00> : tensor<4x64xbf16, #blocked> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_16 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32, #blocked1> loc(#loc1) + %cst_17 = arith.constant dense<1.280000e+02> : tensor<4x1xf32, #blocked1> loc(#loc1) + %cst_18 = arith.constant dense<0.000000e+00> : tensor<4x64xf32, #blocked> loc(#loc1) + %cst_19 = arith.constant dense<0.000000e+00> : tensor<4x64xf32, #blocked1> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc156) + %xoffset_20 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc157) + %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc158) + %xindex_21 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc158) + %xindex_22 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc158) + %xindex_23 = tt.expand_dims %xindex_21 {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4x1xi32, #blocked> loc(#loc158) + %xindex_24 = tt.splat %xoffset_20 : i32 -> tensor<4x1xi32, #blocked1> loc(#loc159) + %xindex_25 = tt.splat %xoffset_20 : i32 -> tensor<4x1xi32, #blocked> loc(#loc159) + %xindex_26 = arith.addi %xindex_24, %xindex_22 : tensor<4x1xi32, #blocked1> loc(#loc159) + %xindex_27 = arith.addi %xindex_25, %xindex_23 : tensor<4x1xi32, #blocked> loc(#loc159) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc160) + %r0_base_28 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc160) + %r0_base_29 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc160) + %r0_base_30 = tt.expand_dims %r0_base_28 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> loc(#loc160) + %x0 = arith.remsi %xindex_26, %cst_13 : tensor<4x1xi32, #blocked1> loc(#loc161) + %x0_31 = arith.remsi %xindex_27, %cst_12 : tensor<4x1xi32, #blocked> loc(#loc161) + %x1 = arith.divsi %xindex_26, %cst_13 : tensor<4x1xi32, #blocked1> loc(#loc162) + %x1_32 = arith.divsi %xindex_27, %cst_12 : tensor<4x1xi32, #blocked> loc(#loc162) + %tmp0 = arith.muli %x0, %cst_7 : tensor<4x1xi32, #blocked1> loc(#loc163) + %tmp0_33 = tt.broadcast %tmp0 : tensor<4x1xi32, #blocked1> -> tensor<4x64xi32, #blocked1> loc(#loc164) + %tmp0_34 = arith.muli %x1, %cst_5 : tensor<4x1xi32, #blocked1> loc(#loc165) + %tmp0_35 = tt.broadcast %tmp0_34 : tensor<4x1xi32, #blocked1> -> tensor<4x64xi32, #blocked1> loc(#loc166) + %tmp0_36 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x64x!tt.ptr, #blocked1> loc(#loc167) + %_tmp10:2 = scf.for %_tmp10_51 = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%arg10 = %cst_19, %arg11 = %cst_19) -> (tensor<4x64xf32, #blocked1>, tensor<4x64xf32, #blocked1>) : i32 { + %r0_index = tt.splat %_tmp10_51 : i32 -> tensor<1x64xi32, #blocked1> loc(#loc169) + %r0_index_52 = arith.addi %r0_index, %r0_base_29 : tensor<1x64xi32, #blocked1> loc(#loc169) + %r0_mask = arith.cmpi slt, %r0_index_52, %cst_11 : tensor<1x64xi32, #blocked1> loc(#loc170) + %tmp0_53 = arith.addi %r0_index_52, %cst_9 : tensor<1x64xi32, #blocked1> loc(#loc171) + %tmp0_54 = tt.broadcast %tmp0_53 : tensor<1x64xi32, #blocked1> -> tensor<4x64xi32, #blocked1> loc(#loc164) + %tmp0_55 = arith.addi %tmp0_54, %tmp0_33 : tensor<4x64xi32, #blocked1> loc(#loc164) + %tmp0_56 = arith.addi %tmp0_55, %tmp0_35 : tensor<4x64xi32, #blocked1> loc(#loc166) + %tmp0_57 = tt.addptr %tmp0_36, %tmp0_56 : tensor<4x64x!tt.ptr, #blocked1>, tensor<4x64xi32, #blocked1> loc(#loc167) + %tmp0_58 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked1> -> tensor<4x64xi1, #blocked1> loc(#loc172) + %tmp0_59 = tt.load %tmp0_57, %tmp0_58, %cst_14 evictionPolicy = evict_last : tensor<4x64x!tt.ptr, #blocked1> loc(#loc172) + %tmp0_60 = arith.extf %tmp0_59 : tensor<4x64xbf16, #blocked1> to tensor<4x64xf32, #blocked1> loc(#loc173) + %tmp6 = tt.broadcast %r0_index_52 : tensor<1x64xi32, #blocked1> -> tensor<4x64xi32, #blocked1> loc(#loc174) + %tmp6_61 = arith.addi %tmp6, %tmp0_33 : tensor<4x64xi32, #blocked1> loc(#loc174) + %tmp6_62 = arith.addi %tmp6_61, %tmp0_35 : tensor<4x64xi32, #blocked1> loc(#loc175) + %tmp6_63 = tt.addptr %tmp0_36, %tmp6_62 : tensor<4x64x!tt.ptr, #blocked1>, tensor<4x64xi32, #blocked1> loc(#loc176) + %tmp6_64 = tt.load %tmp6_63, %tmp0_58, %cst_14 evictionPolicy = evict_last : tensor<4x64x!tt.ptr, #blocked1> loc(#loc177) + %tmp6_65 = arith.extf %tmp6_64 : tensor<4x64xbf16, #blocked1> to tensor<4x64xf32, #blocked1> loc(#loc178) + %tmp2 = arith.mulf %tmp0_60, %tmp0_60 : tensor<4x64xf32, #blocked1> loc(#loc179) + %tmp5 = arith.addf %arg10, %tmp2 : tensor<4x64xf32, #blocked1> loc(#loc180) + %_tmp4 = arith.select %tmp0_58, %tmp5, %arg10 : tensor<4x64xi1, #blocked1>, tensor<4x64xf32, #blocked1> loc(#loc181) + %tmp8 = arith.mulf %tmp6_65, %tmp6_65 : tensor<4x64xf32, #blocked1> loc(#loc182) + %tmp11 = arith.addf %arg11, %tmp8 : tensor<4x64xf32, #blocked1> loc(#loc183) + %_tmp10_66 = arith.select %tmp0_58, %tmp11, %arg11 : tensor<4x64xi1, #blocked1>, tensor<4x64xf32, #blocked1> loc(#loc184) + scf.yield %_tmp4, %_tmp10_66 : tensor<4x64xf32, #blocked1>, tensor<4x64xf32, #blocked1> loc(#loc31) + } loc(#loc290) + %tmp4 = "tt.reduce"(%_tmp10#0) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_51: f32 loc(callsite(#loc1 at #loc185)), %tmp4_52: f32 loc(callsite(#loc1 at #loc185))): + %tmp4_53 = arith.addf %tmp4_51, %tmp4_52 : f32 loc(#loc297) + tt.reduce.return %tmp4_53 : f32 loc(#loc291) + }) : (tensor<4x64xf32, #blocked1>) -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc291) + %tmp4_37 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xf32, #blocked1> loc(#loc186) + %tmp10 = "tt.reduce"(%_tmp10#1) <{axis = 1 : i32}> ({ + ^bb0(%tmp10_51: f32 loc(callsite(#loc1 at #loc187)), %tmp10_52: f32 loc(callsite(#loc1 at #loc187))): + %tmp10_53 = arith.addf %tmp10_51, %tmp10_52 : f32 loc(#loc298) + tt.reduce.return %tmp10_53 : f32 loc(#loc293) + }) : (tensor<4x64xf32, #blocked1>) -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc293) + %tmp10_38 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xf32, #blocked1> loc(#loc188) + %tmp50 = arith.muli %x0_31, %cst_6 : tensor<4x1xi32, #blocked> loc(#loc189) + %tmp50_39 = tt.broadcast %tmp50 : tensor<4x1xi32, #blocked> -> tensor<4x64xi32, #blocked> loc(#loc190) + %tmp50_40 = arith.muli %x1_32, %cst_4 : tensor<4x1xi32, #blocked> loc(#loc191) + %tmp50_41 = tt.broadcast %tmp50_40 : tensor<4x1xi32, #blocked> -> tensor<4x64xi32, #blocked> loc(#loc192) + %tmp50_42 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x64x!tt.ptr, #blocked> loc(#loc193) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked> loc(#loc194) + %tmp58_43 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked1> loc(#loc194) + %tmp63 = arith.muli %x1, %cst_7 : tensor<4x1xi32, #blocked1> loc(#loc195) + %tmp63_44 = tt.broadcast %tmp63 : tensor<4x1xi32, #blocked1> -> tensor<4x64xi32, #blocked1> loc(#loc196) + %tmp63_45 = tt.splat %in_ptr2 : !tt.ptr -> tensor<4x64x!tt.ptr, #blocked1> loc(#loc197) + %tmp66 = tt.splat %in_ptr3 : !tt.ptr -> tensor<4x64x!tt.ptr, #blocked1> loc(#loc198) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked> loc(#loc199) + %tmp102_46 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked1> loc(#loc199) + %tmp20 = arith.divf %tmp10_38, %cst_17 : tensor<4x1xf32, #blocked1> loc(#loc200) + %tmp22 = arith.addf %tmp20, %cst_16 : tensor<4x1xf32, #blocked1> loc(#loc201) + %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32, #blocked1>) -> tensor<4x1xf32, #blocked1> loc(#loc202) + %tmp24 = ttg.convert_layout %tmp23 : tensor<4x1xf32, #blocked1> -> tensor<4x1xf32, #blocked> loc(#loc203) + %tmp24_47 = tt.broadcast %tmp24 : tensor<4x1xf32, #blocked> -> tensor<4x64xf32, #blocked> loc(#loc203) + %tmp24_48 = tt.broadcast %tmp23 : tensor<4x1xf32, #blocked1> -> tensor<4x64xf32, #blocked1> loc(#loc203) + %tmp72 = arith.divf %tmp4_37, %cst_17 : tensor<4x1xf32, #blocked1> loc(#loc204) + %tmp73 = arith.addf %tmp72, %cst_16 : tensor<4x1xf32, #blocked1> loc(#loc205) + %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32, #blocked1>) -> tensor<4x1xf32, #blocked1> loc(#loc206) + %tmp75 = ttg.convert_layout %tmp74 : tensor<4x1xf32, #blocked1> -> tensor<4x1xf32, #blocked> loc(#loc207) + %tmp75_49 = tt.broadcast %tmp75 : tensor<4x1xf32, #blocked> -> tensor<4x64xf32, #blocked> loc(#loc207) + %tmp75_50 = tt.broadcast %tmp74 : tensor<4x1xf32, #blocked1> -> tensor<4x64xf32, #blocked1> loc(#loc207) + %0 = arith.muli %xindex_26, %cst_7 : tensor<4x1xi32, #blocked1> loc(#loc57) + %1 = tt.broadcast %0 : tensor<4x1xi32, #blocked1> -> tensor<4x64xi32, #blocked1> loc(#loc58) + %2 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<4x64x!tt.ptr, #blocked1> loc(#loc59) + %3 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<4x64x!tt.ptr, #blocked1> loc(#loc60) + scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32, #blocked1> loc(#loc208) + %r0_index_51 = tt.splat %r0_offset : i32 -> tensor<1x64xi32, #blocked> loc(#loc208) + %r0_index_52 = arith.addi %r0_index, %r0_base_29 : tensor<1x64xi32, #blocked1> loc(#loc208) + %r0_index_53 = arith.addi %r0_index_51, %r0_base_30 : tensor<1x64xi32, #blocked> loc(#loc208) + %r0_mask = arith.cmpi slt, %r0_index_52, %cst_11 : tensor<1x64xi32, #blocked1> loc(#loc209) + %r0_mask_54 = arith.cmpi slt, %r0_index_53, %cst_10 : tensor<1x64xi32, #blocked> loc(#loc209) + %r0_3 = arith.remsi %r0_index_53, %cst_3 : tensor<1x64xi32, #blocked> loc(#loc210) + %r0_4 = arith.divsi %r0_index_53, %cst_3 : tensor<1x64xi32, #blocked> loc(#loc211) + %tmp50_55 = tt.broadcast %r0_index_52 : tensor<1x64xi32, #blocked1> -> tensor<4x64xi32, #blocked1> loc(#loc190) + %tmp50_56 = arith.addi %tmp50_55, %tmp0_33 : tensor<4x64xi32, #blocked1> loc(#loc190) + %tmp50_57 = arith.addi %tmp50_56, %tmp0_35 : tensor<4x64xi32, #blocked1> loc(#loc192) + %tmp50_58 = tt.addptr %tmp0_36, %tmp50_57 : tensor<4x64x!tt.ptr, #blocked1>, tensor<4x64xi32, #blocked1> loc(#loc193) + %tmp50_59 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked1> -> tensor<4x64xi1, #blocked1> loc(#loc212) + %tmp50_60 = tt.load %tmp50_58, %tmp50_59, %cst_14 evictionPolicy = evict_last : tensor<4x64x!tt.ptr, #blocked1> loc(#loc212) + %tmp50_61 = arith.extf %tmp50_60 : tensor<4x64xbf16, #blocked1> to tensor<4x64xf32, #blocked1> loc(#loc213) + %tmp58_62 = tt.addptr %tmp58_43, %r0_index_52 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc194) + %tmp58_63 = tt.load %tmp58_62, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x64x!tt.ptr, #blocked1> loc(#loc214) + %tmp58_64 = arith.extf %tmp58_63 : tensor<1x64xbf16, #blocked1> to tensor<1x64xf32, #blocked1> loc(#loc215) + %tmp63_65 = arith.addi %tmp50_55, %tmp63_44 : tensor<4x64xi32, #blocked1> loc(#loc196) + %tmp63_66 = tt.addptr %tmp63_45, %tmp63_65 : tensor<4x64x!tt.ptr, #blocked1>, tensor<4x64xi32, #blocked1> loc(#loc197) + %tmp63_67 = tt.load %tmp63_66, %tmp50_59, %cst_19 evictionPolicy = evict_last : tensor<4x64x!tt.ptr, #blocked1> loc(#loc216) + %tmp66_68 = tt.addptr %tmp66, %tmp63_65 : tensor<4x64x!tt.ptr, #blocked1>, tensor<4x64xi32, #blocked1> loc(#loc198) + %tmp66_69 = tt.load %tmp66_68, %tmp50_59, %cst_19 evictionPolicy = evict_last : tensor<4x64x!tt.ptr, #blocked1> loc(#loc217) + %tmp66_70 = ttg.convert_layout %tmp66_69 : tensor<4x64xf32, #blocked1> -> tensor<4x64xf32, #blocked> loc(#loc217) + %tmp96 = arith.addi %r0_index_52, %cst_9 : tensor<1x64xi32, #blocked1> loc(#loc218) + %tmp96_71 = tt.broadcast %tmp96 : tensor<1x64xi32, #blocked1> -> tensor<4x64xi32, #blocked1> loc(#loc219) + %tmp96_72 = arith.addi %tmp96_71, %tmp0_33 : tensor<4x64xi32, #blocked1> loc(#loc219) + %tmp96_73 = arith.addi %tmp96_72, %tmp0_35 : tensor<4x64xi32, #blocked1> loc(#loc220) + %tmp96_74 = tt.addptr %tmp0_36, %tmp96_73 : tensor<4x64x!tt.ptr, #blocked1>, tensor<4x64xi32, #blocked1> loc(#loc221) + %tmp96_75 = tt.load %tmp96_74, %tmp50_59, %cst_14 evictionPolicy = evict_first : tensor<4x64x!tt.ptr, #blocked1> loc(#loc222) + %tmp96_76 = arith.extf %tmp96_75 : tensor<4x64xbf16, #blocked1> to tensor<4x64xf32, #blocked1> loc(#loc223) + %tmp102_77 = tt.addptr %tmp102_46, %r0_index_52 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc199) + %tmp102_78 = tt.load %tmp102_77, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x64x!tt.ptr, #blocked1> loc(#loc224) + %tmp102_79 = arith.extf %tmp102_78 : tensor<1x64xbf16, #blocked1> to tensor<1x64xf32, #blocked1> loc(#loc225) + %tmp16 = arith.extsi %r0_3 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked> loc(#loc226) + %tmp16_80 = arith.cmpi slt, %tmp16, %cst_2 : tensor<1x64xi64, #blocked> loc(#loc226) + %tmp17 = arith.muli %r0_4, %cst_3 : tensor<1x64xi32, #blocked> loc(#loc227) + %tmp17_81 = arith.addi %tmp17, %cst_1 : tensor<1x64xi32, #blocked> loc(#loc228) + %tmp17_82 = tt.broadcast %tmp17_81 : tensor<1x64xi32, #blocked> -> tensor<4x64xi32, #blocked> loc(#loc229) + %tmp17_83 = arith.addi %tmp17_82, %tmp50_39 : tensor<4x64xi32, #blocked> loc(#loc229) + %tmp17_84 = arith.addi %tmp17_83, %tmp50_41 : tensor<4x64xi32, #blocked> loc(#loc230) + %tmp17_85 = tt.addptr %tmp50_42, %tmp17_84 : tensor<4x64x!tt.ptr, #blocked>, tensor<4x64xi32, #blocked> loc(#loc231) + %tmp17_86 = arith.andi %r0_mask_54, %tmp16_80 : tensor<1x64xi1, #blocked> loc(#loc232) + %tmp17_87 = tt.broadcast %tmp17_86 : tensor<1x64xi1, #blocked> -> tensor<4x64xi1, #blocked> loc(#loc233) + %tmp17_88 = tt.load %tmp17_85, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<4x64x!tt.ptr, #blocked> loc(#loc233) + %tmp17_89 = arith.extf %tmp17_88 : tensor<4x64xbf16, #blocked> to tensor<4x64xf32, #blocked> loc(#loc234) + %tmp24_90 = arith.mulf %tmp17_89, %tmp24_47 : tensor<4x64xf32, #blocked> loc(#loc203) + %tmp25 = tt.addptr %tmp58, %tmp17_81 : tensor<1x64x!tt.ptr, #blocked>, tensor<1x64xi32, #blocked> loc(#loc235) + %tmp25_91 = tt.broadcast %tmp25 : tensor<1x64x!tt.ptr, #blocked> -> tensor<4x64x!tt.ptr, #blocked> loc(#loc235) + %tmp25_92 = tt.load %tmp25_91, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<4x64x!tt.ptr, #blocked> loc(#loc236) + %tmp25_93 = arith.extf %tmp25_92 : tensor<4x64xbf16, #blocked> to tensor<4x64xf32, #blocked> loc(#loc237) + %tmp27 = arith.mulf %tmp24_90, %tmp25_93 : tensor<4x64xf32, #blocked> loc(#loc238) + %tmp29 = arith.subf %cst_18, %tmp27 : tensor<4x64xf32, #blocked> loc(#loc239) + %tmp31 = tt.broadcast %tmp16_80 : tensor<1x64xi1, #blocked> -> tensor<4x64xi1, #blocked> loc(#loc240) + %tmp32 = arith.cmpi sge, %tmp16, %cst_2 : tensor<1x64xi64, #blocked> loc(#loc241) + %tmp35 = tt.broadcast %tmp17 : tensor<1x64xi32, #blocked> -> tensor<4x64xi32, #blocked> loc(#loc242) + %tmp35_94 = arith.addi %tmp35, %tmp50_39 : tensor<4x64xi32, #blocked> loc(#loc242) + %tmp35_95 = arith.addi %tmp35_94, %tmp50_41 : tensor<4x64xi32, #blocked> loc(#loc243) + %tmp35_96 = tt.addptr %tmp50_42, %tmp35_95 : tensor<4x64x!tt.ptr, #blocked>, tensor<4x64xi32, #blocked> loc(#loc244) + %tmp35_97 = arith.andi %r0_mask_54, %tmp32 : tensor<1x64xi1, #blocked> loc(#loc245) + %tmp35_98 = tt.broadcast %tmp35_97 : tensor<1x64xi1, #blocked> -> tensor<4x64xi1, #blocked> loc(#loc246) + %tmp35_99 = tt.load %tmp35_96, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<4x64x!tt.ptr, #blocked> loc(#loc246) + %tmp35_100 = arith.extf %tmp35_99 : tensor<4x64xbf16, #blocked> to tensor<4x64xf32, #blocked> loc(#loc247) + %tmp42 = arith.mulf %tmp35_100, %tmp24_47 : tensor<4x64xf32, #blocked> loc(#loc248) + %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x64x!tt.ptr, #blocked>, tensor<1x64xi32, #blocked> loc(#loc249) + %tmp43_101 = tt.broadcast %tmp43 : tensor<1x64x!tt.ptr, #blocked> -> tensor<4x64x!tt.ptr, #blocked> loc(#loc249) + %tmp43_102 = tt.load %tmp43_101, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<4x64x!tt.ptr, #blocked> loc(#loc250) + %tmp43_103 = arith.extf %tmp43_102 : tensor<4x64xbf16, #blocked> to tensor<4x64xf32, #blocked> loc(#loc251) + %tmp45 = arith.mulf %tmp42, %tmp43_103 : tensor<4x64xf32, #blocked> loc(#loc252) + %tmp48 = tt.broadcast %tmp32 : tensor<1x64xi1, #blocked> -> tensor<4x64xi1, #blocked> loc(#loc253) + %tmp48_104 = arith.select %tmp48, %tmp45, %cst_18 : tensor<4x64xi1, #blocked>, tensor<4x64xf32, #blocked> loc(#loc253) + %tmp49 = arith.select %tmp31, %tmp29, %tmp48_104 : tensor<4x64xi1, #blocked>, tensor<4x64xf32, #blocked> loc(#loc295) + %tmp57 = arith.mulf %tmp50_61, %tmp24_48 : tensor<4x64xf32, #blocked1> loc(#loc255) + %tmp60 = tt.broadcast %tmp58_64 : tensor<1x64xf32, #blocked1> -> tensor<4x64xf32, #blocked1> loc(#loc256) + %tmp60_105 = arith.mulf %tmp57, %tmp60 : tensor<4x64xf32, #blocked1> loc(#loc256) + %tmp64 = arith.mulf %tmp60_105, %tmp63_67 : tensor<4x64xf32, #blocked1> loc(#loc257) + %tmp64_106 = ttg.convert_layout %tmp64 : tensor<4x64xf32, #blocked1> -> tensor<4x64xf32, #blocked> loc(#loc257) + %tmp67 = arith.mulf %tmp49, %tmp66_70 : tensor<4x64xf32, #blocked> loc(#loc258) + %tmp68 = arith.addf %tmp64_106, %tmp67 : tensor<4x64xf32, #blocked> loc(#loc259) + %tmp70 = arith.addi %tmp17, %cst : tensor<1x64xi32, #blocked> loc(#loc260) + %tmp70_107 = tt.broadcast %tmp70 : tensor<1x64xi32, #blocked> -> tensor<4x64xi32, #blocked> loc(#loc261) + %tmp70_108 = arith.addi %tmp70_107, %tmp50_39 : tensor<4x64xi32, #blocked> loc(#loc261) + %tmp70_109 = arith.addi %tmp70_108, %tmp50_41 : tensor<4x64xi32, #blocked> loc(#loc262) + %tmp70_110 = tt.addptr %tmp50_42, %tmp70_109 : tensor<4x64x!tt.ptr, #blocked>, tensor<4x64xi32, #blocked> loc(#loc263) + %tmp70_111 = tt.load %tmp70_110, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<4x64x!tt.ptr, #blocked> loc(#loc264) + %tmp70_112 = arith.extf %tmp70_111 : tensor<4x64xbf16, #blocked> to tensor<4x64xf32, #blocked> loc(#loc265) + %tmp75_113 = arith.mulf %tmp70_112, %tmp75_49 : tensor<4x64xf32, #blocked> loc(#loc207) + %tmp76 = tt.addptr %tmp102, %tmp17_81 : tensor<1x64x!tt.ptr, #blocked>, tensor<1x64xi32, #blocked> loc(#loc266) + %tmp76_114 = tt.broadcast %tmp76 : tensor<1x64x!tt.ptr, #blocked> -> tensor<4x64x!tt.ptr, #blocked> loc(#loc266) + %tmp76_115 = tt.load %tmp76_114, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<4x64x!tt.ptr, #blocked> loc(#loc267) + %tmp76_116 = arith.extf %tmp76_115 : tensor<4x64xbf16, #blocked> to tensor<4x64xf32, #blocked> loc(#loc268) + %tmp78 = arith.mulf %tmp75_113, %tmp76_116 : tensor<4x64xf32, #blocked> loc(#loc269) + %tmp80 = arith.subf %cst_18, %tmp78 : tensor<4x64xf32, #blocked> loc(#loc270) + %tmp83 = arith.addi %tmp17, %cst_8 : tensor<1x64xi32, #blocked> loc(#loc271) + %tmp83_117 = tt.broadcast %tmp83 : tensor<1x64xi32, #blocked> -> tensor<4x64xi32, #blocked> loc(#loc272) + %tmp83_118 = arith.addi %tmp83_117, %tmp50_39 : tensor<4x64xi32, #blocked> loc(#loc272) + %tmp83_119 = arith.addi %tmp83_118, %tmp50_41 : tensor<4x64xi32, #blocked> loc(#loc273) + %tmp83_120 = tt.addptr %tmp50_42, %tmp83_119 : tensor<4x64x!tt.ptr, #blocked>, tensor<4x64xi32, #blocked> loc(#loc274) + %tmp83_121 = tt.load %tmp83_120, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<4x64x!tt.ptr, #blocked> loc(#loc275) + %tmp83_122 = arith.extf %tmp83_121 : tensor<4x64xbf16, #blocked> to tensor<4x64xf32, #blocked> loc(#loc276) + %tmp88 = arith.mulf %tmp83_122, %tmp75_49 : tensor<4x64xf32, #blocked> loc(#loc277) + %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x64x!tt.ptr, #blocked>, tensor<1x64xi32, #blocked> loc(#loc278) + %tmp89_123 = tt.broadcast %tmp89 : tensor<1x64x!tt.ptr, #blocked> -> tensor<4x64x!tt.ptr, #blocked> loc(#loc278) + %tmp89_124 = tt.load %tmp89_123, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<4x64x!tt.ptr, #blocked> loc(#loc279) + %tmp89_125 = arith.extf %tmp89_124 : tensor<4x64xbf16, #blocked> to tensor<4x64xf32, #blocked> loc(#loc280) + %tmp91 = arith.mulf %tmp88, %tmp89_125 : tensor<4x64xf32, #blocked> loc(#loc281) + %tmp94 = arith.select %tmp48, %tmp91, %cst_18 : tensor<4x64xi1, #blocked>, tensor<4x64xf32, #blocked> loc(#loc282) + %tmp95 = arith.select %tmp31, %tmp80, %tmp94 : tensor<4x64xi1, #blocked>, tensor<4x64xf32, #blocked> loc(#loc296) + %tmp101 = arith.mulf %tmp96_76, %tmp75_50 : tensor<4x64xf32, #blocked1> loc(#loc285) + %tmp104 = tt.broadcast %tmp102_79 : tensor<1x64xf32, #blocked1> -> tensor<4x64xf32, #blocked1> loc(#loc286) + %tmp104_126 = arith.mulf %tmp101, %tmp104 : tensor<4x64xf32, #blocked1> loc(#loc286) + %tmp107 = arith.mulf %tmp104_126, %tmp63_67 : tensor<4x64xf32, #blocked1> loc(#loc287) + %tmp107_127 = ttg.convert_layout %tmp107 : tensor<4x64xf32, #blocked1> -> tensor<4x64xf32, #blocked> loc(#loc287) + %tmp109 = arith.mulf %tmp95, %tmp66_70 : tensor<4x64xf32, #blocked> loc(#loc288) + %tmp110 = arith.addf %tmp107_127, %tmp109 : tensor<4x64xf32, #blocked> loc(#loc289) + %4 = arith.addi %tmp50_55, %1 : tensor<4x64xi32, #blocked1> loc(#loc58) + %5 = tt.addptr %2, %4 : tensor<4x64x!tt.ptr, #blocked1>, tensor<4x64xi32, #blocked1> loc(#loc59) + %6 = arith.truncf %tmp68 : tensor<4x64xf32, #blocked> to tensor<4x64xbf16, #blocked> loc(#loc144) + %7 = ttg.convert_layout %6 : tensor<4x64xbf16, #blocked> -> tensor<4x64xbf16, #blocked1> loc(#loc144) + tt.store %5, %7, %tmp50_59 : tensor<4x64x!tt.ptr, #blocked1> loc(#loc144) + %8 = tt.addptr %3, %4 : tensor<4x64x!tt.ptr, #blocked1>, tensor<4x64xi32, #blocked1> loc(#loc60) + %9 = arith.truncf %tmp110 : tensor<4x64xf32, #blocked> to tensor<4x64xbf16, #blocked> loc(#loc145) + %10 = ttg.convert_layout %9 : tensor<4x64xbf16, #blocked> -> tensor<4x64xbf16, #blocked1> loc(#loc145) + tt.store %8, %10, %tmp50_59 : tensor<4x64x!tt.ptr, #blocked1> loc(#loc145) + } loc(#loc61) + tt.return loc(#loc146) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8) +#loc32 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc156 = loc("xoffset"(#loc2)) +#loc157 = loc("xoffset"(#loc3)) +#loc158 = loc("xindex"(#loc4)) +#loc159 = loc("xindex"(#loc5)) +#loc160 = loc("r0_base"(#loc6)) +#loc161 = loc("x0"(#loc7)) +#loc162 = loc("x1"(#loc8)) +#loc163 = loc("tmp0"(#loc9)) +#loc164 = loc("tmp0"(#loc10)) +#loc165 = loc("tmp0"(#loc11)) +#loc166 = loc("tmp0"(#loc12)) +#loc167 = loc("tmp0"(#loc13)) +#loc168 = loc("_tmp4"(#loc14)) +#loc169 = loc("r0_index"(#loc15)) +#loc170 = loc("r0_mask"(#loc16)) +#loc171 = loc("tmp0"(#loc17)) +#loc172 = loc("tmp0"(#loc18)) +#loc173 = loc("tmp0"(#loc19)) +#loc174 = loc("tmp6"(#loc20)) +#loc175 = loc("tmp6"(#loc21)) +#loc176 = loc("tmp6"(#loc22)) +#loc177 = loc("tmp6"(#loc23)) +#loc178 = loc("tmp6"(#loc24)) +#loc179 = loc("tmp2"(#loc25)) +#loc180 = loc("tmp5"(#loc26)) +#loc181 = loc("_tmp4"(#loc27)) +#loc182 = loc("tmp8"(#loc28)) +#loc183 = loc("tmp11"(#loc29)) +#loc184 = loc("_tmp10"(#loc30)) +#loc186 = loc("tmp4"(#loc35)) +#loc188 = loc("tmp10"(#loc37)) +#loc189 = loc("tmp50"(#loc38)) +#loc190 = loc("tmp50"(#loc39)) +#loc191 = loc("tmp50"(#loc40)) +#loc192 = loc("tmp50"(#loc41)) +#loc193 = loc("tmp50"(#loc42)) +#loc194 = loc("tmp58"(#loc43)) +#loc195 = loc("tmp63"(#loc44)) +#loc196 = loc("tmp63"(#loc45)) +#loc197 = loc("tmp63"(#loc46)) +#loc198 = loc("tmp66"(#loc47)) +#loc199 = loc("tmp102"(#loc48)) +#loc200 = loc("tmp20"(#loc49)) +#loc201 = loc("tmp22"(#loc50)) +#loc202 = loc("tmp23"(#loc51)) +#loc203 = loc("tmp24"(#loc52)) +#loc204 = loc("tmp72"(#loc53)) +#loc205 = loc("tmp73"(#loc54)) +#loc206 = loc("tmp74"(#loc55)) +#loc207 = loc("tmp75"(#loc56)) +#loc208 = loc("r0_index"(#loc62)) +#loc209 = loc("r0_mask"(#loc63)) +#loc210 = loc("r0_3"(#loc64)) +#loc211 = loc("r0_4"(#loc65)) +#loc212 = loc("tmp50"(#loc66)) +#loc213 = loc("tmp50"(#loc67)) +#loc214 = loc("tmp58"(#loc68)) +#loc215 = loc("tmp58"(#loc69)) +#loc216 = loc("tmp63"(#loc70)) +#loc217 = loc("tmp66"(#loc71)) +#loc218 = loc("tmp96"(#loc72)) +#loc219 = loc("tmp96"(#loc73)) +#loc220 = loc("tmp96"(#loc74)) +#loc221 = loc("tmp96"(#loc75)) +#loc222 = loc("tmp96"(#loc76)) +#loc223 = loc("tmp96"(#loc77)) +#loc224 = loc("tmp102"(#loc78)) +#loc225 = loc("tmp102"(#loc79)) +#loc226 = loc("tmp16"(#loc80)) +#loc227 = loc("tmp17"(#loc81)) +#loc228 = loc("tmp17"(#loc82)) +#loc229 = loc("tmp17"(#loc83)) +#loc230 = loc("tmp17"(#loc84)) +#loc231 = loc("tmp17"(#loc85)) +#loc232 = loc("tmp17"(#loc86)) +#loc233 = loc("tmp17"(#loc87)) +#loc234 = loc("tmp17"(#loc88)) +#loc235 = loc("tmp25"(#loc89)) +#loc236 = loc("tmp25"(#loc90)) +#loc237 = loc("tmp25"(#loc91)) +#loc238 = loc("tmp27"(#loc92)) +#loc239 = loc("tmp29"(#loc93)) +#loc240 = loc("tmp31"(#loc94)) +#loc241 = loc("tmp32"(#loc95)) +#loc242 = loc("tmp35"(#loc96)) +#loc243 = loc("tmp35"(#loc97)) +#loc244 = loc("tmp35"(#loc98)) +#loc245 = loc("tmp35"(#loc99)) +#loc246 = loc("tmp35"(#loc100)) +#loc247 = loc("tmp35"(#loc101)) +#loc248 = loc("tmp42"(#loc102)) +#loc249 = loc("tmp43"(#loc103)) +#loc250 = loc("tmp43"(#loc104)) +#loc251 = loc("tmp43"(#loc105)) +#loc252 = loc("tmp45"(#loc106)) +#loc253 = loc("tmp48"(#loc107)) +#loc254 = loc("tmp49"(#loc108)) +#loc255 = loc("tmp57"(#loc109)) +#loc256 = loc("tmp60"(#loc110)) +#loc257 = loc("tmp64"(#loc111)) +#loc258 = loc("tmp67"(#loc112)) +#loc259 = loc("tmp68"(#loc113)) +#loc260 = loc("tmp70"(#loc114)) +#loc261 = loc("tmp70"(#loc115)) +#loc262 = loc("tmp70"(#loc116)) +#loc263 = loc("tmp70"(#loc117)) +#loc264 = loc("tmp70"(#loc118)) +#loc265 = loc("tmp70"(#loc119)) +#loc266 = loc("tmp76"(#loc120)) +#loc267 = loc("tmp76"(#loc121)) +#loc268 = loc("tmp76"(#loc122)) +#loc269 = loc("tmp78"(#loc123)) +#loc270 = loc("tmp80"(#loc124)) +#loc271 = loc("tmp83"(#loc125)) +#loc272 = loc("tmp83"(#loc126)) +#loc273 = loc("tmp83"(#loc127)) +#loc274 = loc("tmp83"(#loc128)) +#loc275 = loc("tmp83"(#loc129)) +#loc276 = loc("tmp83"(#loc130)) +#loc277 = loc("tmp88"(#loc131)) +#loc278 = loc("tmp89"(#loc132)) +#loc279 = loc("tmp89"(#loc133)) +#loc280 = loc("tmp89"(#loc134)) +#loc281 = loc("tmp91"(#loc135)) +#loc282 = loc("tmp94"(#loc136)) +#loc283 = loc("tmp95"(#loc137)) +#loc284 = loc("tmp82"(#loc138)) +#loc285 = loc("tmp101"(#loc139)) +#loc286 = loc("tmp104"(#loc140)) +#loc287 = loc("tmp107"(#loc141)) +#loc288 = loc("tmp109"(#loc142)) +#loc289 = loc("tmp110"(#loc143)) +#loc290 = loc("_tmp10"(#loc168)) +#loc291 = loc(callsite(#loc32 at #loc185)) +#loc293 = loc(callsite(#loc32 at #loc187)) +#loc295 = loc(fused[#loc254, #loc240]) +#loc296 = loc(fused[#loc283, #loc284]) +#loc297 = loc(callsite(#loc34 at #loc291)) +#loc298 = loc(callsite(#loc34 at #loc293)) diff --git a/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..f6dcf3e5eba5525eb089d70ff0a8d18a14ef8875 --- /dev/null +++ b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir @@ -0,0 +1,520 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc1 = loc(unknown) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc149 = loc("in_out_ptr0"(#loc)) +#loc150 = loc("in_out_ptr1"(#loc)) +#loc151 = loc("in_ptr0"(#loc)) +#loc152 = loc("in_ptr1"(#loc)) +#loc153 = loc("in_ptr2"(#loc)) +#loc154 = loc("in_ptr3"(#loc)) +#loc155 = loc("in_ptr4"(#loc)) +#loc156 = loc("xnumel"(#loc)) +#loc157 = loc("r0_numel"(#loc)) +#loc189 = loc("tmp4"(#loc35)) +#loc191 = loc("tmp10"(#loc38)) +#loc296 = loc(callsite(#loc1 at #loc189)) +#loc298 = loc(callsite(#loc1 at #loc191)) +module { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<1x64xbf16> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<4x64xbf16> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_1 = arith.constant dense<4097> : tensor<1x64xi32> loc(#loc1) + %cst_2 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc1) + %cst_3 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc1) + %cst_5 = arith.constant dense<1> : tensor<1x64xi64> loc(#loc1) + %cst_6 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc1) + %cst_7 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc1) + %cst_8 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc1) + %cst_9 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc1) + %cst_10 = arith.constant dense<128> : tensor<1x64xi32> loc(#loc1) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc1) + %cst_12 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc158) + %xoffset_13 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc159) + %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc160) + %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc161) + %xindex_15 = tt.splat %xoffset_13 : i32 -> tensor<4x1xi32> loc(#loc162) + %xindex_16 = arith.addi %xindex_15, %xindex_14 : tensor<4x1xi32> loc(#loc162) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc163) + %r0_base_17 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc164) + %x0 = arith.remsi %xindex_16, %cst_12 : tensor<4x1xi32> loc(#loc165) + %x1 = arith.divsi %xindex_16, %cst_12 : tensor<4x1xi32> loc(#loc166) + %_tmp10:2 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%_tmp4 = %cst_11, %_tmp10_20 = %cst_11) -> (tensor<4x64xf32>, tensor<4x64xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc168) + %r0_index_21 = arith.addi %r0_index, %r0_base_17 : tensor<1x64xi32> loc(#loc168) + %r0_mask = arith.cmpi slt, %r0_index_21, %cst_10 : tensor<1x64xi32> loc(#loc169) + %tmp0 = arith.addi %r0_index_21, %cst_9 : tensor<1x64xi32> loc(#loc170) + %tmp0_22 = arith.muli %x0, %cst_8 : tensor<4x1xi32> loc(#loc171) + %tmp0_23 = tt.broadcast %tmp0 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc172) + %tmp0_24 = tt.broadcast %tmp0_22 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc172) + %tmp0_25 = arith.addi %tmp0_23, %tmp0_24 : tensor<4x64xi32> loc(#loc172) + %tmp0_26 = arith.muli %x1, %cst_7 : tensor<4x1xi32> loc(#loc173) + %tmp0_27 = tt.broadcast %tmp0_26 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc174) + %tmp0_28 = arith.addi %tmp0_25, %tmp0_27 : tensor<4x64xi32> loc(#loc174) + %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x64x!tt.ptr> loc(#loc175) + %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc175) + %tmp0_31 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc176) + %tmp0_32 = tt.load %tmp0_30, %tmp0_31, %cst_0 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc176) + %tmp0_33 = arith.extf %tmp0_32 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc177) + %tmp6 = tt.broadcast %r0_index_21 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc178) + %tmp6_34 = arith.addi %tmp6, %tmp0_24 : tensor<4x64xi32> loc(#loc178) + %tmp6_35 = arith.addi %tmp6_34, %tmp0_27 : tensor<4x64xi32> loc(#loc179) + %tmp6_36 = tt.addptr %tmp0_29, %tmp6_35 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc180) + %tmp6_37 = tt.load %tmp6_36, %tmp0_31, %cst_0 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc181) + %tmp6_38 = arith.extf %tmp6_37 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc182) + %tmp2 = arith.mulf %tmp0_33, %tmp0_33 : tensor<4x64xf32> loc(#loc183) + %tmp5 = arith.addf %_tmp4, %tmp2 : tensor<4x64xf32> loc(#loc184) + %_tmp4_39 = arith.select %tmp0_31, %tmp5, %_tmp4 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc185) + %tmp8 = arith.mulf %tmp6_38, %tmp6_38 : tensor<4x64xf32> loc(#loc186) + %tmp11 = arith.addf %_tmp10_20, %tmp8 : tensor<4x64xf32> loc(#loc187) + %_tmp10_40 = arith.select %tmp0_31, %tmp11, %_tmp10_20 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc188) + scf.yield %_tmp4_39, %_tmp10_40 : tensor<4x64xf32>, tensor<4x64xf32> loc(#loc33) + } loc(#loc294) + %tmp4 = "tt.reduce"(%_tmp10#0) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_20: f32 loc(callsite(#loc1 at #loc189)), %tmp4_21: f32 loc(callsite(#loc1 at #loc189))): + %tmp4_22 = arith.addf %tmp4_20, %tmp4_21 : f32 loc(#loc299) + tt.reduce.return %tmp4_22 : f32 loc(#loc295) + }) : (tensor<4x64xf32>) -> tensor<4xf32> loc(#loc295) + %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc190) + %tmp10 = "tt.reduce"(%_tmp10#1) <{axis = 1 : i32}> ({ + ^bb0(%tmp10_20: f32 loc(callsite(#loc1 at #loc191)), %tmp10_21: f32 loc(callsite(#loc1 at #loc191))): + %tmp10_22 = arith.addf %tmp10_20, %tmp10_21 : f32 loc(#loc300) + tt.reduce.return %tmp10_22 : f32 loc(#loc297) + }) : (tensor<4x64xf32>) -> tensor<4xf32> loc(#loc297) + %tmp10_19 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc192) + scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc193) + %r0_index_20 = arith.addi %r0_index, %r0_base_17 : tensor<1x64xi32> loc(#loc193) + %r0_mask = arith.cmpi slt, %r0_index_20, %cst_10 : tensor<1x64xi32> loc(#loc194) + %r0_3 = arith.remsi %r0_index_20, %cst_6 : tensor<1x64xi32> loc(#loc195) + %r0_4 = arith.divsi %r0_index_20, %cst_6 : tensor<1x64xi32> loc(#loc196) + %tmp50 = arith.muli %x0, %cst_8 : tensor<4x1xi32> loc(#loc197) + %tmp50_21 = tt.broadcast %r0_index_20 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc198) + %tmp50_22 = tt.broadcast %tmp50 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc198) + %tmp50_23 = arith.addi %tmp50_21, %tmp50_22 : tensor<4x64xi32> loc(#loc198) + %tmp50_24 = arith.muli %x1, %cst_7 : tensor<4x1xi32> loc(#loc199) + %tmp50_25 = tt.broadcast %tmp50_24 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc200) + %tmp50_26 = arith.addi %tmp50_23, %tmp50_25 : tensor<4x64xi32> loc(#loc200) + %tmp50_27 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x64x!tt.ptr> loc(#loc201) + %tmp50_28 = tt.addptr %tmp50_27, %tmp50_26 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc201) + %tmp50_29 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc202) + %tmp50_30 = tt.load %tmp50_28, %tmp50_29, %cst_0 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc202) + %tmp50_31 = arith.extf %tmp50_30 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc203) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc204) + %tmp58_32 = tt.addptr %tmp58, %r0_index_20 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc204) + %tmp58_33 = tt.load %tmp58_32, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x64x!tt.ptr> loc(#loc205) + %tmp58_34 = arith.extf %tmp58_33 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc206) + %tmp63 = arith.muli %x1, %cst_8 : tensor<4x1xi32> loc(#loc207) + %tmp63_35 = tt.broadcast %tmp63 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc208) + %tmp63_36 = arith.addi %tmp50_21, %tmp63_35 : tensor<4x64xi32> loc(#loc208) + %tmp63_37 = tt.splat %in_ptr2 : !tt.ptr -> tensor<4x64x!tt.ptr> loc(#loc209) + %tmp63_38 = tt.addptr %tmp63_37, %tmp63_36 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc209) + %tmp63_39 = tt.load %tmp63_38, %tmp50_29, %cst_11 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc210) + %tmp66 = tt.splat %in_ptr3 : !tt.ptr -> tensor<4x64x!tt.ptr> loc(#loc211) + %tmp66_40 = tt.addptr %tmp66, %tmp63_36 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc211) + %tmp66_41 = tt.load %tmp66_40, %tmp50_29, %cst_11 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc212) + %tmp96 = arith.addi %r0_index_20, %cst_9 : tensor<1x64xi32> loc(#loc213) + %tmp96_42 = tt.broadcast %tmp96 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc214) + %tmp96_43 = arith.addi %tmp96_42, %tmp50_22 : tensor<4x64xi32> loc(#loc214) + %tmp96_44 = arith.addi %tmp96_43, %tmp50_25 : tensor<4x64xi32> loc(#loc215) + %tmp96_45 = tt.addptr %tmp50_27, %tmp96_44 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc216) + %tmp96_46 = tt.load %tmp96_45, %tmp50_29, %cst_0 evictionPolicy = evict_first : tensor<4x64x!tt.ptr> loc(#loc217) + %tmp96_47 = arith.extf %tmp96_46 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc218) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc219) + %tmp102_48 = tt.addptr %tmp102, %r0_index_20 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc219) + %tmp102_49 = tt.load %tmp102_48, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x64x!tt.ptr> loc(#loc220) + %tmp102_50 = arith.extf %tmp102_49 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc221) + %tmp16 = arith.extsi %r0_3 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc222) + %tmp16_51 = arith.cmpi slt, %tmp16, %cst_5 : tensor<1x64xi64> loc(#loc222) + %tmp17 = arith.muli %r0_4, %cst_6 : tensor<1x64xi32> loc(#loc223) + %tmp17_52 = arith.addi %tmp17, %cst_4 : tensor<1x64xi32> loc(#loc224) + %tmp17_53 = tt.broadcast %tmp17_52 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc225) + %tmp17_54 = arith.addi %tmp17_53, %tmp50_22 : tensor<4x64xi32> loc(#loc225) + %tmp17_55 = arith.addi %tmp17_54, %tmp50_25 : tensor<4x64xi32> loc(#loc226) + %tmp17_56 = tt.addptr %tmp50_27, %tmp17_55 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc227) + %tmp17_57 = arith.andi %r0_mask, %tmp16_51 : tensor<1x64xi1> loc(#loc228) + %tmp17_58 = tt.broadcast %tmp17_57 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc229) + %tmp17_59 = tt.load %tmp17_56, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc229) + %tmp17_60 = arith.extf %tmp17_59 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc230) + %tmp20 = arith.divf %tmp10_19, %cst_3 : tensor<4x1xf32> loc(#loc231) + %tmp22 = arith.addf %tmp20, %cst_2 : tensor<4x1xf32> loc(#loc232) + %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc233) + %tmp24 = tt.broadcast %tmp23 : tensor<4x1xf32> -> tensor<4x64xf32> loc(#loc234) + %tmp24_61 = arith.mulf %tmp17_60, %tmp24 : tensor<4x64xf32> loc(#loc234) + %tmp25 = tt.addptr %tmp58, %tmp17_52 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc235) + %tmp25_62 = tt.broadcast %tmp25 : tensor<1x64x!tt.ptr> -> tensor<4x64x!tt.ptr> loc(#loc235) + %tmp25_63 = tt.load %tmp25_62, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc236) + %tmp25_64 = arith.extf %tmp25_63 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc237) + %tmp27 = arith.mulf %tmp24_61, %tmp25_64 : tensor<4x64xf32> loc(#loc238) + %tmp29 = arith.subf %cst_11, %tmp27 : tensor<4x64xf32> loc(#loc239) + %tmp31 = tt.broadcast %tmp16_51 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc240) + %tmp31_65 = arith.select %tmp31, %tmp29, %cst_11 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc240) + %tmp32 = arith.cmpi sge, %tmp16, %cst_5 : tensor<1x64xi64> loc(#loc241) + %tmp35 = tt.broadcast %tmp17 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc242) + %tmp35_66 = arith.addi %tmp35, %tmp50_22 : tensor<4x64xi32> loc(#loc242) + %tmp35_67 = arith.addi %tmp35_66, %tmp50_25 : tensor<4x64xi32> loc(#loc243) + %tmp35_68 = tt.addptr %tmp50_27, %tmp35_67 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc244) + %tmp35_69 = arith.andi %r0_mask, %tmp32 : tensor<1x64xi1> loc(#loc245) + %tmp35_70 = tt.broadcast %tmp35_69 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc246) + %tmp35_71 = tt.load %tmp35_68, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc246) + %tmp35_72 = arith.extf %tmp35_71 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc247) + %tmp42 = arith.mulf %tmp35_72, %tmp24 : tensor<4x64xf32> loc(#loc248) + %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc249) + %tmp43_73 = tt.broadcast %tmp43 : tensor<1x64x!tt.ptr> -> tensor<4x64x!tt.ptr> loc(#loc249) + %tmp43_74 = tt.load %tmp43_73, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc250) + %tmp43_75 = arith.extf %tmp43_74 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc251) + %tmp45 = arith.mulf %tmp42, %tmp43_75 : tensor<4x64xf32> loc(#loc252) + %tmp48 = tt.broadcast %tmp32 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc253) + %tmp48_76 = arith.select %tmp48, %tmp45, %cst_11 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc253) + %tmp49 = arith.select %tmp31, %tmp31_65, %tmp48_76 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc254) + %tmp57 = arith.mulf %tmp50_31, %tmp24 : tensor<4x64xf32> loc(#loc255) + %tmp60 = tt.broadcast %tmp58_34 : tensor<1x64xf32> -> tensor<4x64xf32> loc(#loc256) + %tmp60_77 = arith.mulf %tmp57, %tmp60 : tensor<4x64xf32> loc(#loc256) + %tmp64 = arith.mulf %tmp60_77, %tmp63_39 : tensor<4x64xf32> loc(#loc257) + %tmp67 = arith.mulf %tmp49, %tmp66_41 : tensor<4x64xf32> loc(#loc258) + %tmp68 = arith.addf %tmp64, %tmp67 : tensor<4x64xf32> loc(#loc259) + %tmp70 = arith.addi %tmp17, %cst_1 : tensor<1x64xi32> loc(#loc260) + %tmp70_78 = tt.broadcast %tmp70 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc261) + %tmp70_79 = arith.addi %tmp70_78, %tmp50_22 : tensor<4x64xi32> loc(#loc261) + %tmp70_80 = arith.addi %tmp70_79, %tmp50_25 : tensor<4x64xi32> loc(#loc262) + %tmp70_81 = tt.addptr %tmp50_27, %tmp70_80 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc263) + %tmp70_82 = tt.load %tmp70_81, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc264) + %tmp70_83 = arith.extf %tmp70_82 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc265) + %tmp72 = arith.divf %tmp4_18, %cst_3 : tensor<4x1xf32> loc(#loc266) + %tmp73 = arith.addf %tmp72, %cst_2 : tensor<4x1xf32> loc(#loc267) + %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc268) + %tmp75 = tt.broadcast %tmp74 : tensor<4x1xf32> -> tensor<4x64xf32> loc(#loc269) + %tmp75_84 = arith.mulf %tmp70_83, %tmp75 : tensor<4x64xf32> loc(#loc269) + %tmp76 = tt.addptr %tmp102, %tmp17_52 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc270) + %tmp76_85 = tt.broadcast %tmp76 : tensor<1x64x!tt.ptr> -> tensor<4x64x!tt.ptr> loc(#loc270) + %tmp76_86 = tt.load %tmp76_85, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc271) + %tmp76_87 = arith.extf %tmp76_86 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc272) + %tmp78 = arith.mulf %tmp75_84, %tmp76_87 : tensor<4x64xf32> loc(#loc273) + %tmp80 = arith.subf %cst_11, %tmp78 : tensor<4x64xf32> loc(#loc274) + %tmp82 = arith.select %tmp31, %tmp80, %cst_11 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc275) + %tmp83 = arith.addi %tmp17, %cst_9 : tensor<1x64xi32> loc(#loc276) + %tmp83_88 = tt.broadcast %tmp83 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc277) + %tmp83_89 = arith.addi %tmp83_88, %tmp50_22 : tensor<4x64xi32> loc(#loc277) + %tmp83_90 = arith.addi %tmp83_89, %tmp50_25 : tensor<4x64xi32> loc(#loc278) + %tmp83_91 = tt.addptr %tmp50_27, %tmp83_90 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc279) + %tmp83_92 = tt.load %tmp83_91, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc280) + %tmp83_93 = arith.extf %tmp83_92 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc281) + %tmp88 = arith.mulf %tmp83_93, %tmp75 : tensor<4x64xf32> loc(#loc282) + %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc283) + %tmp89_94 = tt.broadcast %tmp89 : tensor<1x64x!tt.ptr> -> tensor<4x64x!tt.ptr> loc(#loc283) + %tmp89_95 = tt.load %tmp89_94, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<4x64x!tt.ptr> loc(#loc284) + %tmp89_96 = arith.extf %tmp89_95 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc285) + %tmp91 = arith.mulf %tmp88, %tmp89_96 : tensor<4x64xf32> loc(#loc286) + %tmp94 = arith.select %tmp48, %tmp91, %cst_11 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc287) + %tmp95 = arith.select %tmp31, %tmp82, %tmp94 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc288) + %tmp101 = arith.mulf %tmp96_47, %tmp75 : tensor<4x64xf32> loc(#loc289) + %tmp104 = tt.broadcast %tmp102_50 : tensor<1x64xf32> -> tensor<4x64xf32> loc(#loc290) + %tmp104_97 = arith.mulf %tmp101, %tmp104 : tensor<4x64xf32> loc(#loc290) + %tmp107 = arith.mulf %tmp104_97, %tmp63_39 : tensor<4x64xf32> loc(#loc291) + %tmp109 = arith.mulf %tmp95, %tmp66_41 : tensor<4x64xf32> loc(#loc292) + %tmp110 = arith.addf %tmp107, %tmp109 : tensor<4x64xf32> loc(#loc293) + %0 = arith.muli %xindex_16, %cst_8 : tensor<4x1xi32> loc(#loc142) + %1 = tt.broadcast %0 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc143) + %2 = arith.addi %tmp50_21, %1 : tensor<4x64xi32> loc(#loc143) + %3 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<4x64x!tt.ptr> loc(#loc144) + %4 = tt.addptr %3, %2 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc144) + %5 = arith.truncf %tmp68 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc145) + tt.store %4, %5, %tmp50_29 : tensor<4x64x!tt.ptr> loc(#loc145) + %6 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<4x64x!tt.ptr> loc(#loc146) + %7 = tt.addptr %6, %2 : tensor<4x64x!tt.ptr>, tensor<4x64xi32> loc(#loc146) + %8 = arith.truncf %tmp110 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc147) + tt.store %7, %8, %tmp50_29 : tensor<4x64x!tt.ptr> loc(#loc147) + } loc(#loc40) + tt.return loc(#loc148) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc158 = loc("xoffset"(#loc2)) +#loc159 = loc("xoffset"(#loc3)) +#loc160 = loc("xindex"(#loc4)) +#loc161 = loc("xindex"(#loc5)) +#loc162 = loc("xindex"(#loc6)) +#loc163 = loc("r0_base"(#loc7)) +#loc164 = loc("r0_base"(#loc8)) +#loc165 = loc("x0"(#loc9)) +#loc166 = loc("x1"(#loc10)) +#loc167 = loc("_tmp4"(#loc11)) +#loc168 = loc("r0_index"(#loc12)) +#loc169 = loc("r0_mask"(#loc13)) +#loc170 = loc("tmp0"(#loc14)) +#loc171 = loc("tmp0"(#loc15)) +#loc172 = loc("tmp0"(#loc16)) +#loc173 = loc("tmp0"(#loc17)) +#loc174 = loc("tmp0"(#loc18)) +#loc175 = loc("tmp0"(#loc19)) +#loc176 = loc("tmp0"(#loc20)) +#loc177 = loc("tmp0"(#loc21)) +#loc178 = loc("tmp6"(#loc22)) +#loc179 = loc("tmp6"(#loc23)) +#loc180 = loc("tmp6"(#loc24)) +#loc181 = loc("tmp6"(#loc25)) +#loc182 = loc("tmp6"(#loc26)) +#loc183 = loc("tmp2"(#loc27)) +#loc184 = loc("tmp5"(#loc28)) +#loc185 = loc("_tmp4"(#loc29)) +#loc186 = loc("tmp8"(#loc30)) +#loc187 = loc("tmp11"(#loc31)) +#loc188 = loc("_tmp10"(#loc32)) +#loc190 = loc("tmp4"(#loc37)) +#loc192 = loc("tmp10"(#loc39)) +#loc193 = loc("r0_index"(#loc41)) +#loc194 = loc("r0_mask"(#loc42)) +#loc195 = loc("r0_3"(#loc43)) +#loc196 = loc("r0_4"(#loc44)) +#loc197 = loc("tmp50"(#loc45)) +#loc198 = loc("tmp50"(#loc46)) +#loc199 = loc("tmp50"(#loc47)) +#loc200 = loc("tmp50"(#loc48)) +#loc201 = loc("tmp50"(#loc49)) +#loc202 = loc("tmp50"(#loc50)) +#loc203 = loc("tmp50"(#loc51)) +#loc204 = loc("tmp58"(#loc52)) +#loc205 = loc("tmp58"(#loc53)) +#loc206 = loc("tmp58"(#loc54)) +#loc207 = loc("tmp63"(#loc55)) +#loc208 = loc("tmp63"(#loc56)) +#loc209 = loc("tmp63"(#loc57)) +#loc210 = loc("tmp63"(#loc58)) +#loc211 = loc("tmp66"(#loc59)) +#loc212 = loc("tmp66"(#loc60)) +#loc213 = loc("tmp96"(#loc61)) +#loc214 = loc("tmp96"(#loc62)) +#loc215 = loc("tmp96"(#loc63)) +#loc216 = loc("tmp96"(#loc64)) +#loc217 = loc("tmp96"(#loc65)) +#loc218 = loc("tmp96"(#loc66)) +#loc219 = loc("tmp102"(#loc67)) +#loc220 = loc("tmp102"(#loc68)) +#loc221 = loc("tmp102"(#loc69)) +#loc222 = loc("tmp16"(#loc70)) +#loc223 = loc("tmp17"(#loc71)) +#loc224 = loc("tmp17"(#loc72)) +#loc225 = loc("tmp17"(#loc73)) +#loc226 = loc("tmp17"(#loc74)) +#loc227 = loc("tmp17"(#loc75)) +#loc228 = loc("tmp17"(#loc76)) +#loc229 = loc("tmp17"(#loc77)) +#loc230 = loc("tmp17"(#loc78)) +#loc231 = loc("tmp20"(#loc79)) +#loc232 = loc("tmp22"(#loc80)) +#loc233 = loc("tmp23"(#loc81)) +#loc234 = loc("tmp24"(#loc82)) +#loc235 = loc("tmp25"(#loc83)) +#loc236 = loc("tmp25"(#loc84)) +#loc237 = loc("tmp25"(#loc85)) +#loc238 = loc("tmp27"(#loc86)) +#loc239 = loc("tmp29"(#loc87)) +#loc240 = loc("tmp31"(#loc88)) +#loc241 = loc("tmp32"(#loc89)) +#loc242 = loc("tmp35"(#loc90)) +#loc243 = loc("tmp35"(#loc91)) +#loc244 = loc("tmp35"(#loc92)) +#loc245 = loc("tmp35"(#loc93)) +#loc246 = loc("tmp35"(#loc94)) +#loc247 = loc("tmp35"(#loc95)) +#loc248 = loc("tmp42"(#loc96)) +#loc249 = loc("tmp43"(#loc97)) +#loc250 = loc("tmp43"(#loc98)) +#loc251 = loc("tmp43"(#loc99)) +#loc252 = loc("tmp45"(#loc100)) +#loc253 = loc("tmp48"(#loc101)) +#loc254 = loc("tmp49"(#loc102)) +#loc255 = loc("tmp57"(#loc103)) +#loc256 = loc("tmp60"(#loc104)) +#loc257 = loc("tmp64"(#loc105)) +#loc258 = loc("tmp67"(#loc106)) +#loc259 = loc("tmp68"(#loc107)) +#loc260 = loc("tmp70"(#loc108)) +#loc261 = loc("tmp70"(#loc109)) +#loc262 = loc("tmp70"(#loc110)) +#loc263 = loc("tmp70"(#loc111)) +#loc264 = loc("tmp70"(#loc112)) +#loc265 = loc("tmp70"(#loc113)) +#loc266 = loc("tmp72"(#loc114)) +#loc267 = loc("tmp73"(#loc115)) +#loc268 = loc("tmp74"(#loc116)) +#loc269 = loc("tmp75"(#loc117)) +#loc270 = loc("tmp76"(#loc118)) +#loc271 = loc("tmp76"(#loc119)) +#loc272 = loc("tmp76"(#loc120)) +#loc273 = loc("tmp78"(#loc121)) +#loc274 = loc("tmp80"(#loc122)) +#loc275 = loc("tmp82"(#loc123)) +#loc276 = loc("tmp83"(#loc124)) +#loc277 = loc("tmp83"(#loc125)) +#loc278 = loc("tmp83"(#loc126)) +#loc279 = loc("tmp83"(#loc127)) +#loc280 = loc("tmp83"(#loc128)) +#loc281 = loc("tmp83"(#loc129)) +#loc282 = loc("tmp88"(#loc130)) +#loc283 = loc("tmp89"(#loc131)) +#loc284 = loc("tmp89"(#loc132)) +#loc285 = loc("tmp89"(#loc133)) +#loc286 = loc("tmp91"(#loc134)) +#loc287 = loc("tmp94"(#loc135)) +#loc288 = loc("tmp95"(#loc136)) +#loc289 = loc("tmp101"(#loc137)) +#loc290 = loc("tmp104"(#loc138)) +#loc291 = loc("tmp107"(#loc139)) +#loc292 = loc("tmp109"(#loc140)) +#loc293 = loc("tmp110"(#loc141)) +#loc294 = loc("_tmp10"(#loc167)) +#loc295 = loc(callsite(#loc34 at #loc189)) +#loc297 = loc(callsite(#loc34 at #loc191)) +#loc299 = loc(callsite(#loc36 at #loc295)) +#loc300 = loc(callsite(#loc36 at #loc297)) diff --git a/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a420e8cd9e9ae76456d920f45dd4b423201c2a9b --- /dev/null +++ b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused__fused_rms_norm_cat_view_2.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.source", "triton_poi_fused__fused_rms_norm_cat_view_2.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttir", "triton_poi_fused__fused_rms_norm_cat_view_2.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir", "triton_poi_fused__fused_rms_norm_cat_view_2.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.llir", "triton_poi_fused__fused_rms_norm_cat_view_2.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ptx", "triton_poi_fused__fused_rms_norm_cat_view_2.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.cubin", "triton_poi_fused__fused_rms_norm_cat_view_2.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.json"}} \ No newline at end of file diff --git a/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.cubin b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..f293897e93aa63d1726bb0aee45921e1aa487f5e Binary files /dev/null and b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.cubin differ diff --git a/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.json b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e7c8affebb3d3cd055f0e1d6e44bb48abe7d50b6 --- /dev/null +++ b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.json @@ -0,0 +1 @@ +{"hash": "0ef7a0ff688a6afafd44bc049e07d545eb9567d298d7625d0e59aada3a6fc149", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 2048, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__fused_rms_norm_cat_view_2"} \ No newline at end of file diff --git a/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.llir b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..071e1b38becc22671c480565256e1ffc7bf92768 --- /dev/null +++ b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.llir @@ -0,0 +1,795 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused__fused_rms_norm_cat_view_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !dbg !8 + %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z(), !dbg !9 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y(), !dbg !10 + %15 = mul nuw i32 %13, %14, !dbg !11 + %16 = add nuw i32 %15, %12, !dbg !12 + %17 = shl i32 %16, 10, !dbg !13 + %18 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !14 + %19 = and i32 %18, 127, !dbg !14 + %20 = shl nuw nsw i32 %19, 2, !dbg !14 + %21 = or disjoint i32 %17, %20, !dbg !15 + %22 = or disjoint i32 %21, 1, !dbg !15 + %23 = or disjoint i32 %21, 2, !dbg !15 + %24 = or disjoint i32 %21, 3, !dbg !15 + %25 = or disjoint i32 %21, 512, !dbg !15 + %26 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !16 + %27 = icmp samesign ult i32 %26, 128, !dbg !17 + %28 = sdiv i32 %21, 32, !dbg !18 + %29 = sdiv i32 %25, 32, !dbg !18 + %30 = mul i32 %28, 32, !dbg !19 + %.decomposed = sub i32 %21, %30, !dbg !19 + %31 = srem i32 %22, 32, !dbg !19 + %32 = srem i32 %23, 32, !dbg !19 + %33 = srem i32 %24, 32, !dbg !19 + %34 = icmp slt i32 %21, 8192, !dbg !20 + %35 = shl nsw i32 %.decomposed, 7, !dbg !21 + %36 = shl nsw i32 %31, 7, !dbg !21 + %37 = shl nsw i32 %32, 7, !dbg !21 + %38 = shl nsw i32 %33, 7, !dbg !21 + %39 = add i32 %35, %26, !dbg !22 + %40 = add i32 %36, %26, !dbg !22 + %41 = add i32 %37, %26, !dbg !22 + %42 = add i32 %38, %26, !dbg !22 + %43 = mul i32 %28, 12288, !dbg !23 + %44 = mul i32 %29, 12288, !dbg !23 + %45 = add i32 %39, %43, !dbg !24 + %46 = add i32 %40, %43, !dbg !24 + %47 = add i32 %41, %43, !dbg !24 + %48 = add i32 %42, %43, !dbg !24 + %49 = add i32 %39, %44, !dbg !24 + %50 = add i32 %40, %44, !dbg !24 + %51 = add i32 %41, %44, !dbg !24 + %52 = add i32 %42, %44, !dbg !24 + %53 = sext i32 %45 to i64, !dbg !25 + %54 = getelementptr bfloat, ptr addrspace(1) %0, i64 %53, !dbg !25 + %55 = sext i32 %46 to i64, !dbg !25 + %56 = getelementptr bfloat, ptr addrspace(1) %0, i64 %55, !dbg !25 + %57 = sext i32 %47 to i64, !dbg !25 + %58 = getelementptr bfloat, ptr addrspace(1) %0, i64 %57, !dbg !25 + %59 = sext i32 %48 to i64, !dbg !25 + %60 = getelementptr bfloat, ptr addrspace(1) %0, i64 %59, !dbg !25 + %61 = sext i32 %49 to i64, !dbg !25 + %62 = getelementptr bfloat, ptr addrspace(1) %0, i64 %61, !dbg !25 + %63 = sext i32 %50 to i64, !dbg !25 + %64 = getelementptr bfloat, ptr addrspace(1) %0, i64 %63, !dbg !25 + %65 = sext i32 %51 to i64, !dbg !25 + %66 = getelementptr bfloat, ptr addrspace(1) %0, i64 %65, !dbg !25 + %67 = sext i32 %52 to i64, !dbg !25 + %68 = getelementptr bfloat, ptr addrspace(1) %0, i64 %67, !dbg !25 + %69 = and i1 %27, %34, !dbg !26 + %70 = icmp slt i32 %21, 7680, !dbg !27 + %71 = and i1 %27, %70, !dbg !27 + %72 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !28 + %73 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %54, i64 %72, i1 %69) #6, !dbg !28 + %74 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !28 + %75 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %56, i64 %74, i1 %69) #6, !dbg !28 + %76 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !28 + %77 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %58, i64 %76, i1 %69) #6, !dbg !28 + %78 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !28 + %79 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %60, i64 %78, i1 %69) #6, !dbg !28 + %80 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !28 + %81 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %62, i64 %80, i1 %71) #6, !dbg !28 + %82 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !28 + %83 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %64, i64 %82, i1 %71) #6, !dbg !28 + %84 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !28 + %85 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %66, i64 %84, i1 %71) #6, !dbg !28 + %86 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !28 + %87 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %68, i64 %86, i1 %71) #6, !dbg !28 + %88 = sext i32 %21 to i64, !dbg !29 + %89 = getelementptr float, ptr addrspace(1) %1, i64 %88, !dbg !29 + %90 = sext i32 %25 to i64, !dbg !29 + %91 = getelementptr float, ptr addrspace(1) %1, i64 %90, !dbg !29 + %92 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !30 + %93 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %89, i64 %92, i1 %69) #6, !dbg !30 + %94 = extractvalue { i32, i32, i32, i32 } %93, 0, !dbg !30 + %95 = extractvalue { i32, i32, i32, i32 } %93, 1, !dbg !30 + %96 = extractvalue { i32, i32, i32, i32 } %93, 2, !dbg !30 + %97 = extractvalue { i32, i32, i32, i32 } %93, 3, !dbg !30 + %98 = bitcast i32 %94 to float, !dbg !30 + %99 = bitcast i32 %95 to float, !dbg !30 + %100 = bitcast i32 %96 to float, !dbg !30 + %101 = bitcast i32 %97 to float, !dbg !30 + %102 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !30 + %103 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %91, i64 %102, i1 %71) #6, !dbg !30 + %104 = extractvalue { i32, i32, i32, i32 } %103, 0, !dbg !30 + %105 = extractvalue { i32, i32, i32, i32 } %103, 1, !dbg !30 + %106 = extractvalue { i32, i32, i32, i32 } %103, 2, !dbg !30 + %107 = extractvalue { i32, i32, i32, i32 } %103, 3, !dbg !30 + %108 = bitcast i32 %104 to float, !dbg !30 + %109 = bitcast i32 %105 to float, !dbg !30 + %110 = bitcast i32 %106 to float, !dbg !30 + %111 = bitcast i32 %107 to float, !dbg !30 + %112 = tail call float @llvm.nvvm.div.full(float %98, float 1.280000e+02), !dbg !31 + %113 = tail call float @llvm.nvvm.div.full(float %99, float 1.280000e+02), !dbg !31 + %114 = tail call float @llvm.nvvm.div.full(float %100, float 1.280000e+02), !dbg !31 + %115 = tail call float @llvm.nvvm.div.full(float %101, float 1.280000e+02), !dbg !31 + %116 = tail call float @llvm.nvvm.div.full(float %108, float 1.280000e+02), !dbg !31 + %117 = tail call float @llvm.nvvm.div.full(float %109, float 1.280000e+02), !dbg !31 + %118 = tail call float @llvm.nvvm.div.full(float %110, float 1.280000e+02), !dbg !31 + %119 = tail call float @llvm.nvvm.div.full(float %111, float 1.280000e+02), !dbg !31 + %120 = fadd float %112, 0x3EB0C6F7A0000000, !dbg !32 + %121 = fadd float %113, 0x3EB0C6F7A0000000, !dbg !32 + %122 = fadd float %114, 0x3EB0C6F7A0000000, !dbg !32 + %123 = fadd float %115, 0x3EB0C6F7A0000000, !dbg !32 + %124 = fadd float %116, 0x3EB0C6F7A0000000, !dbg !32 + %125 = fadd float %117, 0x3EB0C6F7A0000000, !dbg !32 + %126 = fadd float %118, 0x3EB0C6F7A0000000, !dbg !32 + %127 = fadd float %119, 0x3EB0C6F7A0000000, !dbg !32 + %128 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33 + %.not.i = icmp eq i32 %128, 0, !dbg !33 + br i1 %.not.i, label %131, label %129, !dbg !33 + +129: ; preds = %11 + %130 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %120), !dbg !33 + br label %__nv_rsqrtf.exit, !dbg !33 + +131: ; preds = %11 + %132 = tail call float @llvm.nvvm.rsqrt.approx.f(float %120), !dbg !33 + br label %__nv_rsqrtf.exit, !dbg !33 + +__nv_rsqrtf.exit: ; preds = %129, %131 + %.0.i = phi float [ %130, %129 ], [ %132, %131 ], !dbg !33 + %133 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33 + %.not.i7 = icmp eq i32 %133, 0, !dbg !33 + br i1 %.not.i7, label %136, label %134, !dbg !33 + +134: ; preds = %__nv_rsqrtf.exit + %135 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %121), !dbg !33 + br label %__nv_rsqrtf.exit9, !dbg !33 + +136: ; preds = %__nv_rsqrtf.exit + %137 = tail call float @llvm.nvvm.rsqrt.approx.f(float %121), !dbg !33 + br label %__nv_rsqrtf.exit9, !dbg !33 + +__nv_rsqrtf.exit9: ; preds = %134, %136 + %.0.i8 = phi float [ %135, %134 ], [ %137, %136 ], !dbg !33 + %138 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33 + %.not.i10 = icmp eq i32 %138, 0, !dbg !33 + br i1 %.not.i10, label %141, label %139, !dbg !33 + +139: ; preds = %__nv_rsqrtf.exit9 + %140 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %122), !dbg !33 + br label %__nv_rsqrtf.exit12, !dbg !33 + +141: ; preds = %__nv_rsqrtf.exit9 + %142 = tail call float @llvm.nvvm.rsqrt.approx.f(float %122), !dbg !33 + br label %__nv_rsqrtf.exit12, !dbg !33 + +__nv_rsqrtf.exit12: ; preds = %139, %141 + %.0.i11 = phi float [ %140, %139 ], [ %142, %141 ], !dbg !33 + %143 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33 + %.not.i13 = icmp eq i32 %143, 0, !dbg !33 + br i1 %.not.i13, label %146, label %144, !dbg !33 + +144: ; preds = %__nv_rsqrtf.exit12 + %145 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %123), !dbg !33 + br label %__nv_rsqrtf.exit15, !dbg !33 + +146: ; preds = %__nv_rsqrtf.exit12 + %147 = tail call float @llvm.nvvm.rsqrt.approx.f(float %123), !dbg !33 + br label %__nv_rsqrtf.exit15, !dbg !33 + +__nv_rsqrtf.exit15: ; preds = %144, %146 + %.0.i14 = phi float [ %145, %144 ], [ %147, %146 ], !dbg !33 + %148 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33 + %.not.i16 = icmp eq i32 %148, 0, !dbg !33 + br i1 %.not.i16, label %151, label %149, !dbg !33 + +149: ; preds = %__nv_rsqrtf.exit15 + %150 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %124), !dbg !33 + br label %__nv_rsqrtf.exit18, !dbg !33 + +151: ; preds = %__nv_rsqrtf.exit15 + %152 = tail call float @llvm.nvvm.rsqrt.approx.f(float %124), !dbg !33 + br label %__nv_rsqrtf.exit18, !dbg !33 + +__nv_rsqrtf.exit18: ; preds = %149, %151 + %.0.i17 = phi float [ %150, %149 ], [ %152, %151 ], !dbg !33 + %153 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33 + %.not.i19 = icmp eq i32 %153, 0, !dbg !33 + br i1 %.not.i19, label %156, label %154, !dbg !33 + +154: ; preds = %__nv_rsqrtf.exit18 + %155 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %125), !dbg !33 + br label %__nv_rsqrtf.exit21, !dbg !33 + +156: ; preds = %__nv_rsqrtf.exit18 + %157 = tail call float @llvm.nvvm.rsqrt.approx.f(float %125), !dbg !33 + br label %__nv_rsqrtf.exit21, !dbg !33 + +__nv_rsqrtf.exit21: ; preds = %154, %156 + %.0.i20 = phi float [ %155, %154 ], [ %157, %156 ], !dbg !33 + %158 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33 + %.not.i22 = icmp eq i32 %158, 0, !dbg !33 + br i1 %.not.i22, label %161, label %159, !dbg !33 + +159: ; preds = %__nv_rsqrtf.exit21 + %160 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %126), !dbg !33 + br label %__nv_rsqrtf.exit24, !dbg !33 + +161: ; preds = %__nv_rsqrtf.exit21 + %162 = tail call float @llvm.nvvm.rsqrt.approx.f(float %126), !dbg !33 + br label %__nv_rsqrtf.exit24, !dbg !33 + +__nv_rsqrtf.exit24: ; preds = %159, %161 + %.0.i23 = phi float [ %160, %159 ], [ %162, %161 ], !dbg !33 + %163 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33 + %.not.i25 = icmp eq i32 %163, 0, !dbg !33 + br i1 %.not.i25, label %166, label %164, !dbg !33 + +164: ; preds = %__nv_rsqrtf.exit24 + %165 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %127), !dbg !33 + br label %__nv_rsqrtf.exit27, !dbg !33 + +166: ; preds = %__nv_rsqrtf.exit24 + %167 = tail call float @llvm.nvvm.rsqrt.approx.f(float %127), !dbg !33 + br label %__nv_rsqrtf.exit27, !dbg !33 + +__nv_rsqrtf.exit27: ; preds = %164, %166 + %.0.i26 = phi float [ %165, %164 ], [ %167, %166 ], !dbg !33 + %168 = zext nneg i32 %26 to i64, !dbg !34 + %169 = getelementptr bfloat, ptr addrspace(1) %2, i64 %168, !dbg !34 + %170 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !35 + %171 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %169, i64 %170, i1 %69) #6, !dbg !35 + %172 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !35 + %173 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %169, i64 %172, i1 %69) #6, !dbg !35 + %174 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !35 + %175 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %169, i64 %174, i1 %69) #6, !dbg !35 + %176 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !35 + %177 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %169, i64 %176, i1 %69) #6, !dbg !35 + %178 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !35 + %179 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %169, i64 %178, i1 %71) #6, !dbg !35 + %180 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !35 + %181 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %169, i64 %180, i1 %71) #6, !dbg !35 + %182 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !35 + %183 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %169, i64 %182, i1 %71) #6, !dbg !35 + %184 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !35 + %185 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %169, i64 %184, i1 %71) #6, !dbg !35 + %186 = add nsw i32 %28, -256, !dbg !36 + %187 = add nsw i32 %29, -256, !dbg !36 + %188 = mul i32 %186, 12288, !dbg !37 + %189 = mul i32 %187, 12288, !dbg !37 + %190 = add i32 %39, %188, !dbg !38 + %191 = add i32 %40, %188, !dbg !38 + %192 = add i32 %41, %188, !dbg !38 + %193 = add i32 %42, %188, !dbg !38 + %194 = add i32 %39, %189, !dbg !38 + %195 = add i32 %40, %189, !dbg !38 + %196 = add i32 %41, %189, !dbg !38 + %197 = add i32 %42, %189, !dbg !38 + %198 = sext i32 %190 to i64, !dbg !39 + %199 = getelementptr bfloat, ptr addrspace(1) %3, i64 %198, !dbg !39 + %200 = sext i32 %191 to i64, !dbg !39 + %201 = getelementptr bfloat, ptr addrspace(1) %3, i64 %200, !dbg !39 + %202 = sext i32 %192 to i64, !dbg !39 + %203 = getelementptr bfloat, ptr addrspace(1) %3, i64 %202, !dbg !39 + %204 = sext i32 %193 to i64, !dbg !39 + %205 = getelementptr bfloat, ptr addrspace(1) %3, i64 %204, !dbg !39 + %206 = sext i32 %194 to i64, !dbg !39 + %207 = getelementptr bfloat, ptr addrspace(1) %3, i64 %206, !dbg !39 + %208 = sext i32 %195 to i64, !dbg !39 + %209 = getelementptr bfloat, ptr addrspace(1) %3, i64 %208, !dbg !39 + %210 = sext i32 %196 to i64, !dbg !39 + %211 = getelementptr bfloat, ptr addrspace(1) %3, i64 %210, !dbg !39 + %212 = sext i32 %197 to i64, !dbg !39 + %213 = getelementptr bfloat, ptr addrspace(1) %3, i64 %212, !dbg !39 + %214 = add i32 %17, -8192, !dbg !40 + %215 = icmp ult i32 %214, 65536, !dbg !40 + %216 = and i1 %27, %215, !dbg !40 + %217 = add i32 %17, -7680, !dbg !40 + %218 = icmp ult i32 %217, 66048, !dbg !40 + %219 = and i1 %27, %218, !dbg !40 + %220 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !41 + %221 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %199, i64 %220, i1 %216) #6, !dbg !41 + %222 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !41 + %223 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %201, i64 %222, i1 %216) #6, !dbg !41 + %224 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !41 + %225 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %203, i64 %224, i1 %216) #6, !dbg !41 + %226 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !41 + %227 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %205, i64 %226, i1 %216) #6, !dbg !41 + %228 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !41 + %229 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %207, i64 %228, i1 %219) #6, !dbg !41 + %230 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !41 + %231 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %209, i64 %230, i1 %219) #6, !dbg !41 + %232 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !41 + %233 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %211, i64 %232, i1 %219) #6, !dbg !41 + %234 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !41 + %235 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %213, i64 %234, i1 %219) #6, !dbg !41 + %236 = shl i32 %186, 5, !dbg !42 + %237 = shl i32 %187, 5, !dbg !42 + %238 = add i32 %236, %.decomposed, !dbg !43 + %239 = add i32 %237, %.decomposed, !dbg !43 + %240 = sext i32 %238 to i64, !dbg !44 + %241 = getelementptr float, ptr addrspace(1) %4, i64 %240, !dbg !44 + %242 = sext i32 %239 to i64, !dbg !44 + %243 = getelementptr float, ptr addrspace(1) %4, i64 %242, !dbg !44 + %244 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !45 + %245 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %241, i64 %244, i1 %216) #6, !dbg !45 + %246 = extractvalue { i32, i32, i32, i32 } %245, 0, !dbg !45 + %247 = extractvalue { i32, i32, i32, i32 } %245, 1, !dbg !45 + %248 = extractvalue { i32, i32, i32, i32 } %245, 2, !dbg !45 + %249 = extractvalue { i32, i32, i32, i32 } %245, 3, !dbg !45 + %250 = bitcast i32 %246 to float, !dbg !45 + %251 = bitcast i32 %247 to float, !dbg !45 + %252 = bitcast i32 %248 to float, !dbg !45 + %253 = bitcast i32 %249 to float, !dbg !45 + %254 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !45 + %255 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %243, i64 %254, i1 %219) #6, !dbg !45 + %256 = extractvalue { i32, i32, i32, i32 } %255, 0, !dbg !45 + %257 = extractvalue { i32, i32, i32, i32 } %255, 1, !dbg !45 + %258 = extractvalue { i32, i32, i32, i32 } %255, 2, !dbg !45 + %259 = extractvalue { i32, i32, i32, i32 } %255, 3, !dbg !45 + %260 = bitcast i32 %256 to float, !dbg !45 + %261 = bitcast i32 %257 to float, !dbg !45 + %262 = bitcast i32 %258 to float, !dbg !45 + %263 = bitcast i32 %259 to float, !dbg !45 + %264 = tail call float @llvm.nvvm.div.full(float %250, float 1.280000e+02), !dbg !46 + %265 = tail call float @llvm.nvvm.div.full(float %251, float 1.280000e+02), !dbg !46 + %266 = tail call float @llvm.nvvm.div.full(float %252, float 1.280000e+02), !dbg !46 + %267 = tail call float @llvm.nvvm.div.full(float %253, float 1.280000e+02), !dbg !46 + %268 = tail call float @llvm.nvvm.div.full(float %260, float 1.280000e+02), !dbg !46 + %269 = tail call float @llvm.nvvm.div.full(float %261, float 1.280000e+02), !dbg !46 + %270 = tail call float @llvm.nvvm.div.full(float %262, float 1.280000e+02), !dbg !46 + %271 = tail call float @llvm.nvvm.div.full(float %263, float 1.280000e+02), !dbg !46 + %272 = fadd float %264, 0x3EB0C6F7A0000000, !dbg !47 + %273 = fadd float %265, 0x3EB0C6F7A0000000, !dbg !47 + %274 = fadd float %266, 0x3EB0C6F7A0000000, !dbg !47 + %275 = fadd float %267, 0x3EB0C6F7A0000000, !dbg !47 + %276 = fadd float %268, 0x3EB0C6F7A0000000, !dbg !47 + %277 = fadd float %269, 0x3EB0C6F7A0000000, !dbg !47 + %278 = fadd float %270, 0x3EB0C6F7A0000000, !dbg !47 + %279 = fadd float %271, 0x3EB0C6F7A0000000, !dbg !47 + %280 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !48 + %.not.i28 = icmp eq i32 %280, 0, !dbg !48 + br i1 %.not.i28, label %283, label %281, !dbg !48 + +281: ; preds = %__nv_rsqrtf.exit27 + %282 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %272), !dbg !48 + br label %__nv_rsqrtf.exit30, !dbg !48 + +283: ; preds = %__nv_rsqrtf.exit27 + %284 = tail call float @llvm.nvvm.rsqrt.approx.f(float %272), !dbg !48 + br label %__nv_rsqrtf.exit30, !dbg !48 + +__nv_rsqrtf.exit30: ; preds = %281, %283 + %.0.i29 = phi float [ %282, %281 ], [ %284, %283 ], !dbg !48 + %285 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !48 + %.not.i31 = icmp eq i32 %285, 0, !dbg !48 + br i1 %.not.i31, label %288, label %286, !dbg !48 + +286: ; preds = %__nv_rsqrtf.exit30 + %287 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %273), !dbg !48 + br label %__nv_rsqrtf.exit33, !dbg !48 + +288: ; preds = %__nv_rsqrtf.exit30 + %289 = tail call float @llvm.nvvm.rsqrt.approx.f(float %273), !dbg !48 + br label %__nv_rsqrtf.exit33, !dbg !48 + +__nv_rsqrtf.exit33: ; preds = %286, %288 + %.0.i32 = phi float [ %287, %286 ], [ %289, %288 ], !dbg !48 + %290 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !48 + %.not.i34 = icmp eq i32 %290, 0, !dbg !48 + br i1 %.not.i34, label %293, label %291, !dbg !48 + +291: ; preds = %__nv_rsqrtf.exit33 + %292 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %274), !dbg !48 + br label %__nv_rsqrtf.exit36, !dbg !48 + +293: ; preds = %__nv_rsqrtf.exit33 + %294 = tail call float @llvm.nvvm.rsqrt.approx.f(float %274), !dbg !48 + br label %__nv_rsqrtf.exit36, !dbg !48 + +__nv_rsqrtf.exit36: ; preds = %291, %293 + %.0.i35 = phi float [ %292, %291 ], [ %294, %293 ], !dbg !48 + %295 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !48 + %.not.i37 = icmp eq i32 %295, 0, !dbg !48 + br i1 %.not.i37, label %298, label %296, !dbg !48 + +296: ; preds = %__nv_rsqrtf.exit36 + %297 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %275), !dbg !48 + br label %__nv_rsqrtf.exit39, !dbg !48 + +298: ; preds = %__nv_rsqrtf.exit36 + %299 = tail call float @llvm.nvvm.rsqrt.approx.f(float %275), !dbg !48 + br label %__nv_rsqrtf.exit39, !dbg !48 + +__nv_rsqrtf.exit39: ; preds = %296, %298 + %.0.i38 = phi float [ %297, %296 ], [ %299, %298 ], !dbg !48 + %300 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !48 + %.not.i40 = icmp eq i32 %300, 0, !dbg !48 + br i1 %.not.i40, label %303, label %301, !dbg !48 + +301: ; preds = %__nv_rsqrtf.exit39 + %302 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %276), !dbg !48 + br label %__nv_rsqrtf.exit42, !dbg !48 + +303: ; preds = %__nv_rsqrtf.exit39 + %304 = tail call float @llvm.nvvm.rsqrt.approx.f(float %276), !dbg !48 + br label %__nv_rsqrtf.exit42, !dbg !48 + +__nv_rsqrtf.exit42: ; preds = %301, %303 + %.0.i41 = phi float [ %302, %301 ], [ %304, %303 ], !dbg !48 + %305 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !48 + %.not.i43 = icmp eq i32 %305, 0, !dbg !48 + br i1 %.not.i43, label %308, label %306, !dbg !48 + +306: ; preds = %__nv_rsqrtf.exit42 + %307 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %277), !dbg !48 + br label %__nv_rsqrtf.exit45, !dbg !48 + +308: ; preds = %__nv_rsqrtf.exit42 + %309 = tail call float @llvm.nvvm.rsqrt.approx.f(float %277), !dbg !48 + br label %__nv_rsqrtf.exit45, !dbg !48 + +__nv_rsqrtf.exit45: ; preds = %306, %308 + %.0.i44 = phi float [ %307, %306 ], [ %309, %308 ], !dbg !48 + %310 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !48 + %.not.i46 = icmp eq i32 %310, 0, !dbg !48 + br i1 %.not.i46, label %313, label %311, !dbg !48 + +311: ; preds = %__nv_rsqrtf.exit45 + %312 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %278), !dbg !48 + br label %__nv_rsqrtf.exit48, !dbg !48 + +313: ; preds = %__nv_rsqrtf.exit45 + %314 = tail call float @llvm.nvvm.rsqrt.approx.f(float %278), !dbg !48 + br label %__nv_rsqrtf.exit48, !dbg !48 + +__nv_rsqrtf.exit48: ; preds = %311, %313 + %.0.i47 = phi float [ %312, %311 ], [ %314, %313 ], !dbg !48 + %315 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !48 + %.not.i49 = icmp eq i32 %315, 0, !dbg !48 + br i1 %.not.i49, label %318, label %316, !dbg !48 + +316: ; preds = %__nv_rsqrtf.exit48 + %317 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %279), !dbg !48 + br label %__nv_rsqrtf.exit51, !dbg !48 + +318: ; preds = %__nv_rsqrtf.exit48 + %319 = tail call float @llvm.nvvm.rsqrt.approx.f(float %279), !dbg !48 + br label %__nv_rsqrtf.exit51, !dbg !48 + +__nv_rsqrtf.exit51: ; preds = %316, %318 + %.0.i50 = phi float [ %317, %316 ], [ %319, %318 ], !dbg !48 + %320 = icmp slt i32 %25, 8192, !dbg !20 + %321 = insertelement <2 x i16> poison, i16 %227, i64 0, !dbg !41 + %322 = insertelement <2 x i16> %321, i16 %235, i64 1, !dbg !41 + %323 = bitcast <2 x i16> %322 to <2 x bfloat>, !dbg !41 + %324 = insertelement <2 x i16> poison, i16 %225, i64 0, !dbg !41 + %325 = insertelement <2 x i16> %324, i16 %233, i64 1, !dbg !41 + %326 = bitcast <2 x i16> %325 to <2 x bfloat>, !dbg !41 + %327 = insertelement <2 x i16> poison, i16 %223, i64 0, !dbg !41 + %328 = insertelement <2 x i16> %327, i16 %231, i64 1, !dbg !41 + %329 = bitcast <2 x i16> %328 to <2 x bfloat>, !dbg !41 + %330 = insertelement <2 x i16> poison, i16 %221, i64 0, !dbg !41 + %331 = insertelement <2 x i16> %330, i16 %229, i64 1, !dbg !41 + %332 = bitcast <2 x i16> %331 to <2 x bfloat>, !dbg !41 + %333 = insertelement <2 x i16> poison, i16 %79, i64 0, !dbg !28 + %334 = insertelement <2 x i16> %333, i16 %87, i64 1, !dbg !28 + %335 = bitcast <2 x i16> %334 to <2 x bfloat>, !dbg !28 + %336 = insertelement <2 x i16> poison, i16 %177, i64 0, !dbg !35 + %337 = insertelement <2 x i16> %336, i16 %185, i64 1, !dbg !35 + %338 = bitcast <2 x i16> %337 to <2 x bfloat>, !dbg !35 + %339 = insertelement <2 x i16> poison, i16 %77, i64 0, !dbg !28 + %340 = insertelement <2 x i16> %339, i16 %85, i64 1, !dbg !28 + %341 = bitcast <2 x i16> %340 to <2 x bfloat>, !dbg !28 + %342 = insertelement <2 x i16> poison, i16 %175, i64 0, !dbg !35 + %343 = insertelement <2 x i16> %342, i16 %183, i64 1, !dbg !35 + %344 = bitcast <2 x i16> %343 to <2 x bfloat>, !dbg !35 + %345 = insertelement <2 x i16> poison, i16 %75, i64 0, !dbg !28 + %346 = insertelement <2 x i16> %345, i16 %83, i64 1, !dbg !28 + %347 = bitcast <2 x i16> %346 to <2 x bfloat>, !dbg !28 + %348 = insertelement <2 x i16> poison, i16 %173, i64 0, !dbg !35 + %349 = insertelement <2 x i16> %348, i16 %181, i64 1, !dbg !35 + %350 = bitcast <2 x i16> %349 to <2 x bfloat>, !dbg !35 + %351 = insertelement <2 x i16> poison, i16 %73, i64 0, !dbg !28 + %352 = insertelement <2 x i16> %351, i16 %81, i64 1, !dbg !28 + %353 = bitcast <2 x i16> %352 to <2 x bfloat>, !dbg !28 + %354 = insertelement <2 x i16> poison, i16 %171, i64 0, !dbg !35 + %355 = insertelement <2 x i16> %354, i16 %179, i64 1, !dbg !35 + %356 = bitcast <2 x i16> %355 to <2 x bfloat>, !dbg !35 + %357 = or disjoint i32 %17, %19, !dbg !15 + %358 = icmp slt i32 %357, 73728, !dbg !49 + %359 = or i32 %18, 896, !dbg !14 + %360 = or disjoint i32 %17, %359, !dbg !15 + %361 = or disjoint i32 %19, 768, !dbg !14 + %362 = or disjoint i32 %17, %361, !dbg !15 + %363 = or disjoint i32 %19, 640, !dbg !14 + %364 = or disjoint i32 %17, %363, !dbg !15 + %365 = or disjoint i32 %19, 512, !dbg !14 + %366 = or disjoint i32 %17, %365, !dbg !15 + %367 = or disjoint i32 %19, 384, !dbg !14 + %368 = or disjoint i32 %17, %367, !dbg !15 + %369 = or disjoint i32 %19, 256, !dbg !14 + %370 = or disjoint i32 %17, %369, !dbg !15 + %371 = or disjoint i32 %19, 128, !dbg !14 + %372 = or disjoint i32 %17, %371, !dbg !15 + %373 = getelementptr bfloat, ptr addrspace(1) %5, i64 %168, !dbg !50 + %374 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !51 + %375 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %373, i64 %374, i1 %216) #6, !dbg !51 + %376 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !51 + %377 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %373, i64 %376, i1 %216) #6, !dbg !51 + %378 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !51 + %379 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %373, i64 %378, i1 %216) #6, !dbg !51 + %380 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !51 + %381 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %373, i64 %380, i1 %216) #6, !dbg !51 + %382 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !51 + %383 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %373, i64 %382, i1 %219) #6, !dbg !51 + %384 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !51 + %385 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %373, i64 %384, i1 %219) #6, !dbg !51 + %386 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !51 + %387 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %373, i64 %386, i1 %219) #6, !dbg !51 + %388 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !51 + %389 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %373, i64 %388, i1 %219) #6, !dbg !51 + %390 = shl i32 %357, 7, !dbg !52 + %391 = shl i32 %372, 7, !dbg !52 + %392 = shl i32 %370, 7, !dbg !52 + %393 = shl i32 %368, 7, !dbg !52 + %394 = shl i32 %366, 7, !dbg !52 + %395 = shl i32 %364, 7, !dbg !52 + %396 = shl i32 %362, 7, !dbg !52 + %397 = shl i32 %360, 7, !dbg !52 + %398 = add i32 %390, %26, !dbg !53 + %399 = add i32 %391, %26, !dbg !53 + %400 = add i32 %392, %26, !dbg !53 + %401 = add i32 %393, %26, !dbg !53 + %402 = add i32 %394, %26, !dbg !53 + %403 = add i32 %395, %26, !dbg !53 + %404 = add i32 %396, %26, !dbg !53 + %405 = add i32 %397, %26, !dbg !53 + %406 = sext i32 %398 to i64, !dbg !54 + %407 = getelementptr bfloat, ptr addrspace(1) %6, i64 %406, !dbg !54 + %408 = sext i32 %399 to i64, !dbg !54 + %409 = getelementptr bfloat, ptr addrspace(1) %6, i64 %408, !dbg !54 + %410 = sext i32 %400 to i64, !dbg !54 + %411 = getelementptr bfloat, ptr addrspace(1) %6, i64 %410, !dbg !54 + %412 = sext i32 %401 to i64, !dbg !54 + %413 = getelementptr bfloat, ptr addrspace(1) %6, i64 %412, !dbg !54 + %414 = sext i32 %402 to i64, !dbg !54 + %415 = getelementptr bfloat, ptr addrspace(1) %6, i64 %414, !dbg !54 + %416 = sext i32 %403 to i64, !dbg !54 + %417 = getelementptr bfloat, ptr addrspace(1) %6, i64 %416, !dbg !54 + %418 = sext i32 %404 to i64, !dbg !54 + %419 = getelementptr bfloat, ptr addrspace(1) %6, i64 %418, !dbg !54 + %420 = sext i32 %405 to i64, !dbg !54 + %421 = getelementptr bfloat, ptr addrspace(1) %6, i64 %420, !dbg !54 + %422 = and i1 %27, %358, !dbg !55 + %423 = fpext <2 x bfloat> %332 to <2 x float>, !dbg !56 + %424 = fpext <2 x bfloat> %353 to <2 x float>, !dbg !57 + %425 = insertelement <2 x float> poison, float %.0.i, i64 0, !dbg !58 + %426 = insertelement <2 x float> %425, float %.0.i17, i64 1, !dbg !58 + %427 = fmul <2 x float> %426, %424, !dbg !58 + %428 = fpext <2 x bfloat> %356 to <2 x float>, !dbg !59 + %429 = fmul <2 x float> %427, %428, !dbg !60 + %430 = insertelement <2 x float> poison, float %.0.i29, i64 0, !dbg !61 + %431 = insertelement <2 x float> %430, float %.0.i41, i64 1, !dbg !61 + %432 = fmul <2 x float> %431, %423, !dbg !61 + %433 = insertelement <2 x i16> poison, i16 %375, i64 0, !dbg !51 + %434 = insertelement <2 x i16> %433, i16 %383, i64 1, !dbg !51 + %435 = bitcast <2 x i16> %434 to <2 x bfloat>, !dbg !51 + %436 = fpext <2 x bfloat> %435 to <2 x float>, !dbg !62 + %437 = fmul <2 x float> %432, %436, !dbg !63 + %438 = insertelement <2 x i1> poison, i1 %34, i64 0, !dbg !64 + %439 = insertelement <2 x i1> %438, i1 %320, i64 1, !dbg !64 + %440 = select <2 x i1> %439, <2 x float> %429, <2 x float> %437, !dbg !64 + %441 = fptrunc <2 x float> %440 to <2 x bfloat>, !dbg !65 + %442 = fpext <2 x bfloat> %329 to <2 x float>, !dbg !56 + %443 = fpext <2 x bfloat> %347 to <2 x float>, !dbg !57 + %444 = insertelement <2 x float> poison, float %.0.i8, i64 0, !dbg !58 + %445 = insertelement <2 x float> %444, float %.0.i20, i64 1, !dbg !58 + %446 = fmul <2 x float> %445, %443, !dbg !58 + %447 = fpext <2 x bfloat> %350 to <2 x float>, !dbg !59 + %448 = fmul <2 x float> %446, %447, !dbg !60 + %449 = insertelement <2 x float> poison, float %.0.i32, i64 0, !dbg !61 + %450 = insertelement <2 x float> %449, float %.0.i44, i64 1, !dbg !61 + %451 = fmul <2 x float> %450, %442, !dbg !61 + %452 = insertelement <2 x i16> poison, i16 %377, i64 0, !dbg !51 + %453 = insertelement <2 x i16> %452, i16 %385, i64 1, !dbg !51 + %454 = bitcast <2 x i16> %453 to <2 x bfloat>, !dbg !51 + %455 = fpext <2 x bfloat> %454 to <2 x float>, !dbg !62 + %456 = fmul <2 x float> %451, %455, !dbg !63 + %457 = select <2 x i1> %439, <2 x float> %448, <2 x float> %456, !dbg !64 + %458 = fptrunc <2 x float> %457 to <2 x bfloat>, !dbg !65 + %459 = fpext <2 x bfloat> %326 to <2 x float>, !dbg !56 + %460 = fpext <2 x bfloat> %341 to <2 x float>, !dbg !57 + %461 = insertelement <2 x float> poison, float %.0.i11, i64 0, !dbg !58 + %462 = insertelement <2 x float> %461, float %.0.i23, i64 1, !dbg !58 + %463 = fmul <2 x float> %462, %460, !dbg !58 + %464 = fpext <2 x bfloat> %344 to <2 x float>, !dbg !59 + %465 = fmul <2 x float> %463, %464, !dbg !60 + %466 = insertelement <2 x float> poison, float %.0.i35, i64 0, !dbg !61 + %467 = insertelement <2 x float> %466, float %.0.i47, i64 1, !dbg !61 + %468 = fmul <2 x float> %467, %459, !dbg !61 + %469 = insertelement <2 x i16> poison, i16 %379, i64 0, !dbg !51 + %470 = insertelement <2 x i16> %469, i16 %387, i64 1, !dbg !51 + %471 = bitcast <2 x i16> %470 to <2 x bfloat>, !dbg !51 + %472 = fpext <2 x bfloat> %471 to <2 x float>, !dbg !62 + %473 = fmul <2 x float> %468, %472, !dbg !63 + %474 = select <2 x i1> %439, <2 x float> %465, <2 x float> %473, !dbg !64 + %475 = fptrunc <2 x float> %474 to <2 x bfloat>, !dbg !65 + %476 = fpext <2 x bfloat> %323 to <2 x float>, !dbg !56 + %477 = fpext <2 x bfloat> %335 to <2 x float>, !dbg !57 + %478 = insertelement <2 x float> poison, float %.0.i14, i64 0, !dbg !58 + %479 = insertelement <2 x float> %478, float %.0.i26, i64 1, !dbg !58 + %480 = fmul <2 x float> %479, %477, !dbg !58 + %481 = fpext <2 x bfloat> %338 to <2 x float>, !dbg !59 + %482 = fmul <2 x float> %480, %481, !dbg !60 + %483 = insertelement <2 x float> poison, float %.0.i38, i64 0, !dbg !61 + %484 = insertelement <2 x float> %483, float %.0.i50, i64 1, !dbg !61 + %485 = fmul <2 x float> %484, %476, !dbg !61 + %486 = insertelement <2 x i16> poison, i16 %381, i64 0, !dbg !51 + %487 = insertelement <2 x i16> %486, i16 %389, i64 1, !dbg !51 + %488 = bitcast <2 x i16> %487 to <2 x bfloat>, !dbg !51 + %489 = fpext <2 x bfloat> %488 to <2 x float>, !dbg !62 + %490 = fmul <2 x float> %485, %489, !dbg !63 + %491 = select <2 x i1> %439, <2 x float> %482, <2 x float> %490, !dbg !64 + %492 = fptrunc <2 x float> %491 to <2 x bfloat>, !dbg !65 + %493 = shl nuw nsw i32 %19, 4, !dbg !65 + %494 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %493, !dbg !65 + %495 = bitcast <2 x bfloat> %441 to i32, !dbg !65 + %496 = bitcast <2 x bfloat> %458 to i32, !dbg !65 + %497 = bitcast <2 x bfloat> %475 to i32, !dbg !65 + %498 = bitcast <2 x bfloat> %492 to i32, !dbg !65 + %499 = insertelement <4 x i32> poison, i32 %495, i64 0, !dbg !65 + %500 = insertelement <4 x i32> %499, i32 %496, i64 1, !dbg !65 + %501 = insertelement <4 x i32> %500, i32 %497, i64 2, !dbg !65 + %502 = insertelement <4 x i32> %501, i32 %498, i64 3, !dbg !65 + store <4 x i32> %502, ptr addrspace(3) %494, align 16, !dbg !65 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !65 + %503 = shl nuw nsw i32 %18, 6, !dbg !65 + %504 = and i32 %503, 1536, !dbg !65 + %505 = shl nuw nsw i32 %18, 4, !dbg !65 + %506 = and i32 %505, 112, !dbg !65 + %507 = shl nuw nsw i32 %18, 2, !dbg !65 + %508 = and i32 %507, 384, !dbg !65 + %509 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %504, !dbg !65 + %510 = getelementptr inbounds nuw i8, ptr addrspace(3) %509, i32 %506, !dbg !65 + %511 = getelementptr inbounds nuw i8, ptr addrspace(3) %510, i32 %508, !dbg !65 + %512 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) %511), !dbg !65 + %513 = extractvalue { i32, i32, i32, i32 } %512, 0, !dbg !65 + %extelt.offset = lshr i32 %513, 16, !dbg !65 + %514 = trunc nuw i32 %extelt.offset to i16, !dbg !65 + %515 = extractvalue { i32, i32, i32, i32 } %512, 1, !dbg !65 + %extelt.offset2 = lshr i32 %515, 16, !dbg !65 + %516 = trunc nuw i32 %extelt.offset2 to i16, !dbg !65 + %517 = extractvalue { i32, i32, i32, i32 } %512, 2, !dbg !65 + %extelt.offset4 = lshr i32 %517, 16, !dbg !65 + %518 = trunc nuw i32 %extelt.offset4 to i16, !dbg !65 + %519 = extractvalue { i32, i32, i32, i32 } %512, 3, !dbg !65 + %extelt.offset6 = lshr i32 %519, 16, !dbg !65 + %520 = trunc nuw i32 %extelt.offset6 to i16, !dbg !65 + %.extract = trunc i32 %513 to i16, !dbg !65 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %.extract, ptr addrspace(1) %407, i1 %422) #6, !dbg !65 + %.extract1 = trunc i32 %515 to i16, !dbg !65 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %.extract1, ptr addrspace(1) %409, i1 %422) #6, !dbg !65 + %.extract3 = trunc i32 %517 to i16, !dbg !65 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %.extract3, ptr addrspace(1) %411, i1 %422) #6, !dbg !65 + %.extract5 = trunc i32 %519 to i16, !dbg !65 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %.extract5, ptr addrspace(1) %413, i1 %422) #6, !dbg !65 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %514, ptr addrspace(1) %415, i1 %422) #6, !dbg !65 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %516, ptr addrspace(1) %417, i1 %422) #6, !dbg !65 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %518, ptr addrspace(1) %419, i1 %422) #6, !dbg !65 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %520, ptr addrspace(1) %421, i1 %422) #6, !dbg !65 + ret void, !dbg !66 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: nocallback nofree nounwind memory(argmem: read) +declare { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) readonly captures(none)) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nocallback nofree nounwind memory(argmem: read) } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_poi_fused__fused_rms_norm_cat_view_2", linkageName: "triton_poi_fused__fused_rms_norm_cat_view_2", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 21, column: 29, scope: !5) +!9 = !DILocation(line: 21, column: 48, scope: !5) +!10 = !DILocation(line: 21, column: 69, scope: !5) +!11 = !DILocation(line: 21, column: 53, scope: !5) +!12 = !DILocation(line: 21, column: 34, scope: !5) +!13 = !DILocation(line: 21, column: 75, scope: !5) +!14 = !DILocation(line: 22, column: 44, scope: !5) +!15 = !DILocation(line: 22, column: 23, scope: !5) +!16 = !DILocation(line: 24, column: 28, scope: !5) +!17 = !DILocation(line: 26, column: 21, scope: !5) +!18 = !DILocation(line: 27, column: 19, scope: !5) +!19 = !DILocation(line: 29, column: 19, scope: !5) +!20 = !DILocation(line: 35, column: 18, scope: !5) +!21 = !DILocation(line: 36, column: 39, scope: !5) +!22 = !DILocation(line: 36, column: 35, scope: !5) +!23 = !DILocation(line: 36, column: 51, scope: !5) +!24 = !DILocation(line: 36, column: 44, scope: !5) +!25 = !DILocation(line: 36, column: 30, scope: !5) +!26 = !DILocation(line: 36, column: 64, scope: !5) +!27 = !DILocation(line: 36, column: 72, scope: !5) +!28 = !DILocation(line: 36, column: 57, scope: !5) +!29 = !DILocation(line: 38, column: 30, scope: !5) +!30 = !DILocation(line: 38, column: 80, scope: !5) +!31 = !DILocation(line: 40, column: 19, scope: !5) +!32 = !DILocation(line: 42, column: 19, scope: !5) +!33 = !DILocation(line: 43, column: 28, scope: !5) +!34 = !DILocation(line: 45, column: 31, scope: !5) +!35 = !DILocation(line: 45, column: 71, scope: !5) +!36 = !DILocation(line: 54, column: 61, scope: !5) +!37 = !DILocation(line: 54, column: 52, scope: !5) +!38 = !DILocation(line: 54, column: 45, scope: !5) +!39 = !DILocation(line: 54, column: 31, scope: !5) +!40 = !DILocation(line: 54, column: 83, scope: !5) +!41 = !DILocation(line: 54, column: 67, scope: !5) +!42 = !DILocation(line: 56, column: 56, scope: !5) +!43 = !DILocation(line: 56, column: 52, scope: !5) +!44 = !DILocation(line: 56, column: 31, scope: !5) +!45 = !DILocation(line: 56, column: 90, scope: !5) +!46 = !DILocation(line: 58, column: 21, scope: !5) +!47 = !DILocation(line: 60, column: 20, scope: !5) +!48 = !DILocation(line: 61, column: 28, scope: !5) +!49 = !DILocation(line: 23, column: 21, scope: !5) +!50 = !DILocation(line: 63, column: 31, scope: !5) +!51 = !DILocation(line: 63, column: 71, scope: !5) +!52 = !DILocation(line: 70, column: 34, scope: !5) +!53 = !DILocation(line: 70, column: 30, scope: !5) +!54 = !DILocation(line: 70, column: 25, scope: !5) +!55 = !DILocation(line: 70, column: 54, scope: !5) +!56 = !DILocation(line: 54, column: 134, scope: !5) +!57 = !DILocation(line: 36, column: 123, scope: !5) +!58 = !DILocation(line: 44, column: 19, scope: !5) +!59 = !DILocation(line: 45, column: 137, scope: !5) +!60 = !DILocation(line: 47, column: 20, scope: !5) +!61 = !DILocation(line: 62, column: 20, scope: !5) +!62 = !DILocation(line: 63, column: 138, scope: !5) +!63 = !DILocation(line: 65, column: 20, scope: !5) +!64 = !DILocation(line: 0, scope: !5) +!65 = !DILocation(line: 70, column: 46, scope: !5) +!66 = !DILocation(line: 70, column: 4, scope: !5) diff --git a/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ptx b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..3a9cade9cf109074e60b25f8870efa05ea116469 --- /dev/null +++ b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ptx @@ -0,0 +1,1027 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused__fused_rms_norm_cat_view_2 // -- Begin function triton_poi_fused__fused_rms_norm_cat_view_2 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_poi_fused__fused_rms_norm_cat_view_2 +.visible .entry triton_poi_fused__fused_rms_norm_cat_view_2( + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_4, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_5, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_6, + .param .u32 triton_poi_fused__fused_rms_norm_cat_view_2_param_7, + .param .u32 triton_poi_fused__fused_rms_norm_cat_view_2_param_8, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_9, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_10 +) +.reqntid 128 +{ + .reg .pred %p<13>; + .reg .b16 %rs<74>; + .reg .b32 %r<253>; + .reg .b64 %rd<75>; + .loc 1 18 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:18:0 + +// %bb.0: // %__nv_rsqrtf.exit + ld.param.b64 %rd67, [triton_poi_fused__fused_rms_norm_cat_view_2_param_0]; + ld.param.b64 %rd68, [triton_poi_fused__fused_rms_norm_cat_view_2_param_1]; +$L__tmp0: + .loc 1 21 29 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:29 + mov.u32 %r18, %ctaid.y; + ld.param.b64 %rd69, [triton_poi_fused__fused_rms_norm_cat_view_2_param_2]; + .loc 1 21 48 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:48 + mov.u32 %r19, %ctaid.z; + ld.param.b64 %rd70, [triton_poi_fused__fused_rms_norm_cat_view_2_param_3]; + .loc 1 21 69 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:69 + mov.u32 %r20, %nctaid.y; + ld.param.b64 %rd71, [triton_poi_fused__fused_rms_norm_cat_view_2_param_4]; + .loc 1 21 34 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:34 + mad.lo.s32 %r21, %r19, %r20, %r18; + ld.param.b64 %rd72, [triton_poi_fused__fused_rms_norm_cat_view_2_param_5]; + .loc 1 21 75 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:75 + shl.b32 %r22, %r21, 10; + ld.param.b64 %rd73, [triton_poi_fused__fused_rms_norm_cat_view_2_param_6]; + .loc 1 22 44 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:44 + mov.u32 %r23, %tid.x; + and.b32 %r24, %r23, 127; + shl.b32 %r25, %r24, 2; + .loc 1 22 23 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:23 + or.b32 %r26, %r22, %r25; + or.b32 %r27, %r26, 1; + or.b32 %r28, %r26, 2; + or.b32 %r29, %r26, 3; + or.b32 %r30, %r26, 512; + .loc 1 24 28 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:24:28 + mov.u32 %r31, %ctaid.x; + .loc 1 26 21 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:26:21 + setp.lt.u32 %p6, %r31, 128; + .loc 1 27 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:27:19 + bfe.s32 %r32, %r21, 21, 1; + .loc 1 29 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:29:19 + shr.u32 %r33, %r32, 27; + .loc 1 27 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:27:19 + add.s32 %r34, %r26, %r33; + shr.u32 %r35, %r34, 5; + add.s32 %r36, %r30, %r33; + shr.u32 %r37, %r36, 5; + .loc 1 29 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:29:19 + and.b32 %r38, %r34, -32; + sub.s32 %r39, %r26, %r38; + add.s32 %r40, %r27, %r33; + and.b32 %r41, %r40, 33554400; + sub.s32 %r42, %r27, %r41; + add.s32 %r43, %r28, %r33; + and.b32 %r44, %r43, 33554400; + sub.s32 %r45, %r28, %r44; + add.s32 %r46, %r29, %r33; + and.b32 %r47, %r46, 33554400; + sub.s32 %r48, %r29, %r47; + .loc 1 35 18 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:35:18 + setp.lt.s32 %p7, %r26, 8192; + .loc 1 36 39 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:39 + shl.b32 %r49, %r39, 7; + shl.b32 %r50, %r42, 7; + shl.b32 %r51, %r45, 7; + shl.b32 %r52, %r48, 7; + .loc 1 36 35 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:35 + add.s32 %r53, %r49, %r31; + add.s32 %r54, %r50, %r31; + add.s32 %r55, %r51, %r31; + add.s32 %r56, %r52, %r31; + .loc 1 36 51 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:51 + mul.lo.s32 %r57, %r35, 12288; + mul.lo.s32 %r58, %r37, 12288; + .loc 1 36 44 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:44 + add.s32 %r59, %r53, %r57; + add.s32 %r60, %r54, %r57; + add.s32 %r61, %r55, %r57; + add.s32 %r62, %r56, %r57; + add.s32 %r63, %r53, %r58; + add.s32 %r64, %r54, %r58; + add.s32 %r65, %r55, %r58; + add.s32 %r66, %r56, %r58; + .loc 1 36 30 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:30 + mad.wide.s32 %rd1, %r59, 2, %rd67; + mad.wide.s32 %rd3, %r60, 2, %rd67; + mad.wide.s32 %rd5, %r61, 2, %rd67; + mad.wide.s32 %rd7, %r62, 2, %rd67; + mad.wide.s32 %rd9, %r63, 2, %rd67; + mad.wide.s32 %rd11, %r64, 2, %rd67; + mad.wide.s32 %rd13, %r65, 2, %rd67; + mad.wide.s32 %rd15, %r66, 2, %rd67; + .loc 1 36 64 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:64 + and.pred %p1, %p6, %p7; + .loc 1 36 72 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:72 + setp.lt.s32 %p8, %r26, 7680; + and.pred %p2, %p6, %p8; + .loc 1 36 57 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:57 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0; + // end inline asm + mov.b16 %rs2, 0; + // begin inline asm + mov.u16 %rs1, %rs2; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs1 }, [ %rd1 + 0 ], %rd2; + // end inline asm + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs3, %rs2; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs3 }, [ %rd3 + 0 ], %rd4; + // end inline asm + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs4, %rs2; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs4 }, [ %rd5 + 0 ], %rd6; + // end inline asm + // begin inline asm + mov.u64 %rd8, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd8, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs5, %rs2; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs5 }, [ %rd7 + 0 ], %rd8; + // end inline asm + // begin inline asm + mov.u64 %rd10, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd10, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs6, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs6 }, [ %rd9 + 0 ], %rd10; + // end inline asm + // begin inline asm + mov.u64 %rd12, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd12, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs7, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs7 }, [ %rd11 + 0 ], %rd12; + // end inline asm + // begin inline asm + mov.u64 %rd14, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd14, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs8, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs8 }, [ %rd13 + 0 ], %rd14; + // end inline asm + // begin inline asm + mov.u64 %rd16, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd16, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs9, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd15 + 0 ], %rd16; + // end inline asm + .loc 1 38 30 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:38:30 + mad.wide.s32 %rd17, %r26, 4, %rd68; + add.s64 %rd19, %rd17, 2048; + .loc 1 38 80 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:38:80 + // begin inline asm + mov.u64 %rd18, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd18, 1.0; + // end inline asm + mov.b32 %r5, 0; + // begin inline asm + mov.u32 %r1, %r5; + mov.u32 %r2, %r5; + mov.u32 %r3, %r5; + mov.u32 %r4, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd17 + 0 ], %rd18; + // end inline asm + // begin inline asm + mov.u64 %rd20, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd20, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r6, %r5; + mov.u32 %r7, %r5; + mov.u32 %r8, %r5; + mov.u32 %r9, %r5; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd19 + 0 ], %rd20; + // end inline asm + mov.b32 %r67, 0f43000000; + .loc 1 40 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:40:19 + div.full.f32 %r68, %r1, %r67; + div.full.f32 %r69, %r2, %r67; + div.full.f32 %r70, %r3, %r67; + div.full.f32 %r71, %r4, %r67; + div.full.f32 %r72, %r6, %r67; + div.full.f32 %r73, %r7, %r67; + div.full.f32 %r74, %r8, %r67; + div.full.f32 %r75, %r9, %r67; + .loc 1 42 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:42:19 + add.f32 %r76, %r68, 0f358637BD; + add.f32 %r77, %r69, 0f358637BD; + add.f32 %r78, %r70, 0f358637BD; + add.f32 %r79, %r71, 0f358637BD; + add.f32 %r80, %r72, 0f358637BD; + add.f32 %r81, %r73, 0f358637BD; + add.f32 %r82, %r74, 0f358637BD; + add.f32 %r83, %r75, 0f358637BD; + .loc 1 43 28 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:43:28 + rsqrt.approx.ftz.f32 %r84, %r76; + rsqrt.approx.ftz.f32 %r85, %r77; + rsqrt.approx.ftz.f32 %r86, %r78; + rsqrt.approx.ftz.f32 %r87, %r79; + rsqrt.approx.ftz.f32 %r88, %r80; + rsqrt.approx.ftz.f32 %r89, %r81; + rsqrt.approx.ftz.f32 %r90, %r82; + rsqrt.approx.ftz.f32 %r91, %r83; + .loc 1 45 31 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:31 + mul.wide.u32 %rd74, %r31, 2; + add.s64 %rd21, %rd69, %rd74; + .loc 1 45 71 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:71 + // begin inline asm + mov.u64 %rd22, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd22, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs10, %rs2; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs10 }, [ %rd21 + 0 ], %rd22; + // end inline asm + // begin inline asm + mov.u64 %rd23, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd23, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs11, %rs2; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd21 + 0 ], %rd23; + // end inline asm + // begin inline asm + mov.u64 %rd24, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd24, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs12, %rs2; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd21 + 0 ], %rd24; + // end inline asm + // begin inline asm + mov.u64 %rd25, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd25, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs13, %rs2; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd21 + 0 ], %rd25; + // end inline asm + // begin inline asm + mov.u64 %rd26, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd26, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs14, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs14 }, [ %rd21 + 0 ], %rd26; + // end inline asm + // begin inline asm + mov.u64 %rd27, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd27, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs15, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd21 + 0 ], %rd27; + // end inline asm + // begin inline asm + mov.u64 %rd28, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd28, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs16, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs16 }, [ %rd21 + 0 ], %rd28; + // end inline asm + // begin inline asm + mov.u64 %rd29, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd29, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs17, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd21 + 0 ], %rd29; + // end inline asm + .loc 1 54 61 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:61 + and.b32 %r92, %r36, -32; + .loc 1 54 52 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:52 + add.s32 %r93, %r57, -3145728; + add.s32 %r94, %r58, -3145728; + .loc 1 54 45 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:45 + add.s32 %r95, %r53, %r93; + add.s32 %r96, %r54, %r93; + add.s32 %r97, %r55, %r93; + add.s32 %r98, %r56, %r93; + add.s32 %r99, %r53, %r94; + add.s32 %r100, %r54, %r94; + add.s32 %r101, %r55, %r94; + add.s32 %r102, %r56, %r94; + .loc 1 54 31 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:31 + mad.wide.s32 %rd30, %r95, 2, %rd70; + mad.wide.s32 %rd32, %r96, 2, %rd70; + mad.wide.s32 %rd34, %r97, 2, %rd70; + mad.wide.s32 %rd36, %r98, 2, %rd70; + mad.wide.s32 %rd38, %r99, 2, %rd70; + mad.wide.s32 %rd40, %r100, 2, %rd70; + mad.wide.s32 %rd42, %r101, 2, %rd70; + mad.wide.s32 %rd44, %r102, 2, %rd70; + .loc 1 54 83 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:83 + add.s32 %r103, %r22, -8192; + setp.lt.u32 %p9, %r103, 65536; + and.pred %p3, %p6, %p9; + add.s32 %r104, %r22, -7680; + setp.lt.u32 %p10, %r104, 66048; + and.pred %p4, %p6, %p10; + .loc 1 54 67 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:67 + // begin inline asm + mov.u64 %rd31, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd31, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs18, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs18 }, [ %rd30 + 0 ], %rd31; + // end inline asm + // begin inline asm + mov.u64 %rd33, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd33, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs19, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd32 + 0 ], %rd33; + // end inline asm + // begin inline asm + mov.u64 %rd35, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd35, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs20, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs20 }, [ %rd34 + 0 ], %rd35; + // end inline asm + // begin inline asm + mov.u64 %rd37, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd37, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs21, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs21 }, [ %rd36 + 0 ], %rd37; + // end inline asm + // begin inline asm + mov.u64 %rd39, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd39, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs22, %rs2; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs22 }, [ %rd38 + 0 ], %rd39; + // end inline asm + // begin inline asm + mov.u64 %rd41, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd41, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs23, %rs2; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs23 }, [ %rd40 + 0 ], %rd41; + // end inline asm + // begin inline asm + mov.u64 %rd43, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd43, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs24, %rs2; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs24 }, [ %rd42 + 0 ], %rd43; + // end inline asm + // begin inline asm + mov.u64 %rd45, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd45, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs25, %rs2; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs25 }, [ %rd44 + 0 ], %rd45; + // end inline asm + .loc 1 56 56 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:56 + add.s32 %r105, %r92, %r39; + .loc 1 56 52 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:52 + add.s32 %r106, %r26, -8192; + add.s32 %r107, %r105, -8192; + .loc 1 56 31 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:31 + mad.wide.s32 %rd46, %r106, 4, %rd71; + mad.wide.s32 %rd48, %r107, 4, %rd71; + .loc 1 56 90 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:90 + // begin inline asm + mov.u64 %rd47, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd47, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r10, %r5; + mov.u32 %r11, %r5; + mov.u32 %r12, %r5; + mov.u32 %r13, %r5; + @%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd46 + 0 ], %rd47; + // end inline asm + // begin inline asm + mov.u64 %rd49, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd49, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r14, %r5; + mov.u32 %r15, %r5; + mov.u32 %r16, %r5; + mov.u32 %r17, %r5; + @%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd48 + 0 ], %rd49; + // end inline asm + .loc 1 58 21 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:58:21 + div.full.f32 %r108, %r10, %r67; + div.full.f32 %r109, %r11, %r67; + div.full.f32 %r110, %r12, %r67; + div.full.f32 %r111, %r13, %r67; + div.full.f32 %r112, %r14, %r67; + div.full.f32 %r113, %r15, %r67; + div.full.f32 %r114, %r16, %r67; + div.full.f32 %r115, %r17, %r67; + .loc 1 60 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:60:20 + add.f32 %r116, %r108, 0f358637BD; + add.f32 %r117, %r109, 0f358637BD; + add.f32 %r118, %r110, 0f358637BD; + add.f32 %r119, %r111, 0f358637BD; + add.f32 %r120, %r112, 0f358637BD; + add.f32 %r121, %r113, 0f358637BD; + add.f32 %r122, %r114, 0f358637BD; + add.f32 %r123, %r115, 0f358637BD; + .loc 1 61 28 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:61:28 + rsqrt.approx.ftz.f32 %r124, %r116; + rsqrt.approx.ftz.f32 %r125, %r117; + rsqrt.approx.ftz.f32 %r126, %r118; + rsqrt.approx.ftz.f32 %r127, %r119; + rsqrt.approx.ftz.f32 %r128, %r120; + rsqrt.approx.ftz.f32 %r129, %r121; + rsqrt.approx.ftz.f32 %r130, %r122; + rsqrt.approx.ftz.f32 %r131, %r123; + .loc 1 35 18 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:35:18 + setp.lt.s32 %p11, %r30, 8192; + .loc 1 54 67 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:67 + mov.b32 %r132, {%rs21, %rs25}; + mov.b32 %r133, {%rs20, %rs24}; + mov.b32 %r134, {%rs19, %rs23}; + mov.b32 %r135, {%rs18, %rs22}; + .loc 1 36 57 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:57 + mov.b32 %r136, {%rs5, %rs9}; + .loc 1 45 71 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:71 + mov.b32 %r137, {%rs13, %rs17}; + .loc 1 36 57 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:57 + mov.b32 %r138, {%rs4, %rs8}; + .loc 1 45 71 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:71 + mov.b32 %r139, {%rs12, %rs16}; + .loc 1 36 57 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:57 + mov.b32 %r140, {%rs3, %rs7}; + .loc 1 45 71 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:71 + mov.b32 %r141, {%rs11, %rs15}; + .loc 1 36 57 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:57 + mov.b32 %r142, {%rs1, %rs6}; + .loc 1 45 71 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:71 + mov.b32 %r143, {%rs10, %rs14}; + .loc 1 22 23 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:23 + or.b32 %r144, %r22, %r24; + .loc 1 23 21 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:23:21 + setp.lt.s32 %p12, %r144, 73728; + .loc 1 22 44 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:44 + or.b32 %r145, %r23, %r22; + .loc 1 22 23 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:23 + shl.b32 %r146, %r145, 7; + .loc 1 63 31 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:31 + add.s64 %rd50, %rd72, %rd74; + .loc 1 63 71 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:71 + // begin inline asm + mov.u64 %rd51, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd51, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs26, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs26 }, [ %rd50 + 0 ], %rd51; + // end inline asm + // begin inline asm + mov.u64 %rd52, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd52, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs27, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs27 }, [ %rd50 + 0 ], %rd52; + // end inline asm + // begin inline asm + mov.u64 %rd53, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd53, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs28, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs28 }, [ %rd50 + 0 ], %rd53; + // end inline asm + // begin inline asm + mov.u64 %rd54, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd54, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs29, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs29 }, [ %rd50 + 0 ], %rd54; + // end inline asm + // begin inline asm + mov.u64 %rd55, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd55, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs30, %rs2; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs30 }, [ %rd50 + 0 ], %rd55; + // end inline asm + // begin inline asm + mov.u64 %rd56, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd56, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs31, %rs2; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs31 }, [ %rd50 + 0 ], %rd56; + // end inline asm + // begin inline asm + mov.u64 %rd57, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd57, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs32, %rs2; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs32 }, [ %rd50 + 0 ], %rd57; + // end inline asm + // begin inline asm + mov.u64 %rd58, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd58, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs33, %rs2; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs33 }, [ %rd50 + 0 ], %rd58; + // end inline asm + .loc 1 22 23 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:23 + shl.b32 %r147, %r144, 7; + .loc 1 70 34 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:34 + or.b32 %r148, %r146, 114688; + .loc 1 70 30 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:30 + add.s32 %r149, %r147, %r31; + add.s32 %r150, %r149, 16384; + add.s32 %r151, %r149, 32768; + add.s32 %r152, %r149, 49152; + add.s32 %r153, %r149, 65536; + add.s32 %r154, %r149, 81920; + add.s32 %r155, %r149, 98304; + add.s32 %r156, %r148, %r31; + .loc 1 70 25 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:25 + mad.wide.s32 %rd59, %r149, 2, %rd73; + mad.wide.s32 %rd60, %r150, 2, %rd73; + mad.wide.s32 %rd61, %r151, 2, %rd73; + mad.wide.s32 %rd62, %r152, 2, %rd73; + mad.wide.s32 %rd63, %r153, 2, %rd73; + mad.wide.s32 %rd64, %r154, 2, %rd73; + mad.wide.s32 %rd65, %r155, 2, %rd73; + mad.wide.s32 %rd66, %r156, 2, %rd73; + .loc 1 70 54 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:54 + and.pred %p5, %p6, %p12; + .loc 1 54 134 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134 + mov.b32 {%rs42, %rs43}, %r135; + cvt.f32.bf16 %r157, %rs43; + cvt.f32.bf16 %r158, %rs42; + .loc 1 36 123 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:123 + mov.b32 {%rs44, %rs45}, %r142; + cvt.f32.bf16 %r159, %rs45; + cvt.f32.bf16 %r160, %rs44; + .loc 1 44 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:44:19 + mul.f32 %r161, %r84, %r160; + mul.f32 %r162, %r88, %r159; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs46, %rs47}, %r143; + cvt.f32.bf16 %r163, %rs46; + cvt.f32.bf16 %r164, %rs47; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r165, %r162, %r164; + mul.f32 %r166, %r161, %r163; + .loc 1 62 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:62:20 + mul.f32 %r167, %r124, %r158; + mul.f32 %r168, %r128, %r157; + .loc 1 63 71 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:71 + mov.b32 %r169, {%rs26, %rs30}; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs48, %rs49}, %r169; + cvt.f32.bf16 %r170, %rs48; + cvt.f32.bf16 %r171, %rs49; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r172, %r168, %r171; + mul.f32 %r173, %r167, %r170; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r174, %r166, %r173, %p7; + selp.f32 %r175, %r165, %r172, %p11; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r176, %r175, %r174; + .loc 1 54 134 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134 + mov.b32 {%rs50, %rs51}, %r134; + cvt.f32.bf16 %r177, %rs51; + cvt.f32.bf16 %r178, %rs50; + .loc 1 36 123 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:123 + mov.b32 {%rs52, %rs53}, %r140; + cvt.f32.bf16 %r179, %rs53; + cvt.f32.bf16 %r180, %rs52; + .loc 1 44 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:44:19 + mul.f32 %r181, %r85, %r180; + mul.f32 %r182, %r89, %r179; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs54, %rs55}, %r141; + cvt.f32.bf16 %r183, %rs54; + cvt.f32.bf16 %r184, %rs55; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r185, %r182, %r184; + mul.f32 %r186, %r181, %r183; + .loc 1 62 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:62:20 + mul.f32 %r187, %r125, %r178; + mul.f32 %r188, %r129, %r177; + .loc 1 63 71 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:71 + mov.b32 %r189, {%rs27, %rs31}; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs56, %rs57}, %r189; + cvt.f32.bf16 %r190, %rs56; + cvt.f32.bf16 %r191, %rs57; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r192, %r188, %r191; + mul.f32 %r193, %r187, %r190; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r194, %r186, %r193, %p7; + selp.f32 %r195, %r185, %r192, %p11; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r196, %r195, %r194; + .loc 1 54 134 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134 + mov.b32 {%rs58, %rs59}, %r133; + cvt.f32.bf16 %r197, %rs59; + cvt.f32.bf16 %r198, %rs58; + .loc 1 36 123 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:123 + mov.b32 {%rs60, %rs61}, %r138; + cvt.f32.bf16 %r199, %rs61; + cvt.f32.bf16 %r200, %rs60; + .loc 1 44 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:44:19 + mul.f32 %r201, %r86, %r200; + mul.f32 %r202, %r90, %r199; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs62, %rs63}, %r139; + cvt.f32.bf16 %r203, %rs62; + cvt.f32.bf16 %r204, %rs63; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r205, %r202, %r204; + mul.f32 %r206, %r201, %r203; + .loc 1 62 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:62:20 + mul.f32 %r207, %r126, %r198; + mul.f32 %r208, %r130, %r197; + .loc 1 63 71 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:71 + mov.b32 %r209, {%rs28, %rs32}; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs64, %rs65}, %r209; + cvt.f32.bf16 %r210, %rs64; + cvt.f32.bf16 %r211, %rs65; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r212, %r208, %r211; + mul.f32 %r213, %r207, %r210; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r214, %r206, %r213, %p7; + selp.f32 %r215, %r205, %r212, %p11; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r216, %r215, %r214; + .loc 1 54 134 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134 + mov.b32 {%rs66, %rs67}, %r132; + cvt.f32.bf16 %r217, %rs67; + cvt.f32.bf16 %r218, %rs66; + .loc 1 36 123 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:123 + mov.b32 {%rs68, %rs69}, %r136; + cvt.f32.bf16 %r219, %rs69; + cvt.f32.bf16 %r220, %rs68; + .loc 1 44 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:44:19 + mul.f32 %r221, %r87, %r220; + mul.f32 %r222, %r91, %r219; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs70, %rs71}, %r137; + cvt.f32.bf16 %r223, %rs70; + cvt.f32.bf16 %r224, %rs71; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r225, %r222, %r224; + mul.f32 %r226, %r221, %r223; + .loc 1 62 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:62:20 + mul.f32 %r227, %r127, %r218; + mul.f32 %r228, %r131, %r217; + .loc 1 63 71 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:71 + mov.b32 %r229, {%rs29, %rs33}; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs72, %rs73}, %r229; + cvt.f32.bf16 %r230, %rs72; + cvt.f32.bf16 %r231, %rs73; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r232, %r228, %r231; + mul.f32 %r233, %r227, %r230; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r234, %r226, %r233, %p7; + selp.f32 %r235, %r225, %r232, %p11; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r236, %r235, %r234; + shl.b32 %r237, %r24, 4; + mov.b32 %r238, global_smem; + add.s32 %r239, %r238, %r237; + st.shared.v4.b32 [%r239], {%r176, %r196, %r216, %r236}; + bar.sync 0; + shl.b32 %r240, %r23, 6; + and.b32 %r241, %r240, 1536; + shl.b32 %r242, %r23, 4; + and.b32 %r243, %r242, 112; + shl.b32 %r244, %r23, 2; + and.b32 %r245, %r244, 384; + add.s32 %r246, %r238, %r241; + add.s32 %r247, %r246, %r243; + add.s32 %r248, %r247, %r245; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r249, %r250, %r251, %r252}, [%r248]; + mov.b32 {_, %rs38}, %r249; + mov.b32 {_, %rs39}, %r250; + mov.b32 {_, %rs40}, %r251; + mov.b32 {_, %rs41}, %r252; + cvt.u16.u32 %rs34, %r249; + // begin inline asm + @%p5 st.global.b16 [ %rd59 + 0 ], { %rs34 }; + // end inline asm + cvt.u16.u32 %rs35, %r250; + // begin inline asm + @%p5 st.global.b16 [ %rd60 + 0 ], { %rs35 }; + // end inline asm + cvt.u16.u32 %rs36, %r251; + // begin inline asm + @%p5 st.global.b16 [ %rd61 + 0 ], { %rs36 }; + // end inline asm + cvt.u16.u32 %rs37, %r252; + // begin inline asm + @%p5 st.global.b16 [ %rd62 + 0 ], { %rs37 }; + // end inline asm + // begin inline asm + @%p5 st.global.b16 [ %rd63 + 0 ], { %rs38 }; + // end inline asm + // begin inline asm + @%p5 st.global.b16 [ %rd64 + 0 ], { %rs39 }; + // end inline asm + // begin inline asm + @%p5 st.global.b16 [ %rd65 + 0 ], { %rs40 }; + // end inline asm + // begin inline asm + @%p5 st.global.b16 [ %rd66 + 0 ], { %rs41 }; + // end inline asm + .loc 1 70 4 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 50 +.b8 104 +.b8 105 +.b8 106 +.b8 51 +.b8 104 +.b8 109 +.b8 108 +.b8 111 +.b8 117 +.b8 109 +.b8 120 +.b8 100 +.b8 109 +.b8 104 +.b8 117 +.b8 101 +.b8 122 +.b8 115 +.b8 121 +.b8 104 +.b8 107 +.b8 109 +.b8 110 +.b8 113 +.b8 103 +.b8 110 +.b8 102 +.b8 97 +.b8 53 +.b8 105 +.b8 118 +.b8 114 +.b8 101 +.b8 50 +.b8 55 +.b8 117 +.b8 111 +.b8 115 +.b8 121 +.b8 109 +.b8 97 +.b8 109 +.b8 51 +.b8 100 +.b8 114 +.b8 55 +.b8 97 +.b8 53 +.b8 120 +.b8 98 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 50 +.b8 104 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.source b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.source new file mode 100644 index 0000000000000000000000000000000000000000..1092c1a412ae5774676d09a311853fa35927b82c --- /dev/null +++ b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.source @@ -0,0 +1,388 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0) +#loc97 = loc("in_ptr0"(#loc)) +#loc98 = loc("in_ptr1"(#loc)) +#loc99 = loc("in_ptr2"(#loc)) +#loc100 = loc("in_ptr3"(#loc)) +#loc101 = loc("in_ptr4"(#loc)) +#loc102 = loc("in_ptr5"(#loc)) +#loc103 = loc("out_ptr0"(#loc)) +#loc104 = loc("ynumel"(#loc)) +#loc105 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %ynumel_0 = arith.constant 73728 : i32 loc(#loc106) + %xnumel_1 = arith.constant 128 : i32 loc(#loc107) + %yoffset = tt.get_program_id y : i32 loc(#loc108) + %yoffset_2 = tt.get_program_id z : i32 loc(#loc109) + %yoffset_3 = tt.get_num_programs y : i32 loc(#loc110) + %yoffset_4 = arith.muli %yoffset_2, %yoffset_3 : i32 loc(#loc111) + %yoffset_5 = arith.addi %yoffset, %yoffset_4 : i32 loc(#loc112) + %yoffset_6 = arith.constant 1024 : i32 loc(#loc113) + %yoffset_7 = arith.constant 1024 : i32 loc(#loc113) + %yoffset_8 = arith.muli %yoffset_5, %yoffset_7 : i32 loc(#loc113) + %yindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc114) + %yindex_9 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<1024xi32> -> tensor<1024x1xi32> loc(#loc115) + %yindex_10 = tt.splat %yoffset_8 : i32 -> tensor<1024x1xi32> loc(#loc116) + %yindex_11 = arith.addi %yindex_10, %yindex_9 : tensor<1024x1xi32> loc(#loc116) + %ymask = arith.constant dense<73728> : tensor<1024x1xi32> loc(#loc117) + %ymask_12 = arith.cmpi slt, %yindex_11, %ymask : tensor<1024x1xi32> loc(#loc117) + %xoffset = tt.get_program_id x : i32 loc(#loc118) + %xoffset_13 = arith.constant 1 : i32 loc(#loc119) + %xoffset_14 = arith.constant 1 : i32 loc(#loc119) + %xoffset_15 = arith.muli %xoffset, %xoffset_14 : i32 loc(#loc119) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc120) + %xindex_16 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc121) + %xindex_17 = tt.splat %xoffset_15 : i32 -> tensor<1x1xi32> loc(#loc122) + %xindex_18 = arith.addi %xindex_17, %xindex_16 : tensor<1x1xi32> loc(#loc122) + %xmask = arith.constant dense<128> : tensor<1x1xi32> loc(#loc123) + %xmask_19 = arith.cmpi slt, %xindex_18, %xmask : tensor<1x1xi32> loc(#loc123) + %y1 = arith.constant 32 : i32 loc(#loc124) + %y1_20 = arith.constant 32 : i32 loc(#loc124) + %y1_21 = arith.constant dense<32> : tensor<1024x1xi32> loc(#loc124) + %y1_22 = arith.divsi %yindex_11, %y1_21 : tensor<1024x1xi32> loc(#loc124) + %y0 = arith.constant 32 : i32 loc(#loc125) + %y0_23 = arith.constant 32 : i32 loc(#loc125) + %y0_24 = arith.constant dense<32> : tensor<1024x1xi32> loc(#loc125) + %y0_25 = arith.remsi %yindex_11, %y0_24 : tensor<1024x1xi32> loc(#loc125) + %tmp1 = arith.constant 0 : i64 loc(#loc126) + %tmp1_26 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc126) + %tmp2 = arith.extsi %y1_22 : tensor<1024x1xi32> to tensor<1024x1xi64> loc(#loc127) + %tmp2_27 = arith.constant dense<0> : tensor<1024x1xi64> loc(#loc127) + %tmp2_28 = arith.cmpi sge, %tmp2, %tmp2_27 : tensor<1024x1xi64> loc(#loc127) + %tmp3 = arith.constant 256 : i64 loc(#loc128) + %tmp3_29 = arith.constant dense<256> : tensor<1x1xi64> loc(#loc128) + %tmp4 = arith.extsi %y1_22 : tensor<1024x1xi32> to tensor<1024x1xi64> loc(#loc129) + %tmp4_30 = arith.constant dense<256> : tensor<1024x1xi64> loc(#loc129) + %tmp4_31 = arith.cmpi slt, %tmp4, %tmp4_30 : tensor<1024x1xi64> loc(#loc129) + %tmp5 = arith.constant 128 : i32 loc(#loc130) + %tmp5_32 = arith.constant 128 : i32 loc(#loc130) + %tmp5_33 = arith.constant dense<128> : tensor<1024x1xi32> loc(#loc130) + %tmp5_34 = arith.muli %tmp5_33, %y0_25 : tensor<1024x1xi32> loc(#loc130) + %tmp5_35 = tt.broadcast %xindex_18 : tensor<1x1xi32> -> tensor<1024x1xi32> loc(#loc131) + %tmp5_36 = arith.addi %tmp5_35, %tmp5_34 : tensor<1024x1xi32> loc(#loc131) + %tmp5_37 = arith.constant 12288 : i32 loc(#loc132) + %tmp5_38 = arith.constant 12288 : i32 loc(#loc132) + %tmp5_39 = arith.constant dense<12288> : tensor<1024x1xi32> loc(#loc132) + %tmp5_40 = arith.muli %tmp5_39, %y1_22 : tensor<1024x1xi32> loc(#loc132) + %tmp5_41 = arith.addi %tmp5_36, %tmp5_40 : tensor<1024x1xi32> loc(#loc133) + %tmp5_42 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x1x!tt.ptr> loc(#loc134) + %tmp5_43 = tt.addptr %tmp5_42, %tmp5_41 : tensor<1024x1x!tt.ptr>, tensor<1024x1xi32> loc(#loc134) + %tmp5_44 = tt.broadcast %xmask_19 : tensor<1x1xi1> -> tensor<1024x1xi1> loc(#loc135) + %tmp5_45 = arith.andi %tmp4_31, %tmp5_44 : tensor<1024x1xi1> loc(#loc135) + %tmp5_46 = arith.andi %tmp5_45, %ymask_12 : tensor<1024x1xi1> loc(#loc136) + %tmp5_47 = arith.constant 0.000000e+00 : f32 loc(#loc137) + %tmp5_48 = arith.constant dense<0.000000e+00> : tensor<1024x1xf32> loc(#loc137) + %tmp5_49 = arith.truncf %tmp5_48 : tensor<1024x1xf32> to tensor<1024x1xbf16> loc(#loc137) + %tmp5_50 = tt.load %tmp5_43, %tmp5_46, %tmp5_49 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr> loc(#loc137) + %tmp5_51 = arith.extf %tmp5_50 : tensor<1024x1xbf16> to tensor<1024x1xf32> loc(#loc138) + %tmp7 = arith.constant 32 : i32 loc(#loc139) + %tmp7_52 = arith.constant 32 : i32 loc(#loc139) + %tmp7_53 = arith.constant dense<32> : tensor<1024x1xi32> loc(#loc139) + %tmp7_54 = arith.muli %tmp7_53, %y1_22 : tensor<1024x1xi32> loc(#loc139) + %tmp7_55 = arith.addi %y0_25, %tmp7_54 : tensor<1024x1xi32> loc(#loc140) + %tmp7_56 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x1x!tt.ptr> loc(#loc141) + %tmp7_57 = tt.addptr %tmp7_56, %tmp7_55 : tensor<1024x1x!tt.ptr>, tensor<1024x1xi32> loc(#loc141) + %tmp7_58 = tt.broadcast %xmask_19 : tensor<1x1xi1> -> tensor<1024x1xi1> loc(#loc142) + %tmp7_59 = arith.andi %tmp4_31, %tmp7_58 : tensor<1024x1xi1> loc(#loc142) + %tmp7_60 = arith.andi %tmp7_59, %ymask_12 : tensor<1024x1xi1> loc(#loc143) + %tmp7_61 = arith.constant 0.000000e+00 : f32 loc(#loc144) + %tmp7_62 = arith.constant dense<0.000000e+00> : tensor<1024x1xf32> loc(#loc144) + %tmp7_63 = tt.load %tmp7_57, %tmp7_60, %tmp7_62 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr> loc(#loc144) + %tmp8 = arith.constant 1.280000e+02 : f32 loc(#loc145) + %tmp9 = arith.constant dense<1.280000e+02> : tensor<1024x1xf32> loc(#loc146) + %tmp9_64 = arith.divf %tmp7_63, %tmp9 : tensor<1024x1xf32> loc(#loc146) + %tmp10 = arith.constant 9.99999997E-7 : f32 loc(#loc147) + %tmp11 = arith.constant dense<9.99999997E-7> : tensor<1024x1xf32> loc(#loc148) + %tmp11_65 = arith.addf %tmp9_64, %tmp11 : tensor<1024x1xf32> loc(#loc148) + %tmp12 = tt.extern_elementwise %tmp11_65 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1024x1xf32>) -> tensor<1024x1xf32> loc(#loc149) + %tmp13 = arith.mulf %tmp5_51, %tmp12 : tensor<1024x1xf32> loc(#loc150) + %tmp14 = tt.broadcast %xindex_18 : tensor<1x1xi32> -> tensor<1024x1xi32> loc(#loc151) + %tmp14_66 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x1x!tt.ptr> loc(#loc152) + %tmp14_67 = tt.addptr %tmp14_66, %tmp14 : tensor<1024x1x!tt.ptr>, tensor<1024x1xi32> loc(#loc152) + %tmp14_68 = tt.broadcast %xmask_19 : tensor<1x1xi1> -> tensor<1024x1xi1> loc(#loc153) + %tmp14_69 = arith.andi %tmp4_31, %tmp14_68 : tensor<1024x1xi1> loc(#loc153) + %tmp14_70 = arith.andi %tmp14_69, %ymask_12 : tensor<1024x1xi1> loc(#loc154) + %tmp14_71 = arith.constant 0.000000e+00 : f32 loc(#loc155) + %tmp14_72 = arith.constant dense<0.000000e+00> : tensor<1024x1xf32> loc(#loc155) + %tmp14_73 = arith.truncf %tmp14_72 : tensor<1024x1xf32> to tensor<1024x1xbf16> loc(#loc155) + %tmp14_74 = tt.load %tmp14_67, %tmp14_70, %tmp14_73 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr> loc(#loc155) + %tmp14_75 = arith.extf %tmp14_74 : tensor<1024x1xbf16> to tensor<1024x1xf32> loc(#loc156) + %tmp16 = arith.mulf %tmp13, %tmp14_75 : tensor<1024x1xf32> loc(#loc157) + %tmp18 = arith.constant 0.000000e+00 : f32 loc(#loc158) + %tmp18_76 = arith.constant dense<0.000000e+00> : tensor<1024x1xf32> loc(#loc158) + %tmp19 = arith.select %tmp4_31, %tmp16, %tmp18_76 : tensor<1024x1xi1>, tensor<1024x1xf32> loc(#loc159) + %tmp20 = arith.extsi %y1_22 : tensor<1024x1xi32> to tensor<1024x1xi64> loc(#loc160) + %tmp20_77 = arith.constant dense<256> : tensor<1024x1xi64> loc(#loc160) + %tmp20_78 = arith.cmpi sge, %tmp20, %tmp20_77 : tensor<1024x1xi64> loc(#loc160) + %tmp21 = arith.constant 2304 : i64 loc(#loc161) + %tmp21_79 = arith.constant dense<2304> : tensor<1x1xi64> loc(#loc161) + %tmp22 = arith.extsi %y1_22 : tensor<1024x1xi32> to tensor<1024x1xi64> loc(#loc162) + %tmp22_80 = arith.constant dense<2304> : tensor<1024x1xi64> loc(#loc162) + %tmp22_81 = arith.cmpi slt, %tmp22, %tmp22_80 : tensor<1024x1xi64> loc(#loc162) + %tmp23 = arith.constant 128 : i32 loc(#loc163) + %tmp23_82 = arith.constant 128 : i32 loc(#loc163) + %tmp23_83 = arith.constant dense<128> : tensor<1024x1xi32> loc(#loc163) + %tmp23_84 = arith.muli %tmp23_83, %y0_25 : tensor<1024x1xi32> loc(#loc163) + %tmp23_85 = tt.broadcast %xindex_18 : tensor<1x1xi32> -> tensor<1024x1xi32> loc(#loc164) + %tmp23_86 = arith.addi %tmp23_85, %tmp23_84 : tensor<1024x1xi32> loc(#loc164) + %tmp23_87 = arith.constant -256 : i32 loc(#loc165) + %tmp23_88 = arith.constant -256 : i32 loc(#loc165) + %tmp23_89 = arith.constant dense<-256> : tensor<1024x1xi32> loc(#loc165) + %tmp23_90 = arith.addi %tmp23_89, %y1_22 : tensor<1024x1xi32> loc(#loc165) + %tmp23_91 = arith.constant 12288 : i32 loc(#loc166) + %tmp23_92 = arith.constant 12288 : i32 loc(#loc166) + %tmp23_93 = arith.constant dense<12288> : tensor<1024x1xi32> loc(#loc166) + %tmp23_94 = arith.muli %tmp23_93, %tmp23_90 : tensor<1024x1xi32> loc(#loc166) + %tmp23_95 = arith.addi %tmp23_86, %tmp23_94 : tensor<1024x1xi32> loc(#loc167) + %tmp23_96 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1024x1x!tt.ptr> loc(#loc168) + %tmp23_97 = tt.addptr %tmp23_96, %tmp23_95 : tensor<1024x1x!tt.ptr>, tensor<1024x1xi32> loc(#loc168) + %tmp23_98 = tt.broadcast %xmask_19 : tensor<1x1xi1> -> tensor<1024x1xi1> loc(#loc169) + %tmp23_99 = arith.andi %tmp20_78, %tmp23_98 : tensor<1024x1xi1> loc(#loc169) + %tmp23_100 = arith.andi %tmp23_99, %ymask_12 : tensor<1024x1xi1> loc(#loc170) + %tmp23_101 = arith.constant 0.000000e+00 : f32 loc(#loc171) + %tmp23_102 = arith.constant dense<0.000000e+00> : tensor<1024x1xf32> loc(#loc171) + %tmp23_103 = arith.truncf %tmp23_102 : tensor<1024x1xf32> to tensor<1024x1xbf16> loc(#loc171) + %tmp23_104 = tt.load %tmp23_97, %tmp23_100, %tmp23_103 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr> loc(#loc171) + %tmp23_105 = arith.extf %tmp23_104 : tensor<1024x1xbf16> to tensor<1024x1xf32> loc(#loc172) + %tmp25 = arith.constant -256 : i32 loc(#loc173) + %tmp25_106 = arith.constant -256 : i32 loc(#loc173) + %tmp25_107 = arith.constant dense<-256> : tensor<1024x1xi32> loc(#loc173) + %tmp25_108 = arith.addi %tmp25_107, %y1_22 : tensor<1024x1xi32> loc(#loc173) + %tmp25_109 = arith.constant 32 : i32 loc(#loc174) + %tmp25_110 = arith.constant 32 : i32 loc(#loc174) + %tmp25_111 = arith.constant dense<32> : tensor<1024x1xi32> loc(#loc174) + %tmp25_112 = arith.muli %tmp25_111, %tmp25_108 : tensor<1024x1xi32> loc(#loc174) + %tmp25_113 = arith.addi %y0_25, %tmp25_112 : tensor<1024x1xi32> loc(#loc175) + %tmp25_114 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1024x1x!tt.ptr> loc(#loc176) + %tmp25_115 = tt.addptr %tmp25_114, %tmp25_113 : tensor<1024x1x!tt.ptr>, tensor<1024x1xi32> loc(#loc176) + %tmp25_116 = tt.broadcast %xmask_19 : tensor<1x1xi1> -> tensor<1024x1xi1> loc(#loc177) + %tmp25_117 = arith.andi %tmp20_78, %tmp25_116 : tensor<1024x1xi1> loc(#loc177) + %tmp25_118 = arith.andi %tmp25_117, %ymask_12 : tensor<1024x1xi1> loc(#loc178) + %tmp25_119 = arith.constant 0.000000e+00 : f32 loc(#loc179) + %tmp25_120 = arith.constant dense<0.000000e+00> : tensor<1024x1xf32> loc(#loc179) + %tmp25_121 = tt.load %tmp25_115, %tmp25_118, %tmp25_120 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr> loc(#loc179) + %tmp26 = arith.constant 1.280000e+02 : f32 loc(#loc180) + %tmp27 = arith.constant dense<1.280000e+02> : tensor<1024x1xf32> loc(#loc181) + %tmp27_122 = arith.divf %tmp25_121, %tmp27 : tensor<1024x1xf32> loc(#loc181) + %tmp28 = arith.constant 9.99999997E-7 : f32 loc(#loc182) + %tmp29 = arith.constant dense<9.99999997E-7> : tensor<1024x1xf32> loc(#loc183) + %tmp29_123 = arith.addf %tmp27_122, %tmp29 : tensor<1024x1xf32> loc(#loc183) + %tmp30 = tt.extern_elementwise %tmp29_123 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1024x1xf32>) -> tensor<1024x1xf32> loc(#loc184) + %tmp31 = arith.mulf %tmp23_105, %tmp30 : tensor<1024x1xf32> loc(#loc185) + %tmp32 = tt.broadcast %xindex_18 : tensor<1x1xi32> -> tensor<1024x1xi32> loc(#loc186) + %tmp32_124 = tt.splat %in_ptr5 : !tt.ptr -> tensor<1024x1x!tt.ptr> loc(#loc187) + %tmp32_125 = tt.addptr %tmp32_124, %tmp32 : tensor<1024x1x!tt.ptr>, tensor<1024x1xi32> loc(#loc187) + %tmp32_126 = tt.broadcast %xmask_19 : tensor<1x1xi1> -> tensor<1024x1xi1> loc(#loc188) + %tmp32_127 = arith.andi %tmp20_78, %tmp32_126 : tensor<1024x1xi1> loc(#loc188) + %tmp32_128 = arith.andi %tmp32_127, %ymask_12 : tensor<1024x1xi1> loc(#loc189) + %tmp32_129 = arith.constant 0.000000e+00 : f32 loc(#loc190) + %tmp32_130 = arith.constant dense<0.000000e+00> : tensor<1024x1xf32> loc(#loc190) + %tmp32_131 = arith.truncf %tmp32_130 : tensor<1024x1xf32> to tensor<1024x1xbf16> loc(#loc190) + %tmp32_132 = tt.load %tmp32_125, %tmp32_128, %tmp32_131 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr> loc(#loc190) + %tmp32_133 = arith.extf %tmp32_132 : tensor<1024x1xbf16> to tensor<1024x1xf32> loc(#loc191) + %tmp34 = arith.mulf %tmp31, %tmp32_133 : tensor<1024x1xf32> loc(#loc192) + %tmp36 = arith.constant 0.000000e+00 : f32 loc(#loc193) + %tmp36_134 = arith.constant dense<0.000000e+00> : tensor<1024x1xf32> loc(#loc193) + %tmp37 = arith.select %tmp20_78, %tmp34, %tmp36_134 : tensor<1024x1xi1>, tensor<1024x1xf32> loc(#loc194) + %tmp38 = arith.select %tmp4_31, %tmp19, %tmp37 : tensor<1024x1xi1>, tensor<1024x1xf32> loc(#loc195) + %c128_i32 = arith.constant 128 : i32 loc(#loc91) + %c128_i32_135 = arith.constant 128 : i32 loc(#loc91) + %cst = arith.constant dense<128> : tensor<1024x1xi32> loc(#loc91) + %0 = arith.muli %cst, %yindex_11 : tensor<1024x1xi32> loc(#loc91) + %1 = tt.broadcast %xindex_18 : tensor<1x1xi32> -> tensor<1024x1xi32> loc(#loc92) + %2 = arith.addi %1, %0 : tensor<1024x1xi32> loc(#loc92) + %3 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x1x!tt.ptr> loc(#loc93) + %4 = tt.addptr %3, %2 : tensor<1024x1x!tt.ptr>, tensor<1024x1xi32> loc(#loc93) + %5 = tt.broadcast %xmask_19 : tensor<1x1xi1> -> tensor<1024x1xi1> loc(#loc94) + %6 = arith.andi %5, %ymask_12 : tensor<1024x1xi1> loc(#loc94) + %7 = arith.truncf %tmp38 : tensor<1024x1xf32> to tensor<1024x1xbf16> loc(#loc95) + tt.store %4, %7, %6 : tensor<1024x1x!tt.ptr> loc(#loc95) + tt.return loc(#loc96) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":20:13) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:36) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:36) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":32:30) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":33:19) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":34:32) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:87) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:95) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":39:11) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":41:12) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:51) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:78) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:86) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":49:38) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":52:34) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":53:19) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:40) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:36) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:65) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:98) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:106) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":57:12) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":59:12) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:51) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:79) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:87) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":67:38) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4) +#loc106 = loc("ynumel"(#loc1)) +#loc107 = loc("xnumel"(#loc2)) +#loc108 = loc("yoffset"(#loc3)) +#loc109 = loc("yoffset"(#loc4)) +#loc110 = loc("yoffset"(#loc5)) +#loc111 = loc("yoffset"(#loc6)) +#loc112 = loc("yoffset"(#loc7)) +#loc113 = loc("yoffset"(#loc8)) +#loc114 = loc("yindex"(#loc9)) +#loc115 = loc("yindex"(#loc10)) +#loc116 = loc("yindex"(#loc11)) +#loc117 = loc("ymask"(#loc12)) +#loc118 = loc("xoffset"(#loc13)) +#loc119 = loc("xoffset"(#loc14)) +#loc120 = loc("xindex"(#loc15)) +#loc121 = loc("xindex"(#loc16)) +#loc122 = loc("xindex"(#loc17)) +#loc123 = loc("xmask"(#loc18)) +#loc124 = loc("y1"(#loc19)) +#loc125 = loc("y0"(#loc20)) +#loc126 = loc("tmp1"(#loc21)) +#loc127 = loc("tmp2"(#loc22)) +#loc128 = loc("tmp3"(#loc23)) +#loc129 = loc("tmp4"(#loc24)) +#loc130 = loc("tmp5"(#loc25)) +#loc131 = loc("tmp5"(#loc26)) +#loc132 = loc("tmp5"(#loc27)) +#loc133 = loc("tmp5"(#loc28)) +#loc134 = loc("tmp5"(#loc29)) +#loc135 = loc("tmp5"(#loc30)) +#loc136 = loc("tmp5"(#loc31)) +#loc137 = loc("tmp5"(#loc32)) +#loc138 = loc("tmp5"(#loc33)) +#loc139 = loc("tmp7"(#loc34)) +#loc140 = loc("tmp7"(#loc35)) +#loc141 = loc("tmp7"(#loc36)) +#loc142 = loc("tmp7"(#loc37)) +#loc143 = loc("tmp7"(#loc38)) +#loc144 = loc("tmp7"(#loc39)) +#loc145 = loc("tmp8"(#loc40)) +#loc146 = loc("tmp9"(#loc41)) +#loc147 = loc("tmp10"(#loc42)) +#loc148 = loc("tmp11"(#loc43)) +#loc149 = loc("tmp12"(#loc44)) +#loc150 = loc("tmp13"(#loc45)) +#loc151 = loc("tmp14"(#loc46)) +#loc152 = loc("tmp14"(#loc47)) +#loc153 = loc("tmp14"(#loc48)) +#loc154 = loc("tmp14"(#loc49)) +#loc155 = loc("tmp14"(#loc50)) +#loc156 = loc("tmp14"(#loc51)) +#loc157 = loc("tmp16"(#loc52)) +#loc158 = loc("tmp18"(#loc53)) +#loc159 = loc("tmp19"(#loc54)) +#loc160 = loc("tmp20"(#loc55)) +#loc161 = loc("tmp21"(#loc56)) +#loc162 = loc("tmp22"(#loc57)) +#loc163 = loc("tmp23"(#loc58)) +#loc164 = loc("tmp23"(#loc59)) +#loc165 = loc("tmp23"(#loc60)) +#loc166 = loc("tmp23"(#loc61)) +#loc167 = loc("tmp23"(#loc62)) +#loc168 = loc("tmp23"(#loc63)) +#loc169 = loc("tmp23"(#loc64)) +#loc170 = loc("tmp23"(#loc65)) +#loc171 = loc("tmp23"(#loc66)) +#loc172 = loc("tmp23"(#loc67)) +#loc173 = loc("tmp25"(#loc68)) +#loc174 = loc("tmp25"(#loc69)) +#loc175 = loc("tmp25"(#loc70)) +#loc176 = loc("tmp25"(#loc71)) +#loc177 = loc("tmp25"(#loc72)) +#loc178 = loc("tmp25"(#loc73)) +#loc179 = loc("tmp25"(#loc74)) +#loc180 = loc("tmp26"(#loc75)) +#loc181 = loc("tmp27"(#loc76)) +#loc182 = loc("tmp28"(#loc77)) +#loc183 = loc("tmp29"(#loc78)) +#loc184 = loc("tmp30"(#loc79)) +#loc185 = loc("tmp31"(#loc80)) +#loc186 = loc("tmp32"(#loc81)) +#loc187 = loc("tmp32"(#loc82)) +#loc188 = loc("tmp32"(#loc83)) +#loc189 = loc("tmp32"(#loc84)) +#loc190 = loc("tmp32"(#loc85)) +#loc191 = loc("tmp32"(#loc86)) +#loc192 = loc("tmp34"(#loc87)) +#loc193 = loc("tmp36"(#loc88)) +#loc194 = loc("tmp37"(#loc89)) +#loc195 = loc("tmp38"(#loc90)) diff --git a/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..8088717d2b119b05218586033863c1a6e15d3d4a --- /dev/null +++ b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir @@ -0,0 +1,245 @@ +#blocked = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0) +#loc68 = loc("in_ptr0"(#loc)) +#loc69 = loc("in_ptr1"(#loc)) +#loc70 = loc("in_ptr2"(#loc)) +#loc71 = loc("in_ptr3"(#loc)) +#loc72 = loc("in_ptr4"(#loc)) +#loc73 = loc("in_ptr5"(#loc)) +#loc74 = loc("out_ptr0"(#loc)) +#loc75 = loc("ynumel"(#loc)) +#loc76 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<-256> : tensor<1024x1xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<12288> : tensor<1024x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<128> : tensor<1024x1xi32, #blocked1> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<1024x1xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<256> : tensor<1024x1xi64, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<32> : tensor<1024x1xi32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<73728> : tensor<1024x1xi32, #blocked1> loc(#loc1) + %cst_6 = arith.constant dense<73728> : tensor<1024x1xi32, #blocked> loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %cst_7 = arith.constant dense<0.000000e+00> : tensor<1024x1xbf16, #blocked> loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %cst_8 = arith.constant dense<9.99999997E-7> : tensor<1024x1xf32, #blocked> loc(#loc1) + %cst_9 = arith.constant dense<1.280000e+02> : tensor<1024x1xf32, #blocked> loc(#loc1) + %cst_10 = arith.constant dense<0.000000e+00> : tensor<1024x1xf32, #blocked> loc(#loc1) + %yoffset = tt.get_program_id y : i32 loc(#loc77) + %yoffset_11 = tt.get_program_id z : i32 loc(#loc78) + %yoffset_12 = tt.get_num_programs y : i32 loc(#loc79) + %yoffset_13 = arith.muli %yoffset_11, %yoffset_12 : i32 loc(#loc80) + %yoffset_14 = arith.addi %yoffset, %yoffset_13 : i32 loc(#loc81) + %yoffset_15 = arith.muli %yoffset_14, %c1024_i32 : i32 loc(#loc82) + %yindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc83) + %yindex_16 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc83) + %yindex_17 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<1024xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1024x1xi32, #blocked> loc(#loc83) + %yindex_18 = tt.expand_dims %yindex_16 {axis = 1 : i32} : tensor<1024xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1024x1xi32, #blocked1> loc(#loc83) + %yindex_19 = tt.splat %yoffset_15 : i32 -> tensor<1024x1xi32, #blocked> loc(#loc84) + %yindex_20 = tt.splat %yoffset_15 : i32 -> tensor<1024x1xi32, #blocked1> loc(#loc84) + %yindex_21 = arith.addi %yindex_19, %yindex_17 : tensor<1024x1xi32, #blocked> loc(#loc84) + %yindex_22 = arith.addi %yindex_20, %yindex_18 : tensor<1024x1xi32, #blocked1> loc(#loc84) + %ymask = arith.cmpi slt, %yindex_21, %cst_6 : tensor<1024x1xi32, #blocked> loc(#loc85) + %ymask_23 = arith.cmpi slt, %yindex_22, %cst_5 : tensor<1024x1xi32, #blocked1> loc(#loc85) + %xoffset = tt.get_program_id x : i32 loc(#loc86) + %xmask = arith.cmpi slt, %xoffset, %c128_i32 : i32 loc(#loc87) + %y1 = arith.divsi %yindex_21, %cst_4 : tensor<1024x1xi32, #blocked> loc(#loc88) + %y0 = arith.remsi %yindex_21, %cst_4 : tensor<1024x1xi32, #blocked> loc(#loc89) + %tmp4 = arith.extsi %y1 : tensor<1024x1xi32, #blocked> to tensor<1024x1xi64, #blocked> loc(#loc90) + %tmp4_24 = arith.cmpi slt, %tmp4, %cst_3 : tensor<1024x1xi64, #blocked> loc(#loc90) + %tmp5 = arith.muli %y0, %cst_2 : tensor<1024x1xi32, #blocked> loc(#loc91) + %tmp5_25 = tt.splat %xoffset : i32 -> tensor<1024x1xi32, #blocked> loc(#loc137) + %tmp5_26 = tt.splat %xoffset : i32 -> tensor<1024x1xi32, #blocked1> loc(#loc137) + %tmp5_27 = arith.addi %tmp5_25, %tmp5 : tensor<1024x1xi32, #blocked> loc(#loc92) + %tmp5_28 = arith.muli %y1, %cst_0 : tensor<1024x1xi32, #blocked> loc(#loc94) + %tmp5_29 = arith.addi %tmp5_27, %tmp5_28 : tensor<1024x1xi32, #blocked> loc(#loc95) + %tmp5_30 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x1x!tt.ptr, #blocked> loc(#loc96) + %tmp5_31 = tt.addptr %tmp5_30, %tmp5_29 : tensor<1024x1x!tt.ptr, #blocked>, tensor<1024x1xi32, #blocked> loc(#loc96) + %tmp5_32 = tt.splat %xmask : i1 -> tensor<1024x1xi1, #blocked> loc(#loc138) + %tmp5_33 = tt.splat %xmask : i1 -> tensor<1024x1xi1, #blocked1> loc(#loc138) + %tmp5_34 = arith.andi %tmp4_24, %tmp5_32 : tensor<1024x1xi1, #blocked> loc(#loc97) + %tmp5_35 = arith.andi %tmp5_34, %ymask : tensor<1024x1xi1, #blocked> loc(#loc98) + %tmp5_36 = tt.load %tmp5_31, %tmp5_35, %cst_7 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr, #blocked> loc(#loc99) + %tmp5_37 = arith.extf %tmp5_36 : tensor<1024x1xbf16, #blocked> to tensor<1024x1xf32, #blocked> loc(#loc100) + %tmp7 = arith.muli %y1, %cst_4 : tensor<1024x1xi32, #blocked> loc(#loc101) + %tmp7_38 = arith.addi %y0, %tmp7 : tensor<1024x1xi32, #blocked> loc(#loc102) + %tmp7_39 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x1x!tt.ptr, #blocked> loc(#loc103) + %tmp7_40 = tt.addptr %tmp7_39, %tmp7_38 : tensor<1024x1x!tt.ptr, #blocked>, tensor<1024x1xi32, #blocked> loc(#loc103) + %tmp7_41 = tt.load %tmp7_40, %tmp5_35, %cst_10 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr, #blocked> loc(#loc104) + %tmp9 = arith.divf %tmp7_41, %cst_9 : tensor<1024x1xf32, #blocked> loc(#loc105) + %tmp11 = arith.addf %tmp9, %cst_8 : tensor<1024x1xf32, #blocked> loc(#loc106) + %tmp12 = tt.extern_elementwise %tmp11 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1024x1xf32, #blocked>) -> tensor<1024x1xf32, #blocked> loc(#loc107) + %tmp13 = arith.mulf %tmp5_37, %tmp12 : tensor<1024x1xf32, #blocked> loc(#loc108) + %tmp14 = tt.addptr %in_ptr2, %xoffset : !tt.ptr, i32 loc(#loc109) + %tmp14_42 = tt.splat %tmp14 : !tt.ptr -> tensor<1024x1x!tt.ptr, #blocked> loc(#loc110) + %tmp14_43 = tt.load %tmp14_42, %tmp5_35, %cst_7 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr, #blocked> loc(#loc110) + %tmp14_44 = arith.extf %tmp14_43 : tensor<1024x1xbf16, #blocked> to tensor<1024x1xf32, #blocked> loc(#loc111) + %tmp16 = arith.mulf %tmp13, %tmp14_44 : tensor<1024x1xf32, #blocked> loc(#loc112) + %tmp20 = arith.cmpi sge, %tmp4, %cst_3 : tensor<1024x1xi64, #blocked> loc(#loc113) + %tmp23 = arith.addi %y1, %cst : tensor<1024x1xi32, #blocked> loc(#loc114) + %tmp23_45 = arith.muli %tmp23, %cst_0 : tensor<1024x1xi32, #blocked> loc(#loc115) + %tmp23_46 = arith.addi %tmp5_27, %tmp23_45 : tensor<1024x1xi32, #blocked> loc(#loc116) + %tmp23_47 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1024x1x!tt.ptr, #blocked> loc(#loc117) + %tmp23_48 = tt.addptr %tmp23_47, %tmp23_46 : tensor<1024x1x!tt.ptr, #blocked>, tensor<1024x1xi32, #blocked> loc(#loc117) + %tmp23_49 = arith.andi %tmp20, %tmp5_32 : tensor<1024x1xi1, #blocked> loc(#loc118) + %tmp23_50 = arith.andi %tmp23_49, %ymask : tensor<1024x1xi1, #blocked> loc(#loc119) + %tmp23_51 = tt.load %tmp23_48, %tmp23_50, %cst_7 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr, #blocked> loc(#loc120) + %tmp23_52 = arith.extf %tmp23_51 : tensor<1024x1xbf16, #blocked> to tensor<1024x1xf32, #blocked> loc(#loc121) + %tmp25 = arith.muli %tmp23, %cst_4 : tensor<1024x1xi32, #blocked> loc(#loc122) + %tmp25_53 = arith.addi %y0, %tmp25 : tensor<1024x1xi32, #blocked> loc(#loc123) + %tmp25_54 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1024x1x!tt.ptr, #blocked> loc(#loc124) + %tmp25_55 = tt.addptr %tmp25_54, %tmp25_53 : tensor<1024x1x!tt.ptr, #blocked>, tensor<1024x1xi32, #blocked> loc(#loc124) + %tmp25_56 = tt.load %tmp25_55, %tmp23_50, %cst_10 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr, #blocked> loc(#loc125) + %tmp27 = arith.divf %tmp25_56, %cst_9 : tensor<1024x1xf32, #blocked> loc(#loc126) + %tmp29 = arith.addf %tmp27, %cst_8 : tensor<1024x1xf32, #blocked> loc(#loc127) + %tmp30 = tt.extern_elementwise %tmp29 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1024x1xf32, #blocked>) -> tensor<1024x1xf32, #blocked> loc(#loc128) + %tmp31 = arith.mulf %tmp23_52, %tmp30 : tensor<1024x1xf32, #blocked> loc(#loc129) + %tmp32 = tt.addptr %in_ptr5, %xoffset : !tt.ptr, i32 loc(#loc130) + %tmp32_57 = tt.splat %tmp32 : !tt.ptr -> tensor<1024x1x!tt.ptr, #blocked> loc(#loc131) + %tmp32_58 = tt.load %tmp32_57, %tmp23_50, %cst_7 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr, #blocked> loc(#loc131) + %tmp32_59 = arith.extf %tmp32_58 : tensor<1024x1xbf16, #blocked> to tensor<1024x1xf32, #blocked> loc(#loc132) + %tmp34 = arith.mulf %tmp31, %tmp32_59 : tensor<1024x1xf32, #blocked> loc(#loc133) + %tmp37 = arith.select %tmp20, %tmp34, %cst_10 : tensor<1024x1xi1, #blocked>, tensor<1024x1xf32, #blocked> loc(#loc134) + %tmp38 = arith.select %tmp4_24, %tmp16, %tmp37 : tensor<1024x1xi1, #blocked>, tensor<1024x1xf32, #blocked> loc(#loc139) + %0 = arith.muli %yindex_22, %cst_1 : tensor<1024x1xi32, #blocked1> loc(#loc62) + %1 = arith.addi %tmp5_26, %0 : tensor<1024x1xi32, #blocked1> loc(#loc63) + %2 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x1x!tt.ptr, #blocked1> loc(#loc64) + %3 = tt.addptr %2, %1 : tensor<1024x1x!tt.ptr, #blocked1>, tensor<1024x1xi32, #blocked1> loc(#loc64) + %4 = arith.andi %tmp5_33, %ymask_23 : tensor<1024x1xi1, #blocked1> loc(#loc65) + %5 = arith.truncf %tmp38 : tensor<1024x1xf32, #blocked> to tensor<1024x1xbf16, #blocked> loc(#loc66) + %6 = ttg.convert_layout %5 : tensor<1024x1xbf16, #blocked> -> tensor<1024x1xbf16, #blocked1> loc(#loc66) + tt.store %3, %6, %4 : tensor<1024x1x!tt.ptr, #blocked1> loc(#loc66) + tt.return loc(#loc67) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4) +#loc77 = loc("yoffset"(#loc2)) +#loc78 = loc("yoffset"(#loc3)) +#loc79 = loc("yoffset"(#loc4)) +#loc80 = loc("yoffset"(#loc5)) +#loc81 = loc("yoffset"(#loc6)) +#loc82 = loc("yoffset"(#loc7)) +#loc83 = loc("yindex"(#loc8)) +#loc84 = loc("yindex"(#loc9)) +#loc85 = loc("ymask"(#loc10)) +#loc86 = loc("xoffset"(#loc11)) +#loc87 = loc("xmask"(#loc12)) +#loc88 = loc("y1"(#loc13)) +#loc89 = loc("y0"(#loc14)) +#loc90 = loc("tmp4"(#loc15)) +#loc91 = loc("tmp5"(#loc16)) +#loc92 = loc("tmp5"(#loc17)) +#loc93 = loc("xindex"(#loc18)) +#loc94 = loc("tmp5"(#loc19)) +#loc95 = loc("tmp5"(#loc20)) +#loc96 = loc("tmp5"(#loc21)) +#loc97 = loc("tmp5"(#loc22)) +#loc98 = loc("tmp5"(#loc23)) +#loc99 = loc("tmp5"(#loc24)) +#loc100 = loc("tmp5"(#loc25)) +#loc101 = loc("tmp7"(#loc26)) +#loc102 = loc("tmp7"(#loc27)) +#loc103 = loc("tmp7"(#loc28)) +#loc104 = loc("tmp7"(#loc29)) +#loc105 = loc("tmp9"(#loc30)) +#loc106 = loc("tmp11"(#loc31)) +#loc107 = loc("tmp12"(#loc32)) +#loc108 = loc("tmp13"(#loc33)) +#loc109 = loc("tmp14"(#loc34)) +#loc110 = loc("tmp14"(#loc35)) +#loc111 = loc("tmp14"(#loc36)) +#loc112 = loc("tmp16"(#loc37)) +#loc113 = loc("tmp20"(#loc38)) +#loc114 = loc("tmp23"(#loc39)) +#loc115 = loc("tmp23"(#loc40)) +#loc116 = loc("tmp23"(#loc41)) +#loc117 = loc("tmp23"(#loc42)) +#loc118 = loc("tmp23"(#loc43)) +#loc119 = loc("tmp23"(#loc44)) +#loc120 = loc("tmp23"(#loc45)) +#loc121 = loc("tmp23"(#loc46)) +#loc122 = loc("tmp25"(#loc47)) +#loc123 = loc("tmp25"(#loc48)) +#loc124 = loc("tmp25"(#loc49)) +#loc125 = loc("tmp25"(#loc50)) +#loc126 = loc("tmp27"(#loc51)) +#loc127 = loc("tmp29"(#loc52)) +#loc128 = loc("tmp30"(#loc53)) +#loc129 = loc("tmp31"(#loc54)) +#loc130 = loc("tmp32"(#loc55)) +#loc131 = loc("tmp32"(#loc56)) +#loc132 = loc("tmp32"(#loc57)) +#loc133 = loc("tmp34"(#loc58)) +#loc134 = loc("tmp37"(#loc59)) +#loc135 = loc("tmp38"(#loc60)) +#loc136 = loc("tmp19"(#loc61)) +#loc137 = loc(fused[#loc92, #loc93]) +#loc138 = loc(fused[#loc97, #loc87]) +#loc139 = loc(fused[#loc135, #loc136]) diff --git a/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttir b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..a436cfae703c216844493123f1bec2ef26eec285 --- /dev/null +++ b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttir @@ -0,0 +1,235 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0) +#loc69 = loc("in_ptr0"(#loc)) +#loc70 = loc("in_ptr1"(#loc)) +#loc71 = loc("in_ptr2"(#loc)) +#loc72 = loc("in_ptr3"(#loc)) +#loc73 = loc("in_ptr4"(#loc)) +#loc74 = loc("in_ptr5"(#loc)) +#loc75 = loc("out_ptr0"(#loc)) +#loc76 = loc("ynumel"(#loc)) +#loc77 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xmask = arith.constant 128 : i32 loc(#loc78) + %cst = arith.constant dense<0.000000e+00> : tensor<1024x1xbf16> loc(#loc2) + %cst_0 = arith.constant dense<-256> : tensor<1024x1xi32> loc(#loc2) + %cst_1 = arith.constant dense<9.99999997E-7> : tensor<1024x1xf32> loc(#loc2) + %cst_2 = arith.constant dense<1.280000e+02> : tensor<1024x1xf32> loc(#loc2) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<1024x1xf32> loc(#loc2) + %cst_4 = arith.constant dense<12288> : tensor<1024x1xi32> loc(#loc2) + %cst_5 = arith.constant dense<128> : tensor<1024x1xi32> loc(#loc2) + %cst_6 = arith.constant dense<256> : tensor<1024x1xi64> loc(#loc2) + %cst_7 = arith.constant dense<32> : tensor<1024x1xi32> loc(#loc2) + %ymask = arith.constant dense<73728> : tensor<1024x1xi32> loc(#loc79) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc2) + %yoffset = tt.get_program_id y : i32 loc(#loc80) + %yoffset_8 = tt.get_program_id z : i32 loc(#loc81) + %yoffset_9 = tt.get_num_programs y : i32 loc(#loc82) + %yoffset_10 = arith.muli %yoffset_8, %yoffset_9 : i32 loc(#loc83) + %yoffset_11 = arith.addi %yoffset, %yoffset_10 : i32 loc(#loc84) + %yoffset_12 = arith.muli %yoffset_11, %c1024_i32 : i32 loc(#loc85) + %yindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc86) + %yindex_13 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<1024xi32> -> tensor<1024x1xi32> loc(#loc87) + %yindex_14 = tt.splat %yoffset_12 : i32 -> tensor<1024x1xi32> loc(#loc88) + %yindex_15 = arith.addi %yindex_14, %yindex_13 : tensor<1024x1xi32> loc(#loc88) + %ymask_16 = arith.cmpi slt, %yindex_15, %ymask : tensor<1024x1xi32> loc(#loc79) + %xoffset = tt.get_program_id x : i32 loc(#loc89) + %xmask_17 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc78) + %y1 = arith.divsi %yindex_15, %cst_7 : tensor<1024x1xi32> loc(#loc90) + %y0 = arith.remsi %yindex_15, %cst_7 : tensor<1024x1xi32> loc(#loc91) + %tmp4 = arith.extsi %y1 : tensor<1024x1xi32> to tensor<1024x1xi64> loc(#loc92) + %tmp4_18 = arith.cmpi slt, %tmp4, %cst_6 : tensor<1024x1xi64> loc(#loc92) + %tmp5 = arith.muli %y0, %cst_5 : tensor<1024x1xi32> loc(#loc93) + %tmp5_19 = tt.splat %xoffset : i32 -> tensor<1024x1xi32> loc(#loc139) + %tmp5_20 = arith.addi %tmp5_19, %tmp5 : tensor<1024x1xi32> loc(#loc94) + %tmp5_21 = arith.muli %y1, %cst_4 : tensor<1024x1xi32> loc(#loc96) + %tmp5_22 = arith.addi %tmp5_20, %tmp5_21 : tensor<1024x1xi32> loc(#loc97) + %tmp5_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x1x!tt.ptr> loc(#loc98) + %tmp5_24 = tt.addptr %tmp5_23, %tmp5_22 : tensor<1024x1x!tt.ptr>, tensor<1024x1xi32> loc(#loc98) + %tmp5_25 = tt.splat %xmask_17 : i1 -> tensor<1024x1xi1> loc(#loc140) + %tmp5_26 = arith.andi %tmp4_18, %tmp5_25 : tensor<1024x1xi1> loc(#loc99) + %tmp5_27 = arith.andi %tmp5_26, %ymask_16 : tensor<1024x1xi1> loc(#loc100) + %tmp5_28 = tt.load %tmp5_24, %tmp5_27, %cst evictionPolicy = evict_last : tensor<1024x1x!tt.ptr> loc(#loc101) + %tmp5_29 = arith.extf %tmp5_28 : tensor<1024x1xbf16> to tensor<1024x1xf32> loc(#loc102) + %tmp7 = arith.muli %y1, %cst_7 : tensor<1024x1xi32> loc(#loc103) + %tmp7_30 = arith.addi %y0, %tmp7 : tensor<1024x1xi32> loc(#loc104) + %tmp7_31 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x1x!tt.ptr> loc(#loc105) + %tmp7_32 = tt.addptr %tmp7_31, %tmp7_30 : tensor<1024x1x!tt.ptr>, tensor<1024x1xi32> loc(#loc105) + %tmp7_33 = tt.load %tmp7_32, %tmp5_27, %cst_3 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr> loc(#loc106) + %tmp9 = arith.divf %tmp7_33, %cst_2 : tensor<1024x1xf32> loc(#loc107) + %tmp11 = arith.addf %tmp9, %cst_1 : tensor<1024x1xf32> loc(#loc108) + %tmp12 = tt.extern_elementwise %tmp11 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1024x1xf32>) -> tensor<1024x1xf32> loc(#loc109) + %tmp13 = arith.mulf %tmp5_29, %tmp12 : tensor<1024x1xf32> loc(#loc110) + %tmp14 = tt.addptr %in_ptr2, %xoffset : !tt.ptr, i32 loc(#loc111) + %tmp14_34 = tt.splat %tmp14 : !tt.ptr -> tensor<1024x1x!tt.ptr> loc(#loc111) + %tmp14_35 = tt.load %tmp14_34, %tmp5_27, %cst evictionPolicy = evict_last : tensor<1024x1x!tt.ptr> loc(#loc112) + %tmp14_36 = arith.extf %tmp14_35 : tensor<1024x1xbf16> to tensor<1024x1xf32> loc(#loc113) + %tmp16 = arith.mulf %tmp13, %tmp14_36 : tensor<1024x1xf32> loc(#loc114) + %tmp20 = arith.cmpi sge, %tmp4, %cst_6 : tensor<1024x1xi64> loc(#loc115) + %tmp23 = arith.addi %y1, %cst_0 : tensor<1024x1xi32> loc(#loc116) + %tmp23_37 = arith.muli %tmp23, %cst_4 : tensor<1024x1xi32> loc(#loc117) + %tmp23_38 = arith.addi %tmp5_20, %tmp23_37 : tensor<1024x1xi32> loc(#loc118) + %tmp23_39 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1024x1x!tt.ptr> loc(#loc119) + %tmp23_40 = tt.addptr %tmp23_39, %tmp23_38 : tensor<1024x1x!tt.ptr>, tensor<1024x1xi32> loc(#loc119) + %tmp23_41 = arith.andi %tmp20, %tmp5_25 : tensor<1024x1xi1> loc(#loc120) + %tmp23_42 = arith.andi %tmp23_41, %ymask_16 : tensor<1024x1xi1> loc(#loc121) + %tmp23_43 = tt.load %tmp23_40, %tmp23_42, %cst evictionPolicy = evict_last : tensor<1024x1x!tt.ptr> loc(#loc122) + %tmp23_44 = arith.extf %tmp23_43 : tensor<1024x1xbf16> to tensor<1024x1xf32> loc(#loc123) + %tmp25 = arith.muli %tmp23, %cst_7 : tensor<1024x1xi32> loc(#loc124) + %tmp25_45 = arith.addi %y0, %tmp25 : tensor<1024x1xi32> loc(#loc125) + %tmp25_46 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1024x1x!tt.ptr> loc(#loc126) + %tmp25_47 = tt.addptr %tmp25_46, %tmp25_45 : tensor<1024x1x!tt.ptr>, tensor<1024x1xi32> loc(#loc126) + %tmp25_48 = tt.load %tmp25_47, %tmp23_42, %cst_3 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr> loc(#loc127) + %tmp27 = arith.divf %tmp25_48, %cst_2 : tensor<1024x1xf32> loc(#loc128) + %tmp29 = arith.addf %tmp27, %cst_1 : tensor<1024x1xf32> loc(#loc129) + %tmp30 = tt.extern_elementwise %tmp29 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1024x1xf32>) -> tensor<1024x1xf32> loc(#loc130) + %tmp31 = arith.mulf %tmp23_44, %tmp30 : tensor<1024x1xf32> loc(#loc131) + %tmp32 = tt.addptr %in_ptr5, %xoffset : !tt.ptr, i32 loc(#loc132) + %tmp32_49 = tt.splat %tmp32 : !tt.ptr -> tensor<1024x1x!tt.ptr> loc(#loc132) + %tmp32_50 = tt.load %tmp32_49, %tmp23_42, %cst evictionPolicy = evict_last : tensor<1024x1x!tt.ptr> loc(#loc133) + %tmp32_51 = arith.extf %tmp32_50 : tensor<1024x1xbf16> to tensor<1024x1xf32> loc(#loc134) + %tmp34 = arith.mulf %tmp31, %tmp32_51 : tensor<1024x1xf32> loc(#loc135) + %tmp37 = arith.select %tmp20, %tmp34, %cst_3 : tensor<1024x1xi1>, tensor<1024x1xf32> loc(#loc136) + %tmp38 = arith.select %tmp4_18, %tmp16, %tmp37 : tensor<1024x1xi1>, tensor<1024x1xf32> loc(#loc141) + %0 = arith.muli %yindex_15, %cst_5 : tensor<1024x1xi32> loc(#loc63) + %1 = arith.addi %tmp5_19, %0 : tensor<1024x1xi32> loc(#loc64) + %2 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x1x!tt.ptr> loc(#loc65) + %3 = tt.addptr %2, %1 : tensor<1024x1x!tt.ptr>, tensor<1024x1xi32> loc(#loc65) + %4 = arith.andi %tmp5_25, %ymask_16 : tensor<1024x1xi1> loc(#loc66) + %5 = arith.truncf %tmp38 : tensor<1024x1xf32> to tensor<1024x1xbf16> loc(#loc67) + tt.store %3, %5, %4 : tensor<1024x1x!tt.ptr> loc(#loc67) + tt.return loc(#loc68) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21) +#loc2 = loc(unknown) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:36) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4) +#loc78 = loc("xmask"(#loc1)) +#loc79 = loc("ymask"(#loc3)) +#loc80 = loc("yoffset"(#loc4)) +#loc81 = loc("yoffset"(#loc5)) +#loc82 = loc("yoffset"(#loc6)) +#loc83 = loc("yoffset"(#loc7)) +#loc84 = loc("yoffset"(#loc8)) +#loc85 = loc("yoffset"(#loc9)) +#loc86 = loc("yindex"(#loc10)) +#loc87 = loc("yindex"(#loc11)) +#loc88 = loc("yindex"(#loc12)) +#loc89 = loc("xoffset"(#loc13)) +#loc90 = loc("y1"(#loc14)) +#loc91 = loc("y0"(#loc15)) +#loc92 = loc("tmp4"(#loc16)) +#loc93 = loc("tmp5"(#loc17)) +#loc94 = loc("tmp5"(#loc18)) +#loc95 = loc("xindex"(#loc19)) +#loc96 = loc("tmp5"(#loc20)) +#loc97 = loc("tmp5"(#loc21)) +#loc98 = loc("tmp5"(#loc22)) +#loc99 = loc("tmp5"(#loc23)) +#loc100 = loc("tmp5"(#loc24)) +#loc101 = loc("tmp5"(#loc25)) +#loc102 = loc("tmp5"(#loc26)) +#loc103 = loc("tmp7"(#loc27)) +#loc104 = loc("tmp7"(#loc28)) +#loc105 = loc("tmp7"(#loc29)) +#loc106 = loc("tmp7"(#loc30)) +#loc107 = loc("tmp9"(#loc31)) +#loc108 = loc("tmp11"(#loc32)) +#loc109 = loc("tmp12"(#loc33)) +#loc110 = loc("tmp13"(#loc34)) +#loc111 = loc("tmp14"(#loc35)) +#loc112 = loc("tmp14"(#loc36)) +#loc113 = loc("tmp14"(#loc37)) +#loc114 = loc("tmp16"(#loc38)) +#loc115 = loc("tmp20"(#loc39)) +#loc116 = loc("tmp23"(#loc40)) +#loc117 = loc("tmp23"(#loc41)) +#loc118 = loc("tmp23"(#loc42)) +#loc119 = loc("tmp23"(#loc43)) +#loc120 = loc("tmp23"(#loc44)) +#loc121 = loc("tmp23"(#loc45)) +#loc122 = loc("tmp23"(#loc46)) +#loc123 = loc("tmp23"(#loc47)) +#loc124 = loc("tmp25"(#loc48)) +#loc125 = loc("tmp25"(#loc49)) +#loc126 = loc("tmp25"(#loc50)) +#loc127 = loc("tmp25"(#loc51)) +#loc128 = loc("tmp27"(#loc52)) +#loc129 = loc("tmp29"(#loc53)) +#loc130 = loc("tmp30"(#loc54)) +#loc131 = loc("tmp31"(#loc55)) +#loc132 = loc("tmp32"(#loc56)) +#loc133 = loc("tmp32"(#loc57)) +#loc134 = loc("tmp32"(#loc58)) +#loc135 = loc("tmp34"(#loc59)) +#loc136 = loc("tmp37"(#loc60)) +#loc137 = loc("tmp38"(#loc61)) +#loc138 = loc("tmp19"(#loc62)) +#loc139 = loc(fused[#loc94, #loc95]) +#loc140 = loc(fused[#loc99, #loc78]) +#loc141 = loc(fused[#loc137, #loc138]) diff --git a/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..84d8ce34d74e23c71524ab7c2b0da86972d0eef7 --- /dev/null +++ b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json"}} \ No newline at end of file diff --git a/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..b1639b04b5f54f687154d6ac1d2ee4b09d1c59e3 Binary files /dev/null and b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin differ diff --git a/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c2dbcb990bca7cd4df3f246e84d379e2f2d85d47 --- /dev/null +++ b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json @@ -0,0 +1 @@ +{"hash": "096ece262b42d5675d174abdb1346c7821319ff6915792a636b52f871e5e652f", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 2048, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0"} \ No newline at end of file diff --git a/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..84b98ec516fe8bb215072712ba024e3414b134a1 --- /dev/null +++ b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir @@ -0,0 +1,779 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %13 = shl i32 %12, 6, !dbg !9 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %15 = and i32 %14, 126, !dbg !10 + %16 = lshr exact i32 %15, 1, !dbg !10 + %17 = or disjoint i32 %16, %13, !dbg !11 + %18 = shl nuw nsw i32 %14, 2, !dbg !12 + %19 = and i32 %18, 4, !dbg !12 + %20 = sdiv i32 %17, 32, !dbg !13 + %21 = shl i32 %17, 7 + %22 = shl i32 %20, 15 + %23 = add i32 %22, %21 + %24 = add i32 %23, 4096 + %25 = zext nneg i32 %19 to i64, !dbg !14 + br label %26, !dbg !14 + +26: ; preds = %11, %26 + %indvars.iv = phi i64 [ 0, %11 ], [ %indvars.iv.next, %26 ] + %27 = phi <8 x float> [ zeroinitializer, %11 ], [ %59, %26 ] + %28 = trunc nuw nsw i64 %indvars.iv to i32, !dbg !15 + %29 = or disjoint i32 %19, %28, !dbg !15 + %30 = add i32 %24, %29, !dbg !15 + %31 = sext i32 %30 to i64, !dbg !16 + %32 = getelementptr bfloat, ptr addrspace(1) %2, i64 %31, !dbg !16 + %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17 + %34 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %32, i64 %33, i1 true) #6, !dbg !17 + %35 = extractvalue { i32, i32 } %34, 0, !dbg !17 + %36 = bitcast i32 %35 to <2 x bfloat>, !dbg !17 + %37 = extractvalue { i32, i32 } %34, 1, !dbg !17 + %38 = bitcast i32 %37 to <2 x bfloat>, !dbg !17 + %39 = add i32 %23, %29, !dbg !18 + %40 = sext i32 %39 to i64, !dbg !19 + %41 = getelementptr bfloat, ptr addrspace(1) %2, i64 %40, !dbg !19 + %42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !20 + %43 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %41, i64 %42, i1 true) #6, !dbg !20 + %44 = extractvalue { i32, i32 } %43, 0, !dbg !20 + %45 = bitcast i32 %44 to <2 x bfloat>, !dbg !20 + %46 = extractvalue { i32, i32 } %43, 1, !dbg !20 + %47 = bitcast i32 %46 to <2 x bfloat>, !dbg !20 + %48 = shufflevector <2 x bfloat> %45, <2 x bfloat> %47, <8 x i32> , !dbg !21 + %49 = shufflevector <2 x bfloat> %36, <2 x bfloat> poison, <8 x i32> , !dbg !21 + %50 = shufflevector <8 x bfloat> %48, <8 x bfloat> %49, <8 x i32> , !dbg !21 + %51 = shufflevector <2 x bfloat> %36, <2 x bfloat> poison, <8 x i32> , !dbg !21 + %52 = shufflevector <8 x bfloat> %50, <8 x bfloat> %51, <8 x i32> , !dbg !21 + %53 = shufflevector <2 x bfloat> %38, <2 x bfloat> poison, <8 x i32> , !dbg !21 + %54 = shufflevector <8 x bfloat> %52, <8 x bfloat> %53, <8 x i32> , !dbg !21 + %55 = shufflevector <2 x bfloat> %38, <2 x bfloat> poison, <8 x i32> , !dbg !21 + %56 = shufflevector <8 x bfloat> %54, <8 x bfloat> %55, <8 x i32> , !dbg !21 + %57 = fpext <8 x bfloat> %56 to <8 x float>, !dbg !21 + %58 = fmul <8 x float> %57, %57, !dbg !22 + %59 = fadd <8 x float> %27, %58, !dbg !23 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8, !dbg !14 + %60 = icmp samesign ult i64 %indvars.iv, 120, !dbg !14 + br i1 %60, label %26, label %61, !dbg !14 + +61: ; preds = %26 + %62 = and i32 %14, 63, !dbg !10 + %63 = or disjoint i32 %13, %62, !dbg !11 + %64 = and i32 %14, 64, !dbg !12 + %65 = sdiv i32 %63, 32, !dbg !13 + %shift = shufflevector <8 x float> %59, <8 x float> poison, <8 x i32> , !dbg !24 + %foldExtExtBinop = fadd <8 x float> %59, %shift, !dbg !24 + %shift98 = shufflevector <8 x float> %59, <8 x float> poison, <8 x i32> , !dbg !24 + %foldExtExtBinop99 = fadd <8 x float> %shift98, %foldExtExtBinop, !dbg !24 + %shift101 = shufflevector <8 x float> %59, <8 x float> poison, <8 x i32> , !dbg !24 + %foldExtExtBinop102 = fadd <8 x float> %shift101, %foldExtExtBinop99, !dbg !24 + %66 = extractelement <8 x float> %foldExtExtBinop102, i64 4, !dbg !24 + %67 = bitcast float %66 to i32, !dbg !27 + %68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 1, i32 31), !dbg !27 + %69 = bitcast i32 %68 to float, !dbg !27 + %70 = fadd float %66, %69, !dbg !24 + %shift104 = shufflevector <8 x float> %59, <8 x float> poison, <8 x i32> , !dbg !30 + %foldExtExtBinop105 = fadd <8 x float> %59, %shift104, !dbg !30 + %shift107 = shufflevector <8 x float> %59, <8 x float> poison, <8 x i32> , !dbg !30 + %foldExtExtBinop108 = fadd <8 x float> %shift107, %foldExtExtBinop105, !dbg !30 + %shift110 = shufflevector <8 x float> %59, <8 x float> poison, <8 x i32> , !dbg !30 + %foldExtExtBinop111 = fadd <8 x float> %shift110, %foldExtExtBinop108, !dbg !30 + %71 = extractelement <8 x float> %foldExtExtBinop111, i64 0, !dbg !30 + %72 = bitcast float %71 to i32, !dbg !31 + %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 1, i32 31), !dbg !31 + %74 = bitcast i32 %73 to float, !dbg !31 + %75 = fadd float %71, %74, !dbg !30 + %76 = shl i32 %20, 7, !dbg !33 + %77 = tail call float @llvm.nvvm.div.full(float %75, float 1.280000e+02), !dbg !34 + %78 = fadd float %77, 0x3EB0C6F7A0000000, !dbg !35 + %79 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i = icmp eq i32 %79, 0, !dbg !36 + br i1 %.not.i, label %82, label %80, !dbg !36 + +80: ; preds = %61 + %81 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %78), !dbg !36 + br label %__nv_rsqrtf.exit, !dbg !36 + +82: ; preds = %61 + %83 = tail call float @llvm.nvvm.rsqrt.approx.f(float %78), !dbg !36 + br label %__nv_rsqrtf.exit, !dbg !36 + +__nv_rsqrtf.exit: ; preds = %80, %82 + %.0.i = phi float [ %81, %80 ], [ %83, %82 ], !dbg !36 + %84 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %85 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %86 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i28 = icmp eq i32 %86, 0, !dbg !36 + br i1 %.not.i28, label %89, label %87, !dbg !36 + +87: ; preds = %__nv_rsqrtf.exit + %88 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %78), !dbg !36 + br label %__nv_rsqrtf.exit30, !dbg !36 + +89: ; preds = %__nv_rsqrtf.exit + %90 = tail call float @llvm.nvvm.rsqrt.approx.f(float %78), !dbg !36 + br label %__nv_rsqrtf.exit30, !dbg !36 + +__nv_rsqrtf.exit30: ; preds = %87, %89 + %.0.i29 = phi float [ %88, %87 ], [ %90, %89 ], !dbg !36 + %91 = shl nuw nsw i32 %15, 1, !dbg !37 + %92 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %91, !dbg !37 + store float %.0.i, ptr addrspace(3) %92, align 4, !dbg !37 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37 + %93 = shl nuw nsw i32 %62, 2, !dbg !37 + %94 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %93, !dbg !37 + %95 = load float, ptr addrspace(3) %94, align 4, !dbg !37 + %96 = tail call float @llvm.nvvm.div.full(float %70, float 1.280000e+02), !dbg !38 + %97 = fadd float %96, 0x3EB0C6F7A0000000, !dbg !39 + %98 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40 + %.not.i31 = icmp eq i32 %98, 0, !dbg !40 + br i1 %.not.i31, label %101, label %99, !dbg !40 + +99: ; preds = %__nv_rsqrtf.exit30 + %100 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %97), !dbg !40 + br label %__nv_rsqrtf.exit33, !dbg !40 + +101: ; preds = %__nv_rsqrtf.exit30 + %102 = tail call float @llvm.nvvm.rsqrt.approx.f(float %97), !dbg !40 + br label %__nv_rsqrtf.exit33, !dbg !40 + +__nv_rsqrtf.exit33: ; preds = %99, %101 + %.0.i32 = phi float [ %100, %99 ], [ %102, %101 ], !dbg !40 + %103 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40 + %104 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40 + %105 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40 + %.not.i40 = icmp eq i32 %105, 0, !dbg !40 + br i1 %.not.i40, label %108, label %106, !dbg !40 + +106: ; preds = %__nv_rsqrtf.exit33 + %107 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %97), !dbg !40 + br label %__nv_rsqrtf.exit42, !dbg !40 + +108: ; preds = %__nv_rsqrtf.exit33 + %109 = tail call float @llvm.nvvm.rsqrt.approx.f(float %97), !dbg !40 + br label %__nv_rsqrtf.exit42, !dbg !40 + +__nv_rsqrtf.exit42: ; preds = %106, %108 + %.0.i41 = phi float [ %107, %106 ], [ %109, %108 ], !dbg !40 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !41 + store float %.0.i32, ptr addrspace(3) %92, align 4, !dbg !41 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !41 + %110 = load float, ptr addrspace(3) %94, align 4, !dbg !41 + %111 = shl i32 %17, 7, !dbg !42 + %112 = shl nuw nsw i32 %14, 3 + %113 = and i32 %112, 120 + %114 = and i32 %18, 384 + %115 = and i32 %14, 16 + %116 = icmp eq i32 %115, 0 + %117 = select i1 %116, i32 0, i32 1032 + %118 = or disjoint i32 %113, %114 + %119 = xor i32 %118, %117 + %120 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %119 + %121 = getelementptr inbounds nuw i8, ptr addrspace(3) %120, i32 512 + %122 = shl nuw nsw i32 %14, 4 + %123 = and i32 %122, 112 + %124 = and i32 %112, 896 + %125 = and i32 %14, 8 + %126 = icmp eq i32 %125, 0 + %127 = select i1 %126, i32 0, i32 1032 + %128 = or disjoint i32 %123, %124 + %129 = or disjoint i32 %128, %127 + %130 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %129 + %131 = xor i32 %129, 8 + %132 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %131 + %133 = icmp eq i32 %64, 0 + %134 = shl i32 %63, 7 + %135 = shl i32 %65, 15 + %136 = add i32 %135, %134 + %137 = icmp ne i32 %64, 0 + %138 = add i32 %136, 4097 + %139 = add i32 %136, 4099 + %140 = add i32 %136, 4101 + %141 = add i32 %136, 4103 + %142 = add i32 %136, 4096 + %143 = add i32 %136, 4098 + %144 = add i32 %136, 4100 + %145 = add i32 %136, 4102 + %146 = select i1 %116, i32 0, i32 516 + %147 = or disjoint i32 %114, %146 + %148 = or disjoint i32 %147, %113 + %149 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %148 + %150 = xor i32 %148, 4 + %151 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %150 + %152 = and i32 %18, 124 + %153 = and i32 %14, 32 + %154 = icmp eq i32 %153, 0 + %155 = select i1 %154, i32 0, i32 516 + %156 = shl nuw nsw i32 %64, 1 + %157 = xor i32 %155, %152 + %158 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %156 + %159 = getelementptr inbounds nuw i8, ptr addrspace(3) %158, i32 %157 + %160 = getelementptr inbounds nuw i8, ptr addrspace(3) %159, i32 256 + %161 = sext i32 %145 to i64, !dbg !43 + %162 = sext i32 %144 to i64, !dbg !43 + %163 = sext i32 %143 to i64, !dbg !43 + %164 = sext i32 %142 to i64, !dbg !43 + %165 = sext i32 %141 to i64, !dbg !43 + %166 = sext i32 %140 to i64, !dbg !43 + %167 = sext i32 %139 to i64, !dbg !43 + %168 = sext i32 %138 to i64, !dbg !43 + %169 = sext i32 %136 to i64, !dbg !43 + %170 = sext i32 %76 to i64, !dbg !43 + %171 = sext i32 %111 to i64, !dbg !43 + %invariant.gep = getelementptr bfloat, ptr addrspace(1) %2, i64 %169, !dbg !43 + %invariant.gep60 = getelementptr bfloat, ptr addrspace(1) %2, i64 %169, !dbg !43 + %invariant.gep62 = getelementptr bfloat, ptr addrspace(1) %2, i64 %169, !dbg !43 + %invariant.gep64 = getelementptr bfloat, ptr addrspace(1) %2, i64 %169, !dbg !43 + %invariant.gep66 = getelementptr bfloat, ptr addrspace(1) %2, i64 %169, !dbg !43 + %invariant.gep68 = getelementptr bfloat, ptr addrspace(1) %2, i64 %169, !dbg !43 + %invariant.gep70 = getelementptr bfloat, ptr addrspace(1) %2, i64 %169, !dbg !43 + %invariant.gep72 = getelementptr bfloat, ptr addrspace(1) %2, i64 %169, !dbg !43 + %invariant.gep74 = getelementptr bfloat, ptr addrspace(1) %2, i64 %168, !dbg !43 + %invariant.gep76 = getelementptr bfloat, ptr addrspace(1) %2, i64 %167, !dbg !43 + %invariant.gep78 = getelementptr bfloat, ptr addrspace(1) %2, i64 %166, !dbg !43 + %invariant.gep80 = getelementptr bfloat, ptr addrspace(1) %2, i64 %165, !dbg !43 + %invariant.gep82 = getelementptr bfloat, ptr addrspace(1) %2, i64 %164, !dbg !43 + %invariant.gep84 = getelementptr bfloat, ptr addrspace(1) %2, i64 %163, !dbg !43 + %invariant.gep86 = getelementptr bfloat, ptr addrspace(1) %2, i64 %162, !dbg !43 + %invariant.gep88 = getelementptr bfloat, ptr addrspace(1) %2, i64 %161, !dbg !43 + %172 = insertelement <2 x i1> poison, i1 %133, i64 0, !dbg !44 + %173 = shufflevector <2 x i1> %172, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !44 + %174 = insertelement <2 x float> poison, float %95, i64 0, !dbg !37 + %175 = shufflevector <2 x float> %174, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !37 + %176 = insertelement <2 x float> poison, float %110, i64 0, !dbg !41 + %177 = shufflevector <2 x float> %176, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !41 + br label %178, !dbg !43 + +178: ; preds = %__nv_rsqrtf.exit42, %178 + %indvars.iv51 = phi i64 [ 0, %__nv_rsqrtf.exit42 ], [ %indvars.iv.next52, %178 ] + %179 = or disjoint i64 %indvars.iv51, %25, !dbg !45 + %180 = or disjoint i64 %indvars.iv51, 2, !dbg !46 + %181 = or disjoint i64 %indvars.iv51, 4, !dbg !46 + %182 = or disjoint i64 %indvars.iv51, 6, !dbg !46 + %183 = trunc nuw nsw i64 %179 to i32, !dbg !47 + %184 = add i32 %23, %183, !dbg !47 + %185 = sext i32 %184 to i64, !dbg !48 + %186 = getelementptr bfloat, ptr addrspace(1) %2, i64 %185, !dbg !48 + %187 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !49 + %188 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %186, i64 %187, i1 true) #6, !dbg !49 + %189 = extractvalue { i32, i32 } %188, 0, !dbg !49 + %190 = bitcast i32 %189 to <2 x bfloat>, !dbg !49 + %191 = extractvalue { i32, i32 } %188, 1, !dbg !49 + %192 = bitcast i32 %191 to <2 x bfloat>, !dbg !49 + %193 = extractelement <2 x bfloat> %190, i64 0, !dbg !49 + %194 = extractelement <2 x bfloat> %190, i64 1, !dbg !49 + %195 = extractelement <2 x bfloat> %192, i64 0, !dbg !49 + %196 = extractelement <2 x bfloat> %192, i64 1, !dbg !49 + %197 = fpext bfloat %193 to float, !dbg !50 + %198 = fpext bfloat %194 to float, !dbg !50 + %199 = fpext bfloat %195 to float, !dbg !50 + %200 = fpext bfloat %196 to float, !dbg !50 + %201 = getelementptr bfloat, ptr addrspace(1) %3, i64 %179, !dbg !51 + %202 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !52 + %203 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %201, i64 %202, i1 true) #6, !dbg !52 + %204 = extractvalue { i32, i32 } %203, 0, !dbg !52 + %205 = bitcast i32 %204 to <2 x bfloat>, !dbg !52 + %206 = extractvalue { i32, i32 } %203, 1, !dbg !52 + %207 = bitcast i32 %206 to <2 x bfloat>, !dbg !52 + %208 = extractelement <2 x bfloat> %205, i64 0, !dbg !52 + %209 = extractelement <2 x bfloat> %205, i64 1, !dbg !52 + %210 = extractelement <2 x bfloat> %207, i64 0, !dbg !52 + %211 = extractelement <2 x bfloat> %207, i64 1, !dbg !52 + %212 = fpext bfloat %208 to float, !dbg !53 + %213 = fpext bfloat %209 to float, !dbg !53 + %214 = fpext bfloat %210 to float, !dbg !53 + %215 = fpext bfloat %211 to float, !dbg !53 + %216 = add nuw nsw i64 %179, %170, !dbg !54 + %217 = getelementptr float, ptr addrspace(1) %4, i64 %216, !dbg !55 + %218 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !56 + %219 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %217, i64 %218, i1 true) #6, !dbg !56 + %220 = extractvalue { i32, i32, i32, i32 } %219, 0, !dbg !56 + %221 = extractvalue { i32, i32, i32, i32 } %219, 1, !dbg !56 + %222 = extractvalue { i32, i32, i32, i32 } %219, 2, !dbg !56 + %223 = extractvalue { i32, i32, i32, i32 } %219, 3, !dbg !56 + %224 = bitcast i32 %220 to float, !dbg !56 + %225 = bitcast i32 %221 to float, !dbg !56 + %226 = bitcast i32 %222 to float, !dbg !56 + %227 = bitcast i32 %223 to float, !dbg !56 + %228 = getelementptr float, ptr addrspace(1) %5, i64 %216, !dbg !57 + %229 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58 + %230 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %228, i64 %229, i1 true) #6, !dbg !58 + %231 = extractvalue { i32, i32, i32, i32 } %230, 0, !dbg !58 + %232 = extractvalue { i32, i32, i32, i32 } %230, 1, !dbg !58 + %233 = extractvalue { i32, i32, i32, i32 } %230, 2, !dbg !58 + %234 = extractvalue { i32, i32, i32, i32 } %230, 3, !dbg !58 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !58 + %235 = insertelement <2 x i32> poison, i32 %231, i64 0, !dbg !58 + %236 = insertelement <2 x i32> %235, i32 %233, i64 1, !dbg !58 + store <2 x i32> %236, ptr addrspace(3) %120, align 8, !dbg !58 + %237 = insertelement <2 x i32> poison, i32 %232, i64 0, !dbg !58 + %238 = insertelement <2 x i32> %237, i32 %234, i64 1, !dbg !58 + store <2 x i32> %238, ptr addrspace(3) %121, align 8, !dbg !58 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !58 + %239 = add i32 %24, %183, !dbg !59 + %240 = sext i32 %239 to i64, !dbg !60 + %241 = getelementptr bfloat, ptr addrspace(1) %2, i64 %240, !dbg !60 + %242 = getelementptr bfloat, ptr addrspace(1) %6, i64 %179, !dbg !61 + %243 = or disjoint i64 %indvars.iv51, 1, !dbg !62 + %244 = or disjoint i64 %indvars.iv51, 3, !dbg !62 + %245 = or disjoint i64 %indvars.iv51, 5, !dbg !62 + %246 = or disjoint i64 %indvars.iv51, 7, !dbg !62 + %gep = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %243, !dbg !63 + %gep61 = getelementptr bfloat, ptr addrspace(1) %invariant.gep60, i64 %244, !dbg !63 + %gep63 = getelementptr bfloat, ptr addrspace(1) %invariant.gep62, i64 %245, !dbg !63 + %gep65 = getelementptr bfloat, ptr addrspace(1) %invariant.gep64, i64 %246, !dbg !63 + %247 = getelementptr bfloat, ptr addrspace(1) %3, i64 %243, !dbg !64 + %248 = getelementptr bfloat, ptr addrspace(1) %3, i64 %244, !dbg !64 + %249 = getelementptr bfloat, ptr addrspace(1) %3, i64 %245, !dbg !64 + %250 = getelementptr bfloat, ptr addrspace(1) %3, i64 %246, !dbg !64 + %gep67 = getelementptr bfloat, ptr addrspace(1) %invariant.gep66, i64 %indvars.iv51, !dbg !65 + %gep69 = getelementptr bfloat, ptr addrspace(1) %invariant.gep68, i64 %180, !dbg !65 + %gep71 = getelementptr bfloat, ptr addrspace(1) %invariant.gep70, i64 %181, !dbg !65 + %gep73 = getelementptr bfloat, ptr addrspace(1) %invariant.gep72, i64 %182, !dbg !65 + %251 = getelementptr bfloat, ptr addrspace(1) %3, i64 %indvars.iv51, !dbg !66 + %252 = getelementptr bfloat, ptr addrspace(1) %3, i64 %180, !dbg !66 + %253 = getelementptr bfloat, ptr addrspace(1) %3, i64 %181, !dbg !66 + %254 = getelementptr bfloat, ptr addrspace(1) %3, i64 %182, !dbg !66 + %255 = fmul float %.0.i29, %197, !dbg !67 + %256 = fmul float %.0.i29, %198, !dbg !67 + %257 = fmul float %.0.i29, %199, !dbg !67 + %258 = fmul float %.0.i29, %200, !dbg !67 + %259 = fmul float %255, %212, !dbg !68 + %260 = fmul float %256, %213, !dbg !68 + %261 = fmul float %257, %214, !dbg !68 + %262 = fmul float %258, %215, !dbg !68 + %263 = fmul float %259, %224, !dbg !69 + %264 = fmul float %260, %225, !dbg !69 + %265 = fmul float %261, %226, !dbg !69 + %266 = fmul float %262, %227, !dbg !69 + %267 = insertelement <2 x float> poison, float %263, i64 0, !dbg !69 + %268 = insertelement <2 x float> %267, float %265, i64 1, !dbg !69 + %269 = insertelement <2 x float> poison, float %264, i64 0, !dbg !69 + %270 = insertelement <2 x float> %269, float %266, i64 1, !dbg !69 + %gep75 = getelementptr bfloat, ptr addrspace(1) %invariant.gep74, i64 %indvars.iv51, !dbg !70 + %gep77 = getelementptr bfloat, ptr addrspace(1) %invariant.gep76, i64 %indvars.iv51, !dbg !70 + %gep79 = getelementptr bfloat, ptr addrspace(1) %invariant.gep78, i64 %indvars.iv51, !dbg !70 + %gep81 = getelementptr bfloat, ptr addrspace(1) %invariant.gep80, i64 %indvars.iv51, !dbg !70 + %271 = getelementptr bfloat, ptr addrspace(1) %6, i64 %243, !dbg !71 + %272 = getelementptr bfloat, ptr addrspace(1) %6, i64 %244, !dbg !71 + %273 = getelementptr bfloat, ptr addrspace(1) %6, i64 %245, !dbg !71 + %274 = getelementptr bfloat, ptr addrspace(1) %6, i64 %246, !dbg !71 + %gep83 = getelementptr bfloat, ptr addrspace(1) %invariant.gep82, i64 %indvars.iv51, !dbg !72 + %gep85 = getelementptr bfloat, ptr addrspace(1) %invariant.gep84, i64 %indvars.iv51, !dbg !72 + %gep87 = getelementptr bfloat, ptr addrspace(1) %invariant.gep86, i64 %indvars.iv51, !dbg !72 + %gep89 = getelementptr bfloat, ptr addrspace(1) %invariant.gep88, i64 %indvars.iv51, !dbg !72 + %275 = getelementptr bfloat, ptr addrspace(1) %6, i64 %indvars.iv51, !dbg !73 + %276 = getelementptr bfloat, ptr addrspace(1) %6, i64 %180, !dbg !73 + %277 = getelementptr bfloat, ptr addrspace(1) %6, i64 %181, !dbg !73 + %278 = getelementptr bfloat, ptr addrspace(1) %6, i64 %182, !dbg !73 + %279 = add nuw nsw i64 %179, %171, !dbg !74 + %280 = getelementptr bfloat, ptr addrspace(1) %0, i64 %279, !dbg !75 + %281 = load <2 x float>, ptr addrspace(3) %130, align 8, !dbg !58 + %282 = load <2 x float>, ptr addrspace(3) %132, align 8, !dbg !58 + %283 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !76 + %284 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %241, i64 %283, i1 true) #6, !dbg !76 + %285 = extractvalue { i32, i32 } %284, 0, !dbg !76 + %286 = bitcast i32 %285 to <2 x bfloat>, !dbg !76 + %287 = extractvalue { i32, i32 } %284, 1, !dbg !76 + %288 = bitcast i32 %287 to <2 x bfloat>, !dbg !76 + %289 = extractelement <2 x bfloat> %286, i64 0, !dbg !76 + %290 = extractelement <2 x bfloat> %286, i64 1, !dbg !76 + %291 = extractelement <2 x bfloat> %288, i64 0, !dbg !76 + %292 = extractelement <2 x bfloat> %288, i64 1, !dbg !76 + %293 = fpext bfloat %289 to float, !dbg !77 + %294 = fpext bfloat %290 to float, !dbg !77 + %295 = fpext bfloat %291 to float, !dbg !77 + %296 = fpext bfloat %292 to float, !dbg !77 + %297 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !78 + %298 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %242, i64 %297, i1 true) #6, !dbg !78 + %299 = extractvalue { i32, i32 } %298, 0, !dbg !78 + %300 = bitcast i32 %299 to <2 x bfloat>, !dbg !78 + %301 = extractvalue { i32, i32 } %298, 1, !dbg !78 + %302 = bitcast i32 %301 to <2 x bfloat>, !dbg !78 + %303 = extractelement <2 x bfloat> %300, i64 0, !dbg !78 + %304 = extractelement <2 x bfloat> %300, i64 1, !dbg !78 + %305 = extractelement <2 x bfloat> %302, i64 0, !dbg !78 + %306 = extractelement <2 x bfloat> %302, i64 1, !dbg !78 + %307 = fpext bfloat %303 to float, !dbg !79 + %308 = fpext bfloat %304 to float, !dbg !79 + %309 = fpext bfloat %305 to float, !dbg !79 + %310 = fpext bfloat %306 to float, !dbg !79 + %311 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !80 + %312 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep, i64 %311, i1 %133) #6, !dbg !80 + %313 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !80 + %314 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep61, i64 %313, i1 %133) #6, !dbg !80 + %315 = insertelement <2 x i16> poison, i16 %312, i64 0, !dbg !80 + %316 = insertelement <2 x i16> %315, i16 %314, i64 1, !dbg !80 + %317 = bitcast <2 x i16> %316 to <2 x bfloat>, !dbg !80 + %318 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !80 + %319 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep63, i64 %318, i1 %133) #6, !dbg !80 + %320 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !80 + %321 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep65, i64 %320, i1 %133) #6, !dbg !80 + %322 = insertelement <2 x i16> poison, i16 %319, i64 0, !dbg !80 + %323 = insertelement <2 x i16> %322, i16 %321, i64 1, !dbg !80 + %324 = bitcast <2 x i16> %323 to <2 x bfloat>, !dbg !80 + %325 = fpext <2 x bfloat> %317 to <2 x float>, !dbg !81 + %326 = fpext <2 x bfloat> %324 to <2 x float>, !dbg !81 + %327 = fmul <2 x float> %175, %325, !dbg !37 + %328 = fmul <2 x float> %175, %326, !dbg !37 + %329 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !82 + %330 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %247, i64 %329, i1 %133) #6, !dbg !82 + %331 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !82 + %332 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %248, i64 %331, i1 %133) #6, !dbg !82 + %333 = insertelement <2 x i16> poison, i16 %330, i64 0, !dbg !82 + %334 = insertelement <2 x i16> %333, i16 %332, i64 1, !dbg !82 + %335 = bitcast <2 x i16> %334 to <2 x bfloat>, !dbg !82 + %336 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !82 + %337 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %249, i64 %336, i1 %133) #6, !dbg !82 + %338 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !82 + %339 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %250, i64 %338, i1 %133) #6, !dbg !82 + %340 = insertelement <2 x i16> poison, i16 %337, i64 0, !dbg !82 + %341 = insertelement <2 x i16> %340, i16 %339, i64 1, !dbg !82 + %342 = bitcast <2 x i16> %341 to <2 x bfloat>, !dbg !82 + %343 = fpext <2 x bfloat> %335 to <2 x float>, !dbg !83 + %344 = fpext <2 x bfloat> %342 to <2 x float>, !dbg !83 + %345 = fmul <2 x float> %327, %343, !dbg !84 + %346 = fmul <2 x float> %328, %344, !dbg !84 + %347 = fsub <2 x float> zeroinitializer, %345, !dbg !85 + %348 = fsub <2 x float> zeroinitializer, %346, !dbg !85 + %349 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !86 + %350 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep67, i64 %349, i1 %137) #6, !dbg !86 + %351 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !86 + %352 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep69, i64 %351, i1 %137) #6, !dbg !86 + %353 = insertelement <2 x i16> poison, i16 %350, i64 0, !dbg !86 + %354 = insertelement <2 x i16> %353, i16 %352, i64 1, !dbg !86 + %355 = bitcast <2 x i16> %354 to <2 x bfloat>, !dbg !86 + %356 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !86 + %357 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep71, i64 %356, i1 %137) #6, !dbg !86 + %358 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !86 + %359 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep73, i64 %358, i1 %137) #6, !dbg !86 + %360 = insertelement <2 x i16> poison, i16 %357, i64 0, !dbg !86 + %361 = insertelement <2 x i16> %360, i16 %359, i64 1, !dbg !86 + %362 = bitcast <2 x i16> %361 to <2 x bfloat>, !dbg !86 + %363 = fpext <2 x bfloat> %355 to <2 x float>, !dbg !87 + %364 = fpext <2 x bfloat> %362 to <2 x float>, !dbg !87 + %365 = fmul <2 x float> %175, %363, !dbg !88 + %366 = fmul <2 x float> %175, %364, !dbg !88 + %367 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !89 + %368 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %251, i64 %367, i1 %137) #6, !dbg !89 + %369 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !89 + %370 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %252, i64 %369, i1 %137) #6, !dbg !89 + %371 = insertelement <2 x i16> poison, i16 %368, i64 0, !dbg !89 + %372 = insertelement <2 x i16> %371, i16 %370, i64 1, !dbg !89 + %373 = bitcast <2 x i16> %372 to <2 x bfloat>, !dbg !89 + %374 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !89 + %375 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %253, i64 %374, i1 %137) #6, !dbg !89 + %376 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !89 + %377 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %254, i64 %376, i1 %137) #6, !dbg !89 + %378 = insertelement <2 x i16> poison, i16 %375, i64 0, !dbg !89 + %379 = insertelement <2 x i16> %378, i16 %377, i64 1, !dbg !89 + %380 = bitcast <2 x i16> %379 to <2 x bfloat>, !dbg !89 + %381 = fpext <2 x bfloat> %373 to <2 x float>, !dbg !90 + %382 = fpext <2 x bfloat> %380 to <2 x float>, !dbg !90 + %383 = fmul <2 x float> %365, %381, !dbg !91 + %384 = fmul <2 x float> %366, %382, !dbg !91 + %385 = select <2 x i1> %173, <2 x float> %347, <2 x float> %383, !dbg !44 + %386 = select <2 x i1> %173, <2 x float> %348, <2 x float> %384, !dbg !44 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !69 + store <2 x float> %268, ptr addrspace(3) %120, align 8, !dbg !69 + store <2 x float> %270, ptr addrspace(3) %121, align 8, !dbg !69 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !69 + %387 = load <2 x float>, ptr addrspace(3) %130, align 8, !dbg !69 + %388 = load <2 x float>, ptr addrspace(3) %132, align 8, !dbg !69 + %389 = fmul <2 x float> %281, %385, !dbg !92 + %390 = fmul <2 x float> %282, %386, !dbg !92 + %391 = fadd <2 x float> %389, %387, !dbg !93 + %392 = fadd <2 x float> %390, %388, !dbg !93 + %393 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !94 + %394 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep75, i64 %393, i1 %133) #6, !dbg !94 + %395 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !94 + %396 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep77, i64 %395, i1 %133) #6, !dbg !94 + %397 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !94 + %398 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep79, i64 %397, i1 %133) #6, !dbg !94 + %399 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !94 + %400 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep81, i64 %399, i1 %133) #6, !dbg !94 + %401 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !95 + %402 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %271, i64 %401, i1 %133) #6, !dbg !95 + %403 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !95 + %404 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %272, i64 %403, i1 %133) #6, !dbg !95 + %405 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !95 + %406 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %273, i64 %405, i1 %133) #6, !dbg !95 + %407 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !95 + %408 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %274, i64 %407, i1 %133) #6, !dbg !95 + %409 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96 + %410 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep83, i64 %409, i1 %137) #6, !dbg !96 + %411 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96 + %412 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep85, i64 %411, i1 %137) #6, !dbg !96 + %413 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96 + %414 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep87, i64 %413, i1 %137) #6, !dbg !96 + %415 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96 + %416 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep89, i64 %415, i1 %137) #6, !dbg !96 + %417 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !97 + %418 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %275, i64 %417, i1 %137) #6, !dbg !97 + %419 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !97 + %420 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %276, i64 %419, i1 %137) #6, !dbg !97 + %421 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !97 + %422 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %277, i64 %421, i1 %137) #6, !dbg !97 + %423 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !97 + %424 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %278, i64 %423, i1 %137) #6, !dbg !97 + %425 = fmul float %.0.i41, %293, !dbg !98 + %426 = fmul float %.0.i41, %294, !dbg !98 + %427 = fmul float %.0.i41, %295, !dbg !98 + %428 = fmul float %.0.i41, %296, !dbg !98 + %429 = fmul float %425, %307, !dbg !99 + %430 = fmul float %426, %308, !dbg !99 + %431 = fmul float %427, %309, !dbg !99 + %432 = fmul float %428, %310, !dbg !99 + %433 = fmul float %429, %224, !dbg !100 + %434 = fmul float %430, %225, !dbg !100 + %435 = fmul float %431, %226, !dbg !100 + %436 = fmul float %432, %227, !dbg !100 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !100 + %437 = insertelement <2 x float> poison, float %433, i64 0, !dbg !100 + %438 = insertelement <2 x float> %437, float %435, i64 1, !dbg !100 + store <2 x float> %438, ptr addrspace(3) %120, align 8, !dbg !100 + %439 = insertelement <2 x float> poison, float %434, i64 0, !dbg !100 + %440 = insertelement <2 x float> %439, float %436, i64 1, !dbg !100 + store <2 x float> %440, ptr addrspace(3) %121, align 8, !dbg !100 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !100 + %441 = fptrunc <2 x float> %391 to <2 x bfloat>, !dbg !101 + %442 = fptrunc <2 x float> %392 to <2 x bfloat>, !dbg !101 + %443 = getelementptr bfloat, ptr addrspace(1) %1, i64 %279, !dbg !102 + %444 = insertelement <2 x i16> poison, i16 %394, i64 0, !dbg !94 + %445 = insertelement <2 x i16> %444, i16 %396, i64 1, !dbg !94 + %446 = bitcast <2 x i16> %445 to <2 x bfloat>, !dbg !94 + %447 = fpext <2 x bfloat> %446 to <2 x float>, !dbg !103 + %448 = fmul <2 x float> %177, %447, !dbg !41 + %449 = insertelement <2 x i16> poison, i16 %402, i64 0, !dbg !95 + %450 = insertelement <2 x i16> %449, i16 %404, i64 1, !dbg !95 + %451 = bitcast <2 x i16> %450 to <2 x bfloat>, !dbg !95 + %452 = fpext <2 x bfloat> %451 to <2 x float>, !dbg !104 + %453 = fmul <2 x float> %448, %452, !dbg !105 + %454 = fsub <2 x float> zeroinitializer, %453, !dbg !106 + %455 = insertelement <2 x i16> poison, i16 %410, i64 0, !dbg !96 + %456 = insertelement <2 x i16> %455, i16 %412, i64 1, !dbg !96 + %457 = bitcast <2 x i16> %456 to <2 x bfloat>, !dbg !96 + %458 = fpext <2 x bfloat> %457 to <2 x float>, !dbg !107 + %459 = fmul <2 x float> %177, %458, !dbg !108 + %460 = insertelement <2 x i16> poison, i16 %418, i64 0, !dbg !97 + %461 = insertelement <2 x i16> %460, i16 %420, i64 1, !dbg !97 + %462 = bitcast <2 x i16> %461 to <2 x bfloat>, !dbg !97 + %463 = fpext <2 x bfloat> %462 to <2 x float>, !dbg !109 + %464 = fmul <2 x float> %459, %463, !dbg !110 + %465 = select <2 x i1> %173, <2 x float> %454, <2 x float> %464, !dbg !44 + %466 = load <2 x float>, ptr addrspace(3) %130, align 8, !dbg !100 + %467 = fmul <2 x float> %281, %465, !dbg !111 + %468 = fadd <2 x float> %467, %466, !dbg !112 + %469 = fptrunc <2 x float> %468 to <2 x bfloat>, !dbg !113 + %470 = insertelement <2 x i16> poison, i16 %398, i64 0, !dbg !94 + %471 = insertelement <2 x i16> %470, i16 %400, i64 1, !dbg !94 + %472 = bitcast <2 x i16> %471 to <2 x bfloat>, !dbg !94 + %473 = fpext <2 x bfloat> %472 to <2 x float>, !dbg !103 + %474 = fmul <2 x float> %177, %473, !dbg !41 + %475 = insertelement <2 x i16> poison, i16 %406, i64 0, !dbg !95 + %476 = insertelement <2 x i16> %475, i16 %408, i64 1, !dbg !95 + %477 = bitcast <2 x i16> %476 to <2 x bfloat>, !dbg !95 + %478 = fpext <2 x bfloat> %477 to <2 x float>, !dbg !104 + %479 = fmul <2 x float> %474, %478, !dbg !105 + %480 = fsub <2 x float> zeroinitializer, %479, !dbg !106 + %481 = insertelement <2 x i16> poison, i16 %414, i64 0, !dbg !96 + %482 = insertelement <2 x i16> %481, i16 %416, i64 1, !dbg !96 + %483 = bitcast <2 x i16> %482 to <2 x bfloat>, !dbg !96 + %484 = fpext <2 x bfloat> %483 to <2 x float>, !dbg !107 + %485 = fmul <2 x float> %177, %484, !dbg !108 + %486 = insertelement <2 x i16> poison, i16 %422, i64 0, !dbg !97 + %487 = insertelement <2 x i16> %486, i16 %424, i64 1, !dbg !97 + %488 = bitcast <2 x i16> %487 to <2 x bfloat>, !dbg !97 + %489 = fpext <2 x bfloat> %488 to <2 x float>, !dbg !109 + %490 = fmul <2 x float> %485, %489, !dbg !110 + %491 = select <2 x i1> %173, <2 x float> %480, <2 x float> %490, !dbg !44 + %492 = load <2 x float>, ptr addrspace(3) %132, align 8, !dbg !100 + %493 = fmul <2 x float> %282, %491, !dbg !111 + %494 = fadd <2 x float> %493, %492, !dbg !112 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !101 + store <2 x bfloat> %441, ptr addrspace(3) %149, align 4, !dbg !101 + store <2 x bfloat> %442, ptr addrspace(3) %151, align 4, !dbg !101 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !101 + %495 = load <2 x i16>, ptr addrspace(3) %159, align 4, !dbg !101 + %496 = load <2 x i16>, ptr addrspace(3) %160, align 4, !dbg !101 + %.uncasted = shufflevector <2 x i16> %495, <2 x i16> %496, <2 x i32> , !dbg !101 + %497 = bitcast <2 x i16> %.uncasted to i32, !dbg !101 + %498 = shufflevector <2 x i16> %495, <2 x i16> %496, <2 x i32> , !dbg !101 + %499 = bitcast <2 x i16> %498 to i32, !dbg !101 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %497, i32 %499, ptr addrspace(1) %280, i1 true) #6, !dbg !101 + %500 = fptrunc <2 x float> %494 to <2 x bfloat>, !dbg !113 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !113 + store <2 x bfloat> %469, ptr addrspace(3) %149, align 4, !dbg !113 + store <2 x bfloat> %500, ptr addrspace(3) %151, align 4, !dbg !113 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !113 + %501 = load <2 x i16>, ptr addrspace(3) %159, align 4, !dbg !113 + %502 = load <2 x i16>, ptr addrspace(3) %160, align 4, !dbg !113 + %.uncasted21 = shufflevector <2 x i16> %501, <2 x i16> %502, <2 x i32> , !dbg !113 + %503 = bitcast <2 x i16> %.uncasted21 to i32, !dbg !113 + %504 = shufflevector <2 x i16> %501, <2 x i16> %502, <2 x i32> , !dbg !113 + %505 = bitcast <2 x i16> %504 to i32, !dbg !113 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %503, i32 %505, ptr addrspace(1) %443, i1 true) #6, !dbg !113 + %indvars.iv.next52 = add nuw nsw i64 %indvars.iv51, 8, !dbg !43 + %506 = icmp samesign ult i64 %indvars.iv51, 120, !dbg !43 + br i1 %506, label %178, label %507, !dbg !43 + +507: ; preds = %178 + ret void, !dbg !114 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", linkageName: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 23, column: 33, scope: !5) +!10 = !DILocation(line: 24, column: 44, scope: !5) +!11 = !DILocation(line: 24, column: 23, scope: !5) +!12 = !DILocation(line: 26, column: 37, scope: !5) +!13 = !DILocation(line: 29, column: 19, scope: !5) +!14 = !DILocation(line: 33, column: 43, scope: !5) +!15 = !DILocation(line: 39, column: 57, scope: !5) +!16 = !DILocation(line: 39, column: 34, scope: !5) +!17 = !DILocation(line: 39, column: 68, scope: !5) +!18 = !DILocation(line: 40, column: 50, scope: !5) +!19 = !DILocation(line: 40, column: 34, scope: !5) +!20 = !DILocation(line: 40, column: 61, scope: !5) +!21 = !DILocation(line: 40, column: 114, scope: !5) +!22 = !DILocation(line: 47, column: 22, scope: !5) +!23 = !DILocation(line: 49, column: 25, scope: !5) +!24 = !DILocation(line: 263, column: 15, scope: !25, inlinedAt: !27) +!25 = distinct !DILexicalBlockFile(scope: !5, file: !26, discriminator: 0) +!26 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!27 = !DILocation(line: 293, column: 36, scope: !25, inlinedAt: !28) +!28 = !DILocation(line: 51, column: 25, scope: !29) +!29 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!30 = !DILocation(line: 263, column: 15, scope: !25, inlinedAt: !31) +!31 = !DILocation(line: 293, column: 36, scope: !25, inlinedAt: !32) +!32 = !DILocation(line: 52, column: 27, scope: !29) +!33 = !DILocation(line: 63, column: 46, scope: !5) +!34 = !DILocation(line: 75, column: 25, scope: !5) +!35 = !DILocation(line: 77, column: 24, scope: !5) +!36 = !DILocation(line: 78, column: 32, scope: !5) +!37 = !DILocation(line: 79, column: 24, scope: !5) +!38 = !DILocation(line: 123, column: 24, scope: !5) +!39 = !DILocation(line: 124, column: 24, scope: !5) +!40 = !DILocation(line: 125, column: 32, scope: !5) +!41 = !DILocation(line: 126, column: 24, scope: !5) +!42 = !DILocation(line: 161, column: 43, scope: !5) +!43 = !DILocation(line: 53, column: 43, scope: !5) +!44 = !DILocation(line: 0, scope: !5) +!45 = !DILocation(line: 54, column: 31, scope: !5) +!46 = !DILocation(line: 59, column: 27, scope: !5) +!47 = !DILocation(line: 61, column: 51, scope: !5) +!48 = !DILocation(line: 61, column: 35, scope: !5) +!49 = !DILocation(line: 61, column: 62, scope: !5) +!50 = !DILocation(line: 61, column: 115, scope: !5) +!51 = !DILocation(line: 62, column: 35, scope: !5) +!52 = !DILocation(line: 62, column: 42, scope: !5) +!53 = !DILocation(line: 62, column: 95, scope: !5) +!54 = !DILocation(line: 63, column: 42, scope: !5) +!55 = !DILocation(line: 63, column: 35, scope: !5) +!56 = !DILocation(line: 63, column: 51, scope: !5) +!57 = !DILocation(line: 64, column: 35, scope: !5) +!58 = !DILocation(line: 64, column: 51, scope: !5) +!59 = !DILocation(line: 65, column: 58, scope: !5) +!60 = !DILocation(line: 65, column: 35, scope: !5) +!61 = !DILocation(line: 66, column: 36, scope: !5) +!62 = !DILocation(line: 72, column: 39, scope: !5) +!63 = !DILocation(line: 72, column: 35, scope: !5) +!64 = !DILocation(line: 80, column: 35, scope: !5) +!65 = !DILocation(line: 90, column: 35, scope: !5) +!66 = !DILocation(line: 98, column: 35, scope: !5) +!67 = !DILocation(line: 111, column: 24, scope: !5) +!68 = !DILocation(line: 113, column: 24, scope: !5) +!69 = !DILocation(line: 116, column: 24, scope: !5) +!70 = !DILocation(line: 121, column: 35, scope: !5) +!71 = !DILocation(line: 127, column: 35, scope: !5) +!72 = !DILocation(line: 134, column: 35, scope: !5) +!73 = !DILocation(line: 140, column: 35, scope: !5) +!74 = !DILocation(line: 161, column: 39, scope: !5) +!75 = !DILocation(line: 161, column: 32, scope: !5) +!76 = !DILocation(line: 65, column: 69, scope: !5) +!77 = !DILocation(line: 65, column: 123, scope: !5) +!78 = !DILocation(line: 66, column: 43, scope: !5) +!79 = !DILocation(line: 66, column: 96, scope: !5) +!80 = !DILocation(line: 72, column: 68, scope: !5) +!81 = !DILocation(line: 72, column: 129, scope: !5) +!82 = !DILocation(line: 80, column: 85, scope: !5) +!83 = !DILocation(line: 80, column: 146, scope: !5) +!84 = !DILocation(line: 82, column: 24, scope: !5) +!85 = !DILocation(line: 84, column: 17, scope: !5) +!86 = !DILocation(line: 90, column: 64, scope: !5) +!87 = !DILocation(line: 90, column: 125, scope: !5) +!88 = !DILocation(line: 97, column: 24, scope: !5) +!89 = !DILocation(line: 98, column: 81, scope: !5) +!90 = !DILocation(line: 98, column: 142, scope: !5) +!91 = !DILocation(line: 100, column: 24, scope: !5) +!92 = !DILocation(line: 118, column: 24, scope: !5) +!93 = !DILocation(line: 119, column: 24, scope: !5) +!94 = !DILocation(line: 121, column: 71, scope: !5) +!95 = !DILocation(line: 127, column: 85, scope: !5) +!96 = !DILocation(line: 134, column: 71, scope: !5) +!97 = !DILocation(line: 140, column: 81, scope: !5) +!98 = !DILocation(line: 151, column: 25, scope: !5) +!99 = !DILocation(line: 153, column: 26, scope: !5) +!100 = !DILocation(line: 156, column: 26, scope: !5) +!101 = !DILocation(line: 161, column: 55, scope: !5) +!102 = !DILocation(line: 162, column: 32, scope: !5) +!103 = !DILocation(line: 121, column: 132, scope: !5) +!104 = !DILocation(line: 127, column: 146, scope: !5) +!105 = !DILocation(line: 129, column: 24, scope: !5) +!106 = !DILocation(line: 131, column: 17, scope: !5) +!107 = !DILocation(line: 134, column: 132, scope: !5) +!108 = !DILocation(line: 139, column: 24, scope: !5) +!109 = !DILocation(line: 140, column: 142, scope: !5) +!110 = !DILocation(line: 142, column: 24, scope: !5) +!111 = !DILocation(line: 158, column: 26, scope: !5) +!112 = !DILocation(line: 159, column: 26, scope: !5) +!113 = !DILocation(line: 162, column: 56, scope: !5) +!114 = !DILocation(line: 53, column: 4, scope: !5) diff --git a/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..2219885f7430914c0f039f99c636750b678a787d --- /dev/null +++ b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx @@ -0,0 +1,1369 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 // -- Begin function triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 +.visible .entry triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0( + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6, + .param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_7, + .param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_8, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_9, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_10 +) +.reqntid 128 +{ + .reg .pred %p<7>; + .reg .b16 %rs<98>; + .reg .b32 %r<297>; + .reg .b64 %rd<107>; + .loc 1 18 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0 + +// %bb.0: + ld.param.b64 %rd14, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6]; + ld.param.b64 %rd13, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5]; + ld.param.b64 %rd12, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4]; + ld.param.b64 %rd11, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3]; + ld.param.b64 %rd10, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2]; + ld.param.b64 %rd9, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1]; + ld.param.b64 %rd8, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0]; +$L__tmp0: + .loc 1 23 28 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:28 + mov.u32 %r19, %ctaid.x; + .loc 1 23 33 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:33 + shl.b32 %r1, %r19, 6; + .loc 1 24 44 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44 + mov.u32 %r2, %tid.x; + and.b32 %r3, %r2, 126; + bfe.u32 %r20, %r2, 1, 6; + .loc 1 24 23 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23 + or.b32 %r21, %r20, %r1; + .loc 1 26 37 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37 + shl.b32 %r4, %r2, 2; + and.b32 %r22, %r4, 4; + .loc 1 29 19 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19 + bfe.s32 %r23, %r19, 25, 1; + shr.u32 %r24, %r23, 27; + add.s32 %r25, %r21, %r24; + shr.s32 %r5, %r25, 5; + shl.b32 %r26, %r5, 15; + .loc 1 33 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:33:43 + shl.b32 %r6, %r19, 13; + add.s32 %r27, %r26, %r6; + shl.b32 %r7, %r20, 7; + or.b32 %r28, %r27, %r7; + or.b32 %r29, %r28, %r22; + cvt.u64.u32 %rd1, %r29; + mov.b32 %r289, 0f00000000; + mov.b64 %rd102, -8; + mov.b32 %r290, %r289; + mov.b32 %r291, %r289; + mov.b32 %r292, %r289; + mov.b32 %r293, %r289; + mov.b32 %r294, %r289; + mov.b32 %r295, %r289; + mov.b32 %r296, %r289; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 39 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34 + add.s64 %rd19, %rd1, %rd102; + cvt.u32.u64 %r35, %rd19; + add.s32 %r36, %r35, 4104; + mad.wide.s32 %rd16, %r36, 2, %rd10; + .loc 1 39 68 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68 + // begin inline asm + mov.u64 %rd15, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd15, 1.0; + // end inline asm + mov.b32 %r32, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r30, %r32; + mov.u32 %r31, %r32; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r30, %r31 }, [ %rd16 + 0 ], %rd15; + // end inline asm + add.s32 %r37, %r35, 8; + .loc 1 40 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34 + mad.wide.s32 %rd18, %r37, 2, %rd10; + .loc 1 40 61 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61 + // begin inline asm + mov.u64 %rd17, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r33, %r32; + mov.u32 %r34, %r32; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r33, %r34 }, [ %rd18 + 0 ], %rd17; + // end inline asm + .loc 1 40 114 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114 + mov.b32 {%rs1, %rs2}, %r33; + cvt.f32.bf16 %r38, %rs1; + cvt.f32.bf16 %r39, %rs2; + mov.b32 {%rs3, %rs4}, %r34; + cvt.f32.bf16 %r40, %rs3; + cvt.f32.bf16 %r41, %rs4; + mov.b32 {%rs5, %rs6}, %r30; + cvt.f32.bf16 %r42, %rs5; + cvt.f32.bf16 %r43, %rs6; + mov.b32 {%rs7, %rs8}, %r31; + cvt.f32.bf16 %r44, %rs7; + cvt.f32.bf16 %r45, %rs8; + .loc 1 49 25 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:49:25 + fma.rn.f32 %r296, %r45, %r45, %r296; + fma.rn.f32 %r295, %r44, %r44, %r295; + fma.rn.f32 %r294, %r43, %r43, %r294; + fma.rn.f32 %r293, %r42, %r42, %r293; + fma.rn.f32 %r292, %r41, %r41, %r292; + fma.rn.f32 %r291, %r40, %r40, %r291; + fma.rn.f32 %r290, %r39, %r39, %r290; + fma.rn.f32 %r289, %r38, %r38, %r289; + .loc 1 33 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:33:43 + add.s64 %rd102, %rd102, 8; + setp.lt.u64 %p2, %rd102, 120; + @%p2 bra $L__BB0_1; +// %bb.2: // %__nv_rsqrtf.exit + .loc 1 24 44 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44 + and.b32 %r46, %r2, 63; + .loc 1 24 23 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23 + or.b32 %r47, %r1, %r46; + .loc 1 26 37 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37 + and.b32 %r8, %r2, 64; + .loc 1 29 19 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19 + shr.s32 %r48, %r1, 31; + shr.u32 %r49, %r48, 27; + add.s32 %r50, %r47, %r49; +$L__tmp1: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r51, %r293, %r294; + add.f32 %r52, %r295, %r51; + add.f32 %r53, %r296, %r52; +$L__tmp2: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r54, %r53, 1, 31, -1; +$L__tmp3: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r55, %r53, %r54; +$L__tmp4: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r56, %r289, %r290; + add.f32 %r57, %r291, %r56; + add.f32 %r58, %r292, %r57; +$L__tmp5: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r59, %r58, 1, 31, -1; +$L__tmp6: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r60, %r58, %r59; +$L__tmp7: + .loc 1 63 46 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:46 + shl.b32 %r61, %r5, 7; + mov.b32 %r62, 0f43000000; + .loc 1 75 25 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:75:25 + div.full.f32 %r63, %r60, %r62; + .loc 1 77 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:77:24 + add.f32 %r64, %r63, 0f358637BD; + .loc 1 78 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:78:32 + rsqrt.approx.ftz.f32 %r9, %r64; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + shl.b32 %r65, %r3, 1; + mov.b32 %r66, global_smem; + add.s32 %r67, %r66, %r65; + st.shared.b32 [%r67], %r9; + bar.sync 0; + shl.b32 %r68, %r46, 2; + add.s32 %r69, %r66, %r68; + ld.shared.b32 %r17, [%r69]; + .loc 1 123 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:123:24 + div.full.f32 %r70, %r55, %r62; + .loc 1 124 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:124:24 + add.f32 %r71, %r70, 0f358637BD; + .loc 1 125 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:125:32 + rsqrt.approx.ftz.f32 %r10, %r71; + .loc 1 126 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24 + bar.sync 0; + st.shared.b32 [%r67], %r10; + bar.sync 0; + ld.shared.b32 %r18, [%r69]; + shl.b32 %r72, %r2, 3; + and.b32 %r73, %r72, 120; + and.b32 %r74, %r4, 384; + bfe.s32 %r75, %r2, 4, 1; + and.b32 %r76, %r75, 1032; + or.b32 %r77, %r73, %r74; + xor.b32 %r78, %r77, %r76; + add.s32 %r11, %r66, %r78; + shl.b32 %r79, %r2, 4; + and.b32 %r80, %r79, 112; + and.b32 %r81, %r72, 896; + bfe.s32 %r82, %r2, 3, 1; + and.b32 %r83, %r82, 1032; + or.b32 %r84, %r80, %r81; + or.b32 %r85, %r84, %r83; + add.s32 %r12, %r66, %r85; + xor.b32 %r86, %r85, 8; + add.s32 %r13, %r66, %r86; + setp.eq.b32 %p4, %r8, 0; + shl.b32 %r87, %r50, 10; + and.b32 %r88, %r87, -32768; + and.b32 %r89, %r75, 516; + or.b32 %r90, %r74, %r89; + or.b32 %r91, %r90, %r73; + add.s32 %r14, %r66, %r91; + xor.b32 %r92, %r91, 4; + add.s32 %r15, %r66, %r92; + and.b32 %r93, %r4, 124; + bfe.s32 %r94, %r2, 5, 1; + and.b32 %r95, %r94, 516; + shl.b32 %r96, %r8, 1; + xor.b32 %r97, %r95, %r93; + add.s32 %r98, %r66, %r96; + add.s32 %r16, %r98, %r97; + .loc 1 53 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43 + add.s32 %r99, %r88, %r6; + shl.b32 %r100, %r46, 7; + add.s32 %r101, %r99, %r100; + mad.wide.s32 %rd2, %r101, 2, %rd10; + add.s32 %r102, %r101, 4096; + mad.wide.s32 %rd3, %r102, 2, %rd10; + and.b32 %r103, %r2, 1; + mul.wide.u32 %rd20, %r103, 8; + add.s32 %r104, %r6, %r7; + mad.wide.s32 %rd21, %r104, 2, %rd20; + add.s64 %rd4, %rd9, %rd21; + add.s64 %rd5, %rd8, %rd21; + add.s64 %rd6, %rd14, %rd20; + mul.wide.u32 %rd22, %r103, 16; + mul.wide.s32 %rd23, %r61, 4; + or.b64 %rd24, %rd22, %rd23; + add.s64 %rd104, %rd13, %rd24; + add.s64 %rd103, %rd12, %rd24; + add.s64 %rd7, %rd11, %rd20; + mov.b64 %rd106, 0; + mov.b64 %rd105, -8; +$L__BB0_3: // =>This Inner Loop Header: Depth=1 + .loc 1 0 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0:43 + setp.ne.b32 %p5, %r8, 0; + .loc 1 61 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:35 + add.s64 %rd101, %rd1, %rd105; + cvt.u32.u64 %r126, %rd101; + add.s32 %r127, %r126, 8; + mad.wide.s32 %rd26, %r127, 2, %rd10; + .loc 1 61 62 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:62 + // begin inline asm + mov.u64 %rd25, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd25, 1.0; + // end inline asm + mov.b32 %r107, 0; + mov.pred %p3, -1; + // begin inline asm + mov.u32 %r105, %r107; + mov.u32 %r106, %r107; + @%p3 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r105, %r106 }, [ %rd26 + 0 ], %rd25; + // end inline asm + mov.b32 {%rs42, %rs43}, %r105; + mov.b32 {%rs44, %rs45}, %r106; + .loc 1 61 115 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:115 + cvt.f32.bf16 %r128, %rs42; + cvt.f32.bf16 %r129, %rs43; + cvt.f32.bf16 %r130, %rs44; + cvt.f32.bf16 %r131, %rs45; + .loc 1 62 42 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:42 + add.s64 %rd28, %rd7, %rd106; + // begin inline asm + mov.u64 %rd27, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd27, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r108, %r107; + mov.u32 %r109, %r107; + @%p3 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r108, %r109 }, [ %rd28 + 0 ], %rd27; + // end inline asm + mov.b32 {%rs46, %rs47}, %r108; + mov.b32 {%rs48, %rs49}, %r109; + .loc 1 62 95 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:95 + cvt.f32.bf16 %r132, %rs46; + cvt.f32.bf16 %r133, %rs47; + cvt.f32.bf16 %r134, %rs48; + cvt.f32.bf16 %r135, %rs49; + .loc 1 63 51 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:51 + // begin inline asm + mov.u64 %rd29, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd29, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r110, %r107; + mov.u32 %r111, %r107; + mov.u32 %r112, %r107; + mov.u32 %r113, %r107; + @%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r110, %r111, %r112, %r113 }, [ %rd103 + 0 ], %rd29; + // end inline asm + .loc 1 64 51 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:51 + // begin inline asm + mov.u64 %rd30, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd30, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r114, %r107; + mov.u32 %r115, %r107; + mov.u32 %r116, %r107; + mov.u32 %r117, %r107; + @%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r114, %r115, %r116, %r117 }, [ %rd104 + 0 ], %rd30; + // end inline asm + bar.sync 0; + st.shared.v2.b32 [%r11], {%r114, %r116}; + st.shared.v2.b32 [%r11+512], {%r115, %r117}; + bar.sync 0; + add.s32 %r136, %r126, 4104; + .loc 1 65 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:35 + mad.wide.s32 %rd32, %r136, 2, %rd10; + .loc 1 72 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35 + add.s64 %rd34, %rd6, %rd106; + add.s64 %rd52, %rd2, %rd106; + add.s64 %rd36, %rd52, 2; + add.s64 %rd38, %rd52, 6; + add.s64 %rd40, %rd52, 10; + .loc 1 80 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:35 + add.s64 %rd42, %rd52, 14; + add.s64 %rd60, %rd11, %rd106; + add.s64 %rd44, %rd60, 2; + add.s64 %rd46, %rd60, 6; + add.s64 %rd48, %rd60, 10; + .loc 1 90 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:35 + add.s64 %rd50, %rd60, 14; + add.s64 %rd54, %rd52, 4; + add.s64 %rd56, %rd52, 8; + .loc 1 98 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:35 + add.s64 %rd58, %rd52, 12; + add.s64 %rd62, %rd60, 4; + add.s64 %rd64, %rd60, 8; + .loc 1 111 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:111:24 + add.s64 %rd66, %rd60, 12; + mul.f32 %r137, %r9, %r128; + mul.f32 %r138, %r9, %r129; + mul.f32 %r139, %r9, %r130; + mul.f32 %r140, %r9, %r131; + .loc 1 113 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:113:24 + mul.f32 %r141, %r137, %r132; + mul.f32 %r142, %r138, %r133; + mul.f32 %r143, %r139, %r134; + mul.f32 %r144, %r140, %r135; + .loc 1 116 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:116:24 + mul.f32 %r145, %r141, %r110; + mul.f32 %r146, %r142, %r111; + mul.f32 %r147, %r143, %r112; + mul.f32 %r148, %r144, %r113; + .loc 1 121 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:35 + add.s64 %rd84, %rd3, %rd106; + add.s64 %rd68, %rd84, 2; + add.s64 %rd70, %rd84, 6; + add.s64 %rd72, %rd84, 10; + .loc 1 127 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:35 + add.s64 %rd74, %rd84, 14; + add.s64 %rd92, %rd14, %rd106; + add.s64 %rd76, %rd92, 2; + add.s64 %rd78, %rd92, 6; + add.s64 %rd80, %rd92, 10; + .loc 1 134 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:35 + add.s64 %rd82, %rd92, 14; + add.s64 %rd86, %rd84, 4; + add.s64 %rd88, %rd84, 8; + .loc 1 140 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:35 + add.s64 %rd90, %rd84, 12; + add.s64 %rd94, %rd92, 4; + add.s64 %rd96, %rd92, 8; + .loc 1 161 39 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:39 + add.s64 %rd98, %rd92, 12; + .loc 1 64 51 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:51 + add.s64 %rd99, %rd5, %rd106; + ld.shared.v2.b32 {%r149, %r150}, [%r12]; + ld.shared.v2.b32 {%r151, %r152}, [%r13]; + .loc 1 65 69 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69 + // begin inline asm + mov.u64 %rd31, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd31, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r118, %r107; + mov.u32 %r119, %r107; + @%p3 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r118, %r119 }, [ %rd32 + 0 ], %rd31; + // end inline asm + mov.b32 {%rs50, %rs51}, %r118; + mov.b32 {%rs52, %rs53}, %r119; + .loc 1 65 123 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:123 + cvt.f32.bf16 %r153, %rs50; + cvt.f32.bf16 %r154, %rs51; + cvt.f32.bf16 %r155, %rs52; + cvt.f32.bf16 %r156, %rs53; + .loc 1 66 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:43 + // begin inline asm + mov.u64 %rd33, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd33, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r120, %r107; + mov.u32 %r121, %r107; + @%p3 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r120, %r121 }, [ %rd34 + 0 ], %rd33; + // end inline asm + mov.b32 {%rs54, %rs55}, %r120; + mov.b32 {%rs56, %rs57}, %r121; + .loc 1 66 96 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:96 + cvt.f32.bf16 %r157, %rs54; + cvt.f32.bf16 %r158, %rs55; + cvt.f32.bf16 %r159, %rs56; + cvt.f32.bf16 %r160, %rs57; + .loc 1 72 68 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:68 + // begin inline asm + mov.u64 %rd35, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd35, 1.0; + // end inline asm + mov.b16 %rs10, 0; + // begin inline asm + mov.u16 %rs9, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd36 + 0 ], %rd35; + // end inline asm + // begin inline asm + mov.u64 %rd37, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd37, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs11, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd38 + 0 ], %rd37; + // end inline asm + mov.b32 %r161, {%rs9, %rs11}; + // begin inline asm + mov.u64 %rd39, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd39, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs12, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd40 + 0 ], %rd39; + // end inline asm + // begin inline asm + mov.u64 %rd41, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd41, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs13, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd42 + 0 ], %rd41; + // end inline asm + mov.b32 %r162, {%rs12, %rs13}; + .loc 1 72 129 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129 + mov.b32 {%rs58, %rs59}, %r161; + cvt.f32.bf16 %r163, %rs58; + cvt.f32.bf16 %r164, %rs59; + mov.b32 {%rs60, %rs61}, %r162; + cvt.f32.bf16 %r165, %rs60; + cvt.f32.bf16 %r166, %rs61; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + mul.f32 %r167, %r17, %r164; + mul.f32 %r168, %r17, %r163; + mul.f32 %r169, %r17, %r166; + mul.f32 %r170, %r17, %r165; + .loc 1 80 85 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:85 + // begin inline asm + mov.u64 %rd43, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd43, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs14, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs14 }, [ %rd44 + 0 ], %rd43; + // end inline asm + // begin inline asm + mov.u64 %rd45, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd45, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs15, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd46 + 0 ], %rd45; + // end inline asm + mov.b32 %r171, {%rs14, %rs15}; + // begin inline asm + mov.u64 %rd47, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd47, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs16, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs16 }, [ %rd48 + 0 ], %rd47; + // end inline asm + // begin inline asm + mov.u64 %rd49, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd49, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs17, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd50 + 0 ], %rd49; + // end inline asm + mov.b32 %r172, {%rs16, %rs17}; + .loc 1 80 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146 + mov.b32 {%rs62, %rs63}, %r171; + cvt.f32.bf16 %r173, %rs63; + cvt.f32.bf16 %r174, %rs62; + mov.b32 {%rs64, %rs65}, %r172; + cvt.f32.bf16 %r175, %rs65; + cvt.f32.bf16 %r176, %rs64; + .loc 1 84 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17 + neg.f32 %r177, %r168; + fma.rn.f32 %r178, %r177, %r174, 0f00000000; + neg.f32 %r179, %r167; + fma.rn.f32 %r180, %r179, %r173, 0f00000000; + neg.f32 %r181, %r170; + fma.rn.f32 %r182, %r181, %r176, 0f00000000; + neg.f32 %r183, %r169; + fma.rn.f32 %r184, %r183, %r175, 0f00000000; + .loc 1 90 64 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:64 + // begin inline asm + mov.u64 %rd51, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd51, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs18, %rs10; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs18 }, [ %rd52 + 0 ], %rd51; + // end inline asm + // begin inline asm + mov.u64 %rd53, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd53, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs19, %rs10; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd54 + 0 ], %rd53; + // end inline asm + mov.b32 %r185, {%rs18, %rs19}; + // begin inline asm + mov.u64 %rd55, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd55, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs20, %rs10; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs20 }, [ %rd56 + 0 ], %rd55; + // end inline asm + // begin inline asm + mov.u64 %rd57, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd57, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs21, %rs10; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs21 }, [ %rd58 + 0 ], %rd57; + // end inline asm + mov.b32 %r186, {%rs20, %rs21}; + .loc 1 90 125 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125 + mov.b32 {%rs66, %rs67}, %r185; + cvt.f32.bf16 %r187, %rs66; + cvt.f32.bf16 %r188, %rs67; + mov.b32 {%rs68, %rs69}, %r186; + cvt.f32.bf16 %r189, %rs68; + cvt.f32.bf16 %r190, %rs69; + .loc 1 97 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24 + mul.f32 %r191, %r17, %r188; + mul.f32 %r192, %r17, %r187; + mul.f32 %r193, %r17, %r190; + mul.f32 %r194, %r17, %r189; + .loc 1 98 81 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:81 + // begin inline asm + mov.u64 %rd59, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd59, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs22, %rs10; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs22 }, [ %rd60 + 0 ], %rd59; + // end inline asm + // begin inline asm + mov.u64 %rd61, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd61, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs23, %rs10; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs23 }, [ %rd62 + 0 ], %rd61; + // end inline asm + mov.b32 %r195, {%rs22, %rs23}; + // begin inline asm + mov.u64 %rd63, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd63, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs24, %rs10; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs24 }, [ %rd64 + 0 ], %rd63; + // end inline asm + // begin inline asm + mov.u64 %rd65, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd65, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs25, %rs10; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs25 }, [ %rd66 + 0 ], %rd65; + // end inline asm + mov.b32 %r196, {%rs24, %rs25}; + .loc 1 98 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142 + mov.b32 {%rs70, %rs71}, %r195; + cvt.f32.bf16 %r197, %rs71; + cvt.f32.bf16 %r198, %rs70; + mov.b32 {%rs72, %rs73}, %r196; + cvt.f32.bf16 %r199, %rs73; + cvt.f32.bf16 %r200, %rs72; + .loc 1 100 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24 + mul.f32 %r201, %r192, %r198; + mul.f32 %r202, %r191, %r197; + mul.f32 %r203, %r194, %r200; + mul.f32 %r204, %r193, %r199; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r205, %r180, %r202, %p4; + selp.f32 %r206, %r178, %r201, %p4; + selp.f32 %r207, %r184, %r204, %p4; + selp.f32 %r208, %r182, %r203, %p4; + .loc 1 116 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:116:24 + bar.sync 0; + st.shared.v2.b32 [%r11], {%r145, %r147}; + st.shared.v2.b32 [%r11+512], {%r146, %r148}; + bar.sync 0; + ld.shared.v2.b32 {%r209, %r210}, [%r12]; + ld.shared.v2.b32 {%r211, %r212}, [%r13]; + .loc 1 119 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24 + fma.rn.f32 %r213, %r149, %r206, %r209; + fma.rn.f32 %r214, %r150, %r205, %r210; + fma.rn.f32 %r215, %r151, %r208, %r211; + fma.rn.f32 %r216, %r152, %r207, %r212; + .loc 1 121 71 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:71 + // begin inline asm + mov.u64 %rd67, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd67, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs26, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs26 }, [ %rd68 + 0 ], %rd67; + // end inline asm + // begin inline asm + mov.u64 %rd69, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd69, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs27, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs27 }, [ %rd70 + 0 ], %rd69; + // end inline asm + // begin inline asm + mov.u64 %rd71, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd71, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs28, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs28 }, [ %rd72 + 0 ], %rd71; + // end inline asm + // begin inline asm + mov.u64 %rd73, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd73, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs29, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs29 }, [ %rd74 + 0 ], %rd73; + // end inline asm + .loc 1 127 85 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:85 + // begin inline asm + mov.u64 %rd75, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd75, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs30, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs30 }, [ %rd76 + 0 ], %rd75; + // end inline asm + // begin inline asm + mov.u64 %rd77, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd77, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs31, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs31 }, [ %rd78 + 0 ], %rd77; + // end inline asm + // begin inline asm + mov.u64 %rd79, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd79, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs32, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs32 }, [ %rd80 + 0 ], %rd79; + // end inline asm + // begin inline asm + mov.u64 %rd81, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd81, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs33, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs33 }, [ %rd82 + 0 ], %rd81; + // end inline asm + .loc 1 134 71 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:71 + // begin inline asm + mov.u64 %rd83, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd83, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs34, %rs10; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs34 }, [ %rd84 + 0 ], %rd83; + // end inline asm + // begin inline asm + mov.u64 %rd85, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd85, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs35, %rs10; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs35 }, [ %rd86 + 0 ], %rd85; + // end inline asm + // begin inline asm + mov.u64 %rd87, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd87, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs36, %rs10; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs36 }, [ %rd88 + 0 ], %rd87; + // end inline asm + // begin inline asm + mov.u64 %rd89, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd89, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs37, %rs10; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs37 }, [ %rd90 + 0 ], %rd89; + // end inline asm + .loc 1 140 81 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:81 + // begin inline asm + mov.u64 %rd91, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd91, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs38, %rs10; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs38 }, [ %rd92 + 0 ], %rd91; + // end inline asm + // begin inline asm + mov.u64 %rd93, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd93, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs39, %rs10; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs39 }, [ %rd94 + 0 ], %rd93; + // end inline asm + // begin inline asm + mov.u64 %rd95, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd95, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs40, %rs10; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs40 }, [ %rd96 + 0 ], %rd95; + // end inline asm + // begin inline asm + mov.u64 %rd97, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd97, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs41, %rs10; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs41 }, [ %rd98 + 0 ], %rd97; + // end inline asm + .loc 1 151 25 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:151:25 + mul.f32 %r217, %r10, %r153; + mul.f32 %r218, %r10, %r154; + mul.f32 %r219, %r10, %r155; + mul.f32 %r220, %r10, %r156; + .loc 1 153 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:153:26 + mul.f32 %r221, %r217, %r157; + mul.f32 %r222, %r218, %r158; + mul.f32 %r223, %r219, %r159; + mul.f32 %r224, %r220, %r160; + .loc 1 156 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26 + mul.f32 %r225, %r221, %r110; + mul.f32 %r226, %r222, %r111; + mul.f32 %r227, %r223, %r112; + mul.f32 %r228, %r224, %r113; + bar.sync 0; + st.shared.v2.b32 [%r11], {%r225, %r227}; + st.shared.v2.b32 [%r11+512], {%r226, %r228}; + bar.sync 0; + .loc 1 161 55 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:55 + cvt.rn.bf16x2.f32 %r229, %r214, %r213; + cvt.rn.bf16x2.f32 %r230, %r216, %r215; + .loc 1 121 71 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:71 + add.s64 %rd100, %rd4, %rd106; + mov.b32 %r231, {%rs26, %rs27}; + .loc 1 121 132 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:132 + mov.b32 {%rs74, %rs75}, %r231; + cvt.f32.bf16 %r232, %rs74; + cvt.f32.bf16 %r233, %rs75; + .loc 1 126 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24 + mul.f32 %r234, %r18, %r233; + mul.f32 %r235, %r18, %r232; + .loc 1 127 85 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:85 + mov.b32 %r236, {%rs30, %rs31}; + .loc 1 127 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:146 + mov.b32 {%rs76, %rs77}, %r236; + cvt.f32.bf16 %r237, %rs77; + cvt.f32.bf16 %r238, %rs76; + .loc 1 131 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:131:17 + neg.f32 %r239, %r235; + fma.rn.f32 %r240, %r239, %r238, 0f00000000; + neg.f32 %r241, %r234; + fma.rn.f32 %r242, %r241, %r237, 0f00000000; + .loc 1 134 71 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:71 + mov.b32 %r243, {%rs34, %rs35}; + .loc 1 134 132 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:132 + mov.b32 {%rs78, %rs79}, %r243; + cvt.f32.bf16 %r244, %rs78; + cvt.f32.bf16 %r245, %rs79; + .loc 1 139 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:139:24 + mul.f32 %r246, %r18, %r245; + mul.f32 %r247, %r18, %r244; + .loc 1 140 81 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:81 + mov.b32 %r248, {%rs38, %rs39}; + .loc 1 140 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:142 + mov.b32 {%rs80, %rs81}, %r248; + cvt.f32.bf16 %r249, %rs81; + cvt.f32.bf16 %r250, %rs80; + .loc 1 142 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:142:24 + mul.f32 %r251, %r247, %r250; + mul.f32 %r252, %r246, %r249; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r253, %r242, %r252, %p4; + selp.f32 %r254, %r240, %r251, %p4; + .loc 1 156 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26 + ld.shared.v2.b32 {%r255, %r256}, [%r12]; + .loc 1 159 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:159:26 + fma.rn.f32 %r257, %r149, %r254, %r255; + fma.rn.f32 %r258, %r150, %r253, %r256; + .loc 1 162 56 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:56 + cvt.rn.bf16x2.f32 %r259, %r258, %r257; + .loc 1 121 71 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:71 + mov.b32 %r260, {%rs28, %rs29}; + .loc 1 121 132 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:132 + mov.b32 {%rs82, %rs83}, %r260; + cvt.f32.bf16 %r261, %rs82; + cvt.f32.bf16 %r262, %rs83; + .loc 1 126 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24 + mul.f32 %r263, %r18, %r262; + mul.f32 %r264, %r18, %r261; + .loc 1 127 85 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:85 + mov.b32 %r265, {%rs32, %rs33}; + .loc 1 127 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:146 + mov.b32 {%rs84, %rs85}, %r265; + cvt.f32.bf16 %r266, %rs85; + cvt.f32.bf16 %r267, %rs84; + .loc 1 131 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:131:17 + neg.f32 %r268, %r264; + fma.rn.f32 %r269, %r268, %r267, 0f00000000; + neg.f32 %r270, %r263; + fma.rn.f32 %r271, %r270, %r266, 0f00000000; + .loc 1 134 71 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:71 + mov.b32 %r272, {%rs36, %rs37}; + .loc 1 134 132 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:132 + mov.b32 {%rs86, %rs87}, %r272; + cvt.f32.bf16 %r273, %rs86; + cvt.f32.bf16 %r274, %rs87; + .loc 1 139 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:139:24 + mul.f32 %r275, %r18, %r274; + mul.f32 %r276, %r18, %r273; + .loc 1 140 81 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:81 + mov.b32 %r277, {%rs40, %rs41}; + .loc 1 140 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:142 + mov.b32 {%rs88, %rs89}, %r277; + cvt.f32.bf16 %r278, %rs89; + cvt.f32.bf16 %r279, %rs88; + .loc 1 142 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:142:24 + mul.f32 %r280, %r276, %r279; + mul.f32 %r281, %r275, %r278; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r282, %r271, %r281, %p4; + selp.f32 %r283, %r269, %r280, %p4; + .loc 1 156 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26 + ld.shared.v2.b32 {%r284, %r285}, [%r13]; + .loc 1 159 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:159:26 + fma.rn.f32 %r286, %r151, %r283, %r284; + fma.rn.f32 %r287, %r152, %r282, %r285; + .loc 1 161 55 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:55 + bar.sync 0; + st.shared.b32 [%r14], %r229; + st.shared.b32 [%r15], %r230; + bar.sync 0; + ld.shared.v2.b16 {%rs90, %rs91}, [%r16]; + ld.shared.v2.b16 {%rs92, %rs93}, [%r16+256]; + mov.b32 %r122, {%rs90, %rs92}; + mov.b32 %r123, {%rs91, %rs93}; + // begin inline asm + @%p3 st.global.v2.b32 [ %rd99 + 0 ], { %r122, %r123 }; + // end inline asm + .loc 1 162 56 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:56 + cvt.rn.bf16x2.f32 %r288, %r287, %r286; + bar.sync 0; + st.shared.b32 [%r14], %r259; + st.shared.b32 [%r15], %r288; + bar.sync 0; + ld.shared.v2.b16 {%rs94, %rs95}, [%r16]; + ld.shared.v2.b16 {%rs96, %rs97}, [%r16+256]; + mov.b32 %r124, {%rs94, %rs96}; + mov.b32 %r125, {%rs95, %rs97}; + // begin inline asm + @%p3 st.global.v2.b32 [ %rd100 + 0 ], { %r124, %r125 }; + // end inline asm + .loc 1 53 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43 + add.s64 %rd106, %rd106, 16; + add.s64 %rd105, %rd105, 8; + add.s64 %rd104, %rd104, 32; + add.s64 %rd103, %rd103, 32; + setp.lt.u64 %p6, %rd105, 120; + @%p6 bra $L__BB0_3; +// %bb.4: + .loc 1 53 4 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:4 + ret; +$L__tmp8: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 456 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x1c1 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 98 +.b8 118 +.b8 113 +.b8 104 +.b8 106 +.b8 116 +.b8 121 +.b8 103 +.b8 55 +.b8 102 +.b8 118 +.b8 120 +.b8 122 +.b8 119 +.b8 116 +.b8 98 +.b8 116 +.b8 116 +.b8 52 +.b8 118 +.b8 114 +.b8 100 +.b8 107 +.b8 98 +.b8 110 +.b8 98 +.b8 54 +.b8 110 +.b8 51 +.b8 50 +.b8 102 +.b8 110 +.b8 114 +.b8 105 +.b8 106 +.b8 106 +.b8 112 +.b8 108 +.b8 51 +.b8 118 +.b8 118 +.b8 52 +.b8 99 +.b8 102 +.b8 113 +.b8 100 +.b8 52 +.b8 109 +.b8 122 +.b8 110 +.b8 114 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 98 +.b8 118 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x6d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 114 +.b8 109 +.b8 115 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 100 +.b8 100 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 110 +.b8 101 +.b8 103 +.b8 95 +.b8 115 +.b8 112 +.b8 108 +.b8 105 +.b8 116 +.b8 95 +.b8 115 +.b8 112 +.b8 108 +.b8 105 +.b8 116 +.b8 95 +.b8 119 +.b8 105 +.b8 116 +.b8 104 +.b8 95 +.b8 115 +.b8 105 +.b8 122 +.b8 101 +.b8 115 +.b8 95 +.b8 115 +.b8 116 +.b8 97 +.b8 99 +.b8 107 +.b8 95 +.b8 117 +.b8 110 +.b8 98 +.b8 105 +.b8 110 +.b8 100 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x151:0x7a DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x166:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 51 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x17e:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 4 // Abbrev [4] 0x198:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp4 // DW_AT_low_pc +.b64 $L__tmp7 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 52 // DW_AT_call_line +.b8 27 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x1b0:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp4 // DW_AT_low_pc +.b64 $L__tmp7 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source new file mode 100644 index 0000000000000000000000000000000000000000..cdae67b40e78911a51e4fe2359a389cd266df553 --- /dev/null +++ b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source @@ -0,0 +1,972 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc213 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0) +#loc215 = loc(unknown) +#loc218 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0) +#loc222 = loc("in_out_ptr0"(#loc)) +#loc223 = loc("in_out_ptr1"(#loc)) +#loc224 = loc("in_ptr0"(#loc)) +#loc225 = loc("in_ptr1"(#loc)) +#loc226 = loc("in_ptr2"(#loc)) +#loc227 = loc("in_ptr3"(#loc)) +#loc228 = loc("in_ptr4"(#loc)) +#loc229 = loc("xnumel"(#loc)) +#loc230 = loc("r0_numel"(#loc)) +#loc432 = loc("input"(#loc213)) +#loc433 = loc("a"(#loc218)) +#loc434 = loc("b"(#loc218)) +module { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 73728 : i32 loc(#loc231) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc232) + %xoffset = tt.get_program_id x : i32 loc(#loc233) + %xoffset_2 = arith.constant 64 : i32 loc(#loc234) + %xoffset_3 = arith.constant 64 : i32 loc(#loc234) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc234) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc235) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc236) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc237) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc237) + %xmask = arith.constant true loc(#loc238) + %xmask_8 = arith.constant dense : tensor<64x8xi1> loc(#loc238) + %r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc239) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32> -> tensor<1x8xi32> loc(#loc240) + %x0 = arith.constant 32 : i32 loc(#loc241) + %x0_10 = arith.constant 32 : i32 loc(#loc241) + %x0_11 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc241) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc241) + %x1 = arith.constant 32 : i32 loc(#loc242) + %x1_13 = arith.constant 32 : i32 loc(#loc242) + %x1_14 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc242) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc242) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc243) + %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc243) + %_tmp10 = arith.constant 0.000000e+00 : f32 loc(#loc244) + %_tmp10_17 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc244) + %c0_i32 = arith.constant 0 : i32 loc(#loc15) + %c8_i32 = arith.constant 8 : i32 loc(#loc15) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15) + %2 = arith.bitcast %c8_i32 : i32 to i32 loc(#loc15) + %3 = ub.poison : i32 loc(#loc15) + %_tmp10_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_16, %_tmp10_24 = %_tmp10_17) -> (tensor<64x8xf32>, tensor<64x8xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x8xi32> loc(#loc246) + %r0_index_25 = arith.addi %r0_index, %r0_base_9 : tensor<1x8xi32> loc(#loc246) + %r0_mask = arith.constant dense<128> : tensor<1x8xi32> loc(#loc247) + %r0_mask_26 = arith.cmpi slt, %r0_index_25, %r0_mask : tensor<1x8xi32> loc(#loc247) + %tmp0 = arith.constant 4096 : i32 loc(#loc248) + %tmp0_27 = arith.constant 4096 : i32 loc(#loc248) + %tmp0_28 = arith.constant dense<4096> : tensor<1x8xi32> loc(#loc248) + %tmp0_29 = arith.addi %tmp0_28, %r0_index_25 : tensor<1x8xi32> loc(#loc248) + %tmp0_30 = arith.constant 128 : i32 loc(#loc249) + %tmp0_31 = arith.constant 128 : i32 loc(#loc249) + %tmp0_32 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc249) + %tmp0_33 = arith.muli %tmp0_32, %x0_12 : tensor<64x1xi32> loc(#loc249) + %tmp0_34 = tt.broadcast %tmp0_29 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc250) + %tmp0_35 = tt.broadcast %tmp0_33 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc250) + %tmp0_36 = arith.addi %tmp0_34, %tmp0_35 : tensor<64x8xi32> loc(#loc250) + %tmp0_37 = arith.constant 36864 : i32 loc(#loc251) + %tmp0_38 = arith.constant 36864 : i32 loc(#loc251) + %tmp0_39 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc251) + %tmp0_40 = arith.muli %tmp0_39, %x1_15 : tensor<64x1xi32> loc(#loc251) + %tmp0_41 = tt.broadcast %tmp0_40 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc252) + %tmp0_42 = arith.addi %tmp0_36, %tmp0_41 : tensor<64x8xi32> loc(#loc252) + %tmp0_43 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc253) + %tmp0_44 = tt.addptr %tmp0_43, %tmp0_42 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc253) + %tmp0_45 = arith.constant 0.000000e+00 : f32 loc(#loc254) + %tmp0_46 = tt.broadcast %r0_mask_26 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc254) + %tmp0_47 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc254) + %tmp0_48 = arith.truncf %tmp0_47 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc254) + %tmp0_49 = tt.load %tmp0_44, %tmp0_46, %tmp0_48 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc254) + %tmp0_50 = arith.extf %tmp0_49 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc255) + %tmp6 = arith.constant 128 : i32 loc(#loc256) + %tmp6_51 = arith.constant 128 : i32 loc(#loc256) + %tmp6_52 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc256) + %tmp6_53 = arith.muli %tmp6_52, %x0_12 : tensor<64x1xi32> loc(#loc256) + %tmp6_54 = tt.broadcast %r0_index_25 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc257) + %tmp6_55 = tt.broadcast %tmp6_53 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc257) + %tmp6_56 = arith.addi %tmp6_54, %tmp6_55 : tensor<64x8xi32> loc(#loc257) + %tmp6_57 = arith.constant 36864 : i32 loc(#loc258) + %tmp6_58 = arith.constant 36864 : i32 loc(#loc258) + %tmp6_59 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc258) + %tmp6_60 = arith.muli %tmp6_59, %x1_15 : tensor<64x1xi32> loc(#loc258) + %tmp6_61 = tt.broadcast %tmp6_60 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc259) + %tmp6_62 = arith.addi %tmp6_56, %tmp6_61 : tensor<64x8xi32> loc(#loc259) + %tmp6_63 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc260) + %tmp6_64 = tt.addptr %tmp6_63, %tmp6_62 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc260) + %tmp6_65 = arith.constant 0.000000e+00 : f32 loc(#loc261) + %tmp6_66 = tt.broadcast %r0_mask_26 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc261) + %tmp6_67 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc261) + %tmp6_68 = arith.truncf %tmp6_67 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc261) + %tmp6_69 = tt.load %tmp6_64, %tmp6_66, %tmp6_68 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc261) + %tmp6_70 = arith.extf %tmp6_69 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc262) + %tmp2 = arith.mulf %tmp0_50, %tmp0_50 : tensor<64x8xf32> loc(#loc263) + %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<64x8xf32> loc(#loc264) + %_tmp4_71 = tt.broadcast %r0_mask_26 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc265) + %_tmp4_72 = arith.select %_tmp4_71, %tmp5, %_tmp4_23 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc265) + %tmp8 = arith.mulf %tmp6_70, %tmp6_70 : tensor<64x8xf32> loc(#loc266) + %tmp11 = arith.addf %_tmp10_24, %tmp8 : tensor<64x8xf32> loc(#loc267) + %_tmp10_73 = tt.broadcast %r0_mask_26 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc268) + %_tmp10_74 = arith.select %_tmp10_73, %tmp11, %_tmp10_24 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc268) + scf.yield %_tmp4_72, %_tmp10_74 : tensor<64x8xf32>, tensor<64x8xf32> loc(#loc39) + } loc(#loc435) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#0) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc269) + %tmp4_19 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc270) + %tmp10 = tt.call @"triton.language.standard.sum__fp32S64_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#1) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc271) + %tmp10_20 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc272) + %c0_i32_21 = arith.constant 0 : i32 loc(#loc44) + %c8_i32_22 = arith.constant 8 : i32 loc(#loc44) + %4 = arith.bitcast %c0_i32_21 : i32 to i32 loc(#loc44) + %5 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc44) + %6 = arith.bitcast %c8_i32_22 : i32 to i32 loc(#loc44) + %7 = ub.poison : i32 loc(#loc44) + scf.for %r0_offset = %4 to %5 step %6 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x8xi32> loc(#loc273) + %r0_index_23 = arith.addi %r0_index, %r0_base_9 : tensor<1x8xi32> loc(#loc273) + %r0_mask = arith.constant dense<128> : tensor<1x8xi32> loc(#loc274) + %r0_mask_24 = arith.cmpi slt, %r0_index_23, %r0_mask : tensor<1x8xi32> loc(#loc274) + %r0_3 = arith.constant 2 : i32 loc(#loc275) + %r0_3_25 = arith.constant 2 : i32 loc(#loc275) + %r0_3_26 = arith.constant dense<2> : tensor<1x8xi32> loc(#loc275) + %r0_3_27 = arith.remsi %r0_index_23, %r0_3_26 : tensor<1x8xi32> loc(#loc275) + %r0_4 = arith.constant 2 : i32 loc(#loc276) + %r0_4_28 = arith.constant 2 : i32 loc(#loc276) + %r0_4_29 = arith.constant dense<2> : tensor<1x8xi32> loc(#loc276) + %r0_4_30 = arith.divsi %r0_index_23, %r0_4_29 : tensor<1x8xi32> loc(#loc276) + %tmp50 = arith.constant 128 : i32 loc(#loc277) + %tmp50_31 = arith.constant 128 : i32 loc(#loc277) + %tmp50_32 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc277) + %tmp50_33 = arith.muli %tmp50_32, %x0_12 : tensor<64x1xi32> loc(#loc277) + %tmp50_34 = tt.broadcast %r0_index_23 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc278) + %tmp50_35 = tt.broadcast %tmp50_33 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc278) + %tmp50_36 = arith.addi %tmp50_34, %tmp50_35 : tensor<64x8xi32> loc(#loc278) + %tmp50_37 = arith.constant 36864 : i32 loc(#loc279) + %tmp50_38 = arith.constant 36864 : i32 loc(#loc279) + %tmp50_39 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc279) + %tmp50_40 = arith.muli %tmp50_39, %x1_15 : tensor<64x1xi32> loc(#loc279) + %tmp50_41 = tt.broadcast %tmp50_40 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc280) + %tmp50_42 = arith.addi %tmp50_36, %tmp50_41 : tensor<64x8xi32> loc(#loc280) + %tmp50_43 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc281) + %tmp50_44 = tt.addptr %tmp50_43, %tmp50_42 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc281) + %tmp50_45 = arith.constant 0.000000e+00 : f32 loc(#loc282) + %tmp50_46 = tt.broadcast %r0_mask_24 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc282) + %tmp50_47 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc282) + %tmp50_48 = arith.truncf %tmp50_47 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc282) + %tmp50_49 = tt.load %tmp50_44, %tmp50_46, %tmp50_48 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc282) + %tmp50_50 = arith.extf %tmp50_49 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc283) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x8x!tt.ptr> loc(#loc284) + %tmp58_51 = tt.addptr %tmp58, %r0_index_23 : tensor<1x8x!tt.ptr>, tensor<1x8xi32> loc(#loc284) + %tmp58_52 = arith.constant 0.000000e+00 : f32 loc(#loc285) + %tmp58_53 = arith.constant dense<0.000000e+00> : tensor<1x8xf32> loc(#loc285) + %tmp58_54 = arith.truncf %tmp58_53 : tensor<1x8xf32> to tensor<1x8xbf16> loc(#loc285) + %tmp58_55 = tt.load %tmp58_51, %r0_mask_24, %tmp58_54 evictionPolicy = evict_last : tensor<1x8x!tt.ptr> loc(#loc285) + %tmp58_56 = arith.extf %tmp58_55 : tensor<1x8xbf16> to tensor<1x8xf32> loc(#loc286) + %tmp63 = arith.constant 128 : i32 loc(#loc287) + %tmp63_57 = arith.constant 128 : i32 loc(#loc287) + %tmp63_58 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc287) + %tmp63_59 = arith.muli %tmp63_58, %x1_15 : tensor<64x1xi32> loc(#loc287) + %tmp63_60 = tt.broadcast %r0_index_23 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc288) + %tmp63_61 = tt.broadcast %tmp63_59 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc288) + %tmp63_62 = arith.addi %tmp63_60, %tmp63_61 : tensor<64x8xi32> loc(#loc288) + %tmp63_63 = tt.splat %in_ptr2 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc289) + %tmp63_64 = tt.addptr %tmp63_63, %tmp63_62 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc289) + %tmp63_65 = arith.constant 0.000000e+00 : f32 loc(#loc290) + %tmp63_66 = tt.broadcast %r0_mask_24 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc290) + %tmp63_67 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc290) + %tmp63_68 = tt.load %tmp63_64, %tmp63_66, %tmp63_67 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc290) + %tmp66 = arith.constant 128 : i32 loc(#loc291) + %tmp66_69 = arith.constant 128 : i32 loc(#loc291) + %tmp66_70 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc291) + %tmp66_71 = arith.muli %tmp66_70, %x1_15 : tensor<64x1xi32> loc(#loc291) + %tmp66_72 = tt.broadcast %r0_index_23 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc292) + %tmp66_73 = tt.broadcast %tmp66_71 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc292) + %tmp66_74 = arith.addi %tmp66_72, %tmp66_73 : tensor<64x8xi32> loc(#loc292) + %tmp66_75 = tt.splat %in_ptr3 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc293) + %tmp66_76 = tt.addptr %tmp66_75, %tmp66_74 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc293) + %tmp66_77 = arith.constant 0.000000e+00 : f32 loc(#loc294) + %tmp66_78 = tt.broadcast %r0_mask_24 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc294) + %tmp66_79 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc294) + %tmp66_80 = tt.load %tmp66_76, %tmp66_78, %tmp66_79 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc294) + %tmp96 = arith.constant 4096 : i32 loc(#loc295) + %tmp96_81 = arith.constant 4096 : i32 loc(#loc295) + %tmp96_82 = arith.constant dense<4096> : tensor<1x8xi32> loc(#loc295) + %tmp96_83 = arith.addi %tmp96_82, %r0_index_23 : tensor<1x8xi32> loc(#loc295) + %tmp96_84 = arith.constant 128 : i32 loc(#loc296) + %tmp96_85 = arith.constant 128 : i32 loc(#loc296) + %tmp96_86 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc296) + %tmp96_87 = arith.muli %tmp96_86, %x0_12 : tensor<64x1xi32> loc(#loc296) + %tmp96_88 = tt.broadcast %tmp96_83 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc297) + %tmp96_89 = tt.broadcast %tmp96_87 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc297) + %tmp96_90 = arith.addi %tmp96_88, %tmp96_89 : tensor<64x8xi32> loc(#loc297) + %tmp96_91 = arith.constant 36864 : i32 loc(#loc298) + %tmp96_92 = arith.constant 36864 : i32 loc(#loc298) + %tmp96_93 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc298) + %tmp96_94 = arith.muli %tmp96_93, %x1_15 : tensor<64x1xi32> loc(#loc298) + %tmp96_95 = tt.broadcast %tmp96_94 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc299) + %tmp96_96 = arith.addi %tmp96_90, %tmp96_95 : tensor<64x8xi32> loc(#loc299) + %tmp96_97 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc300) + %tmp96_98 = tt.addptr %tmp96_97, %tmp96_96 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc300) + %tmp96_99 = arith.constant 0.000000e+00 : f32 loc(#loc301) + %tmp96_100 = tt.broadcast %r0_mask_24 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc301) + %tmp96_101 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc301) + %tmp96_102 = arith.truncf %tmp96_101 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc301) + %tmp96_103 = tt.load %tmp96_98, %tmp96_100, %tmp96_102 evictionPolicy = evict_first : tensor<64x8x!tt.ptr> loc(#loc301) + %tmp96_104 = arith.extf %tmp96_103 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc302) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x8x!tt.ptr> loc(#loc303) + %tmp102_105 = tt.addptr %tmp102, %r0_index_23 : tensor<1x8x!tt.ptr>, tensor<1x8xi32> loc(#loc303) + %tmp102_106 = arith.constant 0.000000e+00 : f32 loc(#loc304) + %tmp102_107 = arith.constant dense<0.000000e+00> : tensor<1x8xf32> loc(#loc304) + %tmp102_108 = arith.truncf %tmp102_107 : tensor<1x8xf32> to tensor<1x8xbf16> loc(#loc304) + %tmp102_109 = tt.load %tmp102_105, %r0_mask_24, %tmp102_108 evictionPolicy = evict_last : tensor<1x8x!tt.ptr> loc(#loc304) + %tmp102_110 = arith.extf %tmp102_109 : tensor<1x8xbf16> to tensor<1x8xf32> loc(#loc305) + %tmp13 = arith.constant 0 : i64 loc(#loc306) + %tmp13_111 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc306) + %tmp14 = arith.extsi %r0_3_27 : tensor<1x8xi32> to tensor<1x8xi64> loc(#loc307) + %tmp14_112 = arith.constant dense<0> : tensor<1x8xi64> loc(#loc307) + %tmp14_113 = arith.cmpi sge, %tmp14, %tmp14_112 : tensor<1x8xi64> loc(#loc307) + %tmp15 = arith.constant 1 : i64 loc(#loc308) + %tmp15_114 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc308) + %tmp16 = arith.extsi %r0_3_27 : tensor<1x8xi32> to tensor<1x8xi64> loc(#loc309) + %tmp16_115 = arith.constant dense<1> : tensor<1x8xi64> loc(#loc309) + %tmp16_116 = arith.cmpi slt, %tmp16, %tmp16_115 : tensor<1x8xi64> loc(#loc309) + %tmp17 = arith.constant 2 : i32 loc(#loc310) + %tmp17_117 = arith.constant 2 : i32 loc(#loc310) + %tmp17_118 = arith.constant dense<2> : tensor<1x8xi32> loc(#loc310) + %tmp17_119 = arith.muli %tmp17_118, %r0_4_30 : tensor<1x8xi32> loc(#loc310) + %tmp17_120 = arith.constant 1 : i32 loc(#loc311) + %tmp17_121 = arith.constant 1 : i32 loc(#loc311) + %tmp17_122 = arith.constant dense<1> : tensor<1x8xi32> loc(#loc311) + %tmp17_123 = arith.addi %tmp17_122, %tmp17_119 : tensor<1x8xi32> loc(#loc311) + %tmp17_124 = arith.constant 128 : i32 loc(#loc312) + %tmp17_125 = arith.constant 128 : i32 loc(#loc312) + %tmp17_126 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc312) + %tmp17_127 = arith.muli %tmp17_126, %x0_12 : tensor<64x1xi32> loc(#loc312) + %tmp17_128 = tt.broadcast %tmp17_123 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc313) + %tmp17_129 = tt.broadcast %tmp17_127 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc313) + %tmp17_130 = arith.addi %tmp17_128, %tmp17_129 : tensor<64x8xi32> loc(#loc313) + %tmp17_131 = arith.constant 36864 : i32 loc(#loc314) + %tmp17_132 = arith.constant 36864 : i32 loc(#loc314) + %tmp17_133 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc314) + %tmp17_134 = arith.muli %tmp17_133, %x1_15 : tensor<64x1xi32> loc(#loc314) + %tmp17_135 = tt.broadcast %tmp17_134 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc315) + %tmp17_136 = arith.addi %tmp17_130, %tmp17_135 : tensor<64x8xi32> loc(#loc315) + %tmp17_137 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc316) + %tmp17_138 = tt.addptr %tmp17_137, %tmp17_136 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc316) + %tmp17_139 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x8xi1> loc(#loc317) + %tmp17_140 = arith.constant 0.000000e+00 : f32 loc(#loc318) + %tmp17_141 = tt.broadcast %tmp17_139 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc318) + %tmp17_142 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc318) + %tmp17_143 = arith.truncf %tmp17_142 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc318) + %tmp17_144 = tt.load %tmp17_138, %tmp17_141, %tmp17_143 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc318) + %tmp17_145 = arith.extf %tmp17_144 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc319) + %tmp19 = arith.constant 1.280000e+02 : f32 loc(#loc320) + %tmp20 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc321) + %tmp20_146 = arith.divf %tmp10_20, %tmp20 : tensor<64x1xf32> loc(#loc321) + %tmp21 = arith.constant 9.99999997E-7 : f32 loc(#loc322) + %tmp22 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc323) + %tmp22_147 = arith.addf %tmp20_146, %tmp22 : tensor<64x1xf32> loc(#loc323) + %tmp23 = tt.extern_elementwise %tmp22_147 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc324) + %tmp24 = tt.broadcast %tmp23 : tensor<64x1xf32> -> tensor<64x8xf32> loc(#loc325) + %tmp24_148 = arith.mulf %tmp17_145, %tmp24 : tensor<64x8xf32> loc(#loc325) + %tmp25 = arith.constant 2 : i32 loc(#loc326) + %tmp25_149 = arith.constant 2 : i32 loc(#loc326) + %tmp25_150 = arith.constant dense<2> : tensor<1x8xi32> loc(#loc326) + %tmp25_151 = arith.muli %tmp25_150, %r0_4_30 : tensor<1x8xi32> loc(#loc326) + %tmp25_152 = arith.constant 1 : i32 loc(#loc327) + %tmp25_153 = arith.constant 1 : i32 loc(#loc327) + %tmp25_154 = arith.constant dense<1> : tensor<1x8xi32> loc(#loc327) + %tmp25_155 = arith.addi %tmp25_154, %tmp25_151 : tensor<1x8xi32> loc(#loc327) + %tmp25_156 = tt.broadcast %tmp25_155 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc328) + %tmp25_157 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc329) + %tmp25_158 = tt.addptr %tmp25_157, %tmp25_156 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc329) + %tmp25_159 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x8xi1> loc(#loc330) + %tmp25_160 = arith.constant 0.000000e+00 : f32 loc(#loc331) + %tmp25_161 = tt.broadcast %tmp25_159 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc331) + %tmp25_162 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc331) + %tmp25_163 = arith.truncf %tmp25_162 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc331) + %tmp25_164 = tt.load %tmp25_158, %tmp25_161, %tmp25_163 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc331) + %tmp25_165 = arith.extf %tmp25_164 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc332) + %tmp27 = arith.mulf %tmp24_148, %tmp25_165 : tensor<64x8xf32> loc(#loc333) + %tmp29 = arith.constant 0.000000e+00 : f32 loc(#loc334) + %tmp29_166 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc334) + %tmp29_167 = arith.subf %tmp29_166, %tmp27 : tensor<64x8xf32> loc(#loc334) + %tmp30 = arith.constant 0.000000e+00 : f32 loc(#loc335) + %tmp30_168 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc335) + %tmp31 = tt.broadcast %tmp16_116 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc336) + %tmp31_169 = arith.select %tmp31, %tmp29_167, %tmp30_168 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc336) + %tmp32 = arith.extsi %r0_3_27 : tensor<1x8xi32> to tensor<1x8xi64> loc(#loc337) + %tmp32_170 = arith.constant dense<1> : tensor<1x8xi64> loc(#loc337) + %tmp32_171 = arith.cmpi sge, %tmp32, %tmp32_170 : tensor<1x8xi64> loc(#loc337) + %tmp33 = arith.constant 2 : i64 loc(#loc338) + %tmp33_172 = arith.constant dense<2> : tensor<1x1xi64> loc(#loc338) + %tmp34 = arith.extsi %r0_3_27 : tensor<1x8xi32> to tensor<1x8xi64> loc(#loc339) + %tmp34_173 = arith.constant dense<2> : tensor<1x8xi64> loc(#loc339) + %tmp34_174 = arith.cmpi slt, %tmp34, %tmp34_173 : tensor<1x8xi64> loc(#loc339) + %tmp35 = arith.constant 2 : i32 loc(#loc340) + %tmp35_175 = arith.constant 2 : i32 loc(#loc340) + %tmp35_176 = arith.constant dense<2> : tensor<1x8xi32> loc(#loc340) + %tmp35_177 = arith.muli %tmp35_176, %r0_4_30 : tensor<1x8xi32> loc(#loc340) + %tmp35_178 = arith.constant 128 : i32 loc(#loc341) + %tmp35_179 = arith.constant 128 : i32 loc(#loc341) + %tmp35_180 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc341) + %tmp35_181 = arith.muli %tmp35_180, %x0_12 : tensor<64x1xi32> loc(#loc341) + %tmp35_182 = tt.broadcast %tmp35_177 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc342) + %tmp35_183 = tt.broadcast %tmp35_181 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc342) + %tmp35_184 = arith.addi %tmp35_182, %tmp35_183 : tensor<64x8xi32> loc(#loc342) + %tmp35_185 = arith.constant 36864 : i32 loc(#loc343) + %tmp35_186 = arith.constant 36864 : i32 loc(#loc343) + %tmp35_187 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc343) + %tmp35_188 = arith.muli %tmp35_187, %x1_15 : tensor<64x1xi32> loc(#loc343) + %tmp35_189 = tt.broadcast %tmp35_188 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc344) + %tmp35_190 = arith.addi %tmp35_184, %tmp35_189 : tensor<64x8xi32> loc(#loc344) + %tmp35_191 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc345) + %tmp35_192 = tt.addptr %tmp35_191, %tmp35_190 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc345) + %tmp35_193 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x8xi1> loc(#loc346) + %tmp35_194 = arith.constant 0.000000e+00 : f32 loc(#loc347) + %tmp35_195 = tt.broadcast %tmp35_193 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc347) + %tmp35_196 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc347) + %tmp35_197 = arith.truncf %tmp35_196 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc347) + %tmp35_198 = tt.load %tmp35_192, %tmp35_195, %tmp35_197 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc347) + %tmp35_199 = arith.extf %tmp35_198 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc348) + %tmp37 = arith.constant 1.280000e+02 : f32 loc(#loc349) + %tmp38 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc350) + %tmp38_200 = arith.divf %tmp10_20, %tmp38 : tensor<64x1xf32> loc(#loc350) + %tmp39 = arith.constant 9.99999997E-7 : f32 loc(#loc351) + %tmp40 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc352) + %tmp40_201 = arith.addf %tmp38_200, %tmp40 : tensor<64x1xf32> loc(#loc352) + %tmp41 = tt.extern_elementwise %tmp40_201 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc353) + %tmp42 = tt.broadcast %tmp41 : tensor<64x1xf32> -> tensor<64x8xf32> loc(#loc354) + %tmp42_202 = arith.mulf %tmp35_199, %tmp42 : tensor<64x8xf32> loc(#loc354) + %tmp43 = arith.constant 2 : i32 loc(#loc355) + %tmp43_203 = arith.constant 2 : i32 loc(#loc355) + %tmp43_204 = arith.constant dense<2> : tensor<1x8xi32> loc(#loc355) + %tmp43_205 = arith.muli %tmp43_204, %r0_4_30 : tensor<1x8xi32> loc(#loc355) + %tmp43_206 = tt.broadcast %tmp43_205 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc356) + %tmp43_207 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc357) + %tmp43_208 = tt.addptr %tmp43_207, %tmp43_206 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc357) + %tmp43_209 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x8xi1> loc(#loc358) + %tmp43_210 = arith.constant 0.000000e+00 : f32 loc(#loc359) + %tmp43_211 = tt.broadcast %tmp43_209 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc359) + %tmp43_212 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc359) + %tmp43_213 = arith.truncf %tmp43_212 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc359) + %tmp43_214 = tt.load %tmp43_208, %tmp43_211, %tmp43_213 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc359) + %tmp43_215 = arith.extf %tmp43_214 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc360) + %tmp45 = arith.mulf %tmp42_202, %tmp43_215 : tensor<64x8xf32> loc(#loc361) + %tmp47 = arith.constant 0.000000e+00 : f32 loc(#loc362) + %tmp47_216 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc362) + %tmp48 = tt.broadcast %tmp32_171 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc363) + %tmp48_217 = arith.select %tmp48, %tmp45, %tmp47_216 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc363) + %tmp49 = tt.broadcast %tmp16_116 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc364) + %tmp49_218 = arith.select %tmp49, %tmp31_169, %tmp48_217 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc364) + %tmp52 = arith.constant 1.280000e+02 : f32 loc(#loc365) + %tmp53 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc366) + %tmp53_219 = arith.divf %tmp10_20, %tmp53 : tensor<64x1xf32> loc(#loc366) + %tmp54 = arith.constant 9.99999997E-7 : f32 loc(#loc367) + %tmp55 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc368) + %tmp55_220 = arith.addf %tmp53_219, %tmp55 : tensor<64x1xf32> loc(#loc368) + %tmp56 = tt.extern_elementwise %tmp55_220 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc369) + %tmp57 = tt.broadcast %tmp56 : tensor<64x1xf32> -> tensor<64x8xf32> loc(#loc370) + %tmp57_221 = arith.mulf %tmp50_50, %tmp57 : tensor<64x8xf32> loc(#loc370) + %tmp60 = tt.broadcast %tmp58_56 : tensor<1x8xf32> -> tensor<64x8xf32> loc(#loc371) + %tmp60_222 = arith.mulf %tmp57_221, %tmp60 : tensor<64x8xf32> loc(#loc371) + %tmp64 = arith.mulf %tmp60_222, %tmp63_68 : tensor<64x8xf32> loc(#loc372) + %tmp67 = arith.mulf %tmp49_218, %tmp66_80 : tensor<64x8xf32> loc(#loc373) + %tmp68 = arith.addf %tmp64, %tmp67 : tensor<64x8xf32> loc(#loc374) + %tmp70 = arith.constant 2 : i32 loc(#loc375) + %tmp70_223 = arith.constant 2 : i32 loc(#loc375) + %tmp70_224 = arith.constant dense<2> : tensor<1x8xi32> loc(#loc375) + %tmp70_225 = arith.muli %tmp70_224, %r0_4_30 : tensor<1x8xi32> loc(#loc375) + %tmp70_226 = arith.constant 4097 : i32 loc(#loc376) + %tmp70_227 = arith.constant 4097 : i32 loc(#loc376) + %tmp70_228 = arith.constant dense<4097> : tensor<1x8xi32> loc(#loc376) + %tmp70_229 = arith.addi %tmp70_228, %tmp70_225 : tensor<1x8xi32> loc(#loc376) + %tmp70_230 = arith.constant 128 : i32 loc(#loc377) + %tmp70_231 = arith.constant 128 : i32 loc(#loc377) + %tmp70_232 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc377) + %tmp70_233 = arith.muli %tmp70_232, %x0_12 : tensor<64x1xi32> loc(#loc377) + %tmp70_234 = tt.broadcast %tmp70_229 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc378) + %tmp70_235 = tt.broadcast %tmp70_233 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc378) + %tmp70_236 = arith.addi %tmp70_234, %tmp70_235 : tensor<64x8xi32> loc(#loc378) + %tmp70_237 = arith.constant 36864 : i32 loc(#loc379) + %tmp70_238 = arith.constant 36864 : i32 loc(#loc379) + %tmp70_239 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc379) + %tmp70_240 = arith.muli %tmp70_239, %x1_15 : tensor<64x1xi32> loc(#loc379) + %tmp70_241 = tt.broadcast %tmp70_240 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc380) + %tmp70_242 = arith.addi %tmp70_236, %tmp70_241 : tensor<64x8xi32> loc(#loc380) + %tmp70_243 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc381) + %tmp70_244 = tt.addptr %tmp70_243, %tmp70_242 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc381) + %tmp70_245 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x8xi1> loc(#loc382) + %tmp70_246 = arith.constant 0.000000e+00 : f32 loc(#loc383) + %tmp70_247 = tt.broadcast %tmp70_245 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc383) + %tmp70_248 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc383) + %tmp70_249 = arith.truncf %tmp70_248 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc383) + %tmp70_250 = tt.load %tmp70_244, %tmp70_247, %tmp70_249 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc383) + %tmp70_251 = arith.extf %tmp70_250 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc384) + %tmp72 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc385) + %tmp72_252 = arith.divf %tmp4_19, %tmp72 : tensor<64x1xf32> loc(#loc385) + %tmp73 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc386) + %tmp73_253 = arith.addf %tmp72_252, %tmp73 : tensor<64x1xf32> loc(#loc386) + %tmp74 = tt.extern_elementwise %tmp73_253 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc387) + %tmp75 = tt.broadcast %tmp74 : tensor<64x1xf32> -> tensor<64x8xf32> loc(#loc388) + %tmp75_254 = arith.mulf %tmp70_251, %tmp75 : tensor<64x8xf32> loc(#loc388) + %tmp76 = arith.constant 2 : i32 loc(#loc389) + %tmp76_255 = arith.constant 2 : i32 loc(#loc389) + %tmp76_256 = arith.constant dense<2> : tensor<1x8xi32> loc(#loc389) + %tmp76_257 = arith.muli %tmp76_256, %r0_4_30 : tensor<1x8xi32> loc(#loc389) + %tmp76_258 = arith.constant 1 : i32 loc(#loc390) + %tmp76_259 = arith.constant 1 : i32 loc(#loc390) + %tmp76_260 = arith.constant dense<1> : tensor<1x8xi32> loc(#loc390) + %tmp76_261 = arith.addi %tmp76_260, %tmp76_257 : tensor<1x8xi32> loc(#loc390) + %tmp76_262 = tt.broadcast %tmp76_261 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc391) + %tmp76_263 = tt.splat %in_ptr4 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc392) + %tmp76_264 = tt.addptr %tmp76_263, %tmp76_262 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc392) + %tmp76_265 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x8xi1> loc(#loc393) + %tmp76_266 = arith.constant 0.000000e+00 : f32 loc(#loc394) + %tmp76_267 = tt.broadcast %tmp76_265 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc394) + %tmp76_268 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc394) + %tmp76_269 = arith.truncf %tmp76_268 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc394) + %tmp76_270 = tt.load %tmp76_264, %tmp76_267, %tmp76_269 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc394) + %tmp76_271 = arith.extf %tmp76_270 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc395) + %tmp78 = arith.mulf %tmp75_254, %tmp76_271 : tensor<64x8xf32> loc(#loc396) + %tmp80 = arith.constant 0.000000e+00 : f32 loc(#loc397) + %tmp80_272 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc397) + %tmp80_273 = arith.subf %tmp80_272, %tmp78 : tensor<64x8xf32> loc(#loc397) + %tmp81 = arith.constant 0.000000e+00 : f32 loc(#loc398) + %tmp81_274 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc398) + %tmp82 = tt.broadcast %tmp16_116 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc399) + %tmp82_275 = arith.select %tmp82, %tmp80_273, %tmp81_274 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc399) + %tmp83 = arith.constant 2 : i32 loc(#loc400) + %tmp83_276 = arith.constant 2 : i32 loc(#loc400) + %tmp83_277 = arith.constant dense<2> : tensor<1x8xi32> loc(#loc400) + %tmp83_278 = arith.muli %tmp83_277, %r0_4_30 : tensor<1x8xi32> loc(#loc400) + %tmp83_279 = arith.constant 4096 : i32 loc(#loc401) + %tmp83_280 = arith.constant 4096 : i32 loc(#loc401) + %tmp83_281 = arith.constant dense<4096> : tensor<1x8xi32> loc(#loc401) + %tmp83_282 = arith.addi %tmp83_281, %tmp83_278 : tensor<1x8xi32> loc(#loc401) + %tmp83_283 = arith.constant 128 : i32 loc(#loc402) + %tmp83_284 = arith.constant 128 : i32 loc(#loc402) + %tmp83_285 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc402) + %tmp83_286 = arith.muli %tmp83_285, %x0_12 : tensor<64x1xi32> loc(#loc402) + %tmp83_287 = tt.broadcast %tmp83_282 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc403) + %tmp83_288 = tt.broadcast %tmp83_286 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc403) + %tmp83_289 = arith.addi %tmp83_287, %tmp83_288 : tensor<64x8xi32> loc(#loc403) + %tmp83_290 = arith.constant 36864 : i32 loc(#loc404) + %tmp83_291 = arith.constant 36864 : i32 loc(#loc404) + %tmp83_292 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc404) + %tmp83_293 = arith.muli %tmp83_292, %x1_15 : tensor<64x1xi32> loc(#loc404) + %tmp83_294 = tt.broadcast %tmp83_293 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc405) + %tmp83_295 = arith.addi %tmp83_289, %tmp83_294 : tensor<64x8xi32> loc(#loc405) + %tmp83_296 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc406) + %tmp83_297 = tt.addptr %tmp83_296, %tmp83_295 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc406) + %tmp83_298 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x8xi1> loc(#loc407) + %tmp83_299 = arith.constant 0.000000e+00 : f32 loc(#loc408) + %tmp83_300 = tt.broadcast %tmp83_298 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc408) + %tmp83_301 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc408) + %tmp83_302 = arith.truncf %tmp83_301 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc408) + %tmp83_303 = tt.load %tmp83_297, %tmp83_300, %tmp83_302 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc408) + %tmp83_304 = arith.extf %tmp83_303 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc409) + %tmp85 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc410) + %tmp85_305 = arith.divf %tmp4_19, %tmp85 : tensor<64x1xf32> loc(#loc410) + %tmp86 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc411) + %tmp86_306 = arith.addf %tmp85_305, %tmp86 : tensor<64x1xf32> loc(#loc411) + %tmp87 = tt.extern_elementwise %tmp86_306 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc412) + %tmp88 = tt.broadcast %tmp87 : tensor<64x1xf32> -> tensor<64x8xf32> loc(#loc413) + %tmp88_307 = arith.mulf %tmp83_304, %tmp88 : tensor<64x8xf32> loc(#loc413) + %tmp89 = arith.constant 2 : i32 loc(#loc414) + %tmp89_308 = arith.constant 2 : i32 loc(#loc414) + %tmp89_309 = arith.constant dense<2> : tensor<1x8xi32> loc(#loc414) + %tmp89_310 = arith.muli %tmp89_309, %r0_4_30 : tensor<1x8xi32> loc(#loc414) + %tmp89_311 = tt.broadcast %tmp89_310 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc415) + %tmp89_312 = tt.splat %in_ptr4 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc416) + %tmp89_313 = tt.addptr %tmp89_312, %tmp89_311 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc416) + %tmp89_314 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x8xi1> loc(#loc417) + %tmp89_315 = arith.constant 0.000000e+00 : f32 loc(#loc418) + %tmp89_316 = tt.broadcast %tmp89_314 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc418) + %tmp89_317 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc418) + %tmp89_318 = arith.truncf %tmp89_317 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc418) + %tmp89_319 = tt.load %tmp89_313, %tmp89_316, %tmp89_318 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc418) + %tmp89_320 = arith.extf %tmp89_319 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc419) + %tmp91 = arith.mulf %tmp88_307, %tmp89_320 : tensor<64x8xf32> loc(#loc420) + %tmp93 = arith.constant 0.000000e+00 : f32 loc(#loc421) + %tmp93_321 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc421) + %tmp94 = tt.broadcast %tmp32_171 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc422) + %tmp94_322 = arith.select %tmp94, %tmp91, %tmp93_321 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc422) + %tmp95 = tt.broadcast %tmp16_116 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc423) + %tmp95_323 = arith.select %tmp95, %tmp82_275, %tmp94_322 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc423) + %tmp98 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc424) + %tmp98_324 = arith.divf %tmp4_19, %tmp98 : tensor<64x1xf32> loc(#loc424) + %tmp99 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc425) + %tmp99_325 = arith.addf %tmp98_324, %tmp99 : tensor<64x1xf32> loc(#loc425) + %tmp100 = tt.extern_elementwise %tmp99_325 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc426) + %tmp101 = tt.broadcast %tmp100 : tensor<64x1xf32> -> tensor<64x8xf32> loc(#loc427) + %tmp101_326 = arith.mulf %tmp96_104, %tmp101 : tensor<64x8xf32> loc(#loc427) + %tmp104 = tt.broadcast %tmp102_110 : tensor<1x8xf32> -> tensor<64x8xf32> loc(#loc428) + %tmp104_327 = arith.mulf %tmp101_326, %tmp104 : tensor<64x8xf32> loc(#loc428) + %tmp107 = arith.mulf %tmp104_327, %tmp63_68 : tensor<64x8xf32> loc(#loc429) + %tmp109 = arith.mulf %tmp95_323, %tmp66_80 : tensor<64x8xf32> loc(#loc430) + %tmp110 = arith.addf %tmp107, %tmp109 : tensor<64x8xf32> loc(#loc431) + %c128_i32 = arith.constant 128 : i32 loc(#loc204) + %c128_i32_328 = arith.constant 128 : i32 loc(#loc204) + %cst = arith.constant dense<128> : tensor<64x1xi32> loc(#loc204) + %8 = arith.muli %cst, %xindex_7 : tensor<64x1xi32> loc(#loc204) + %9 = tt.broadcast %r0_index_23 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc205) + %10 = tt.broadcast %8 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc205) + %11 = arith.addi %9, %10 : tensor<64x8xi32> loc(#loc205) + %12 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc206) + %13 = tt.addptr %12, %11 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc206) + %14 = tt.broadcast %r0_mask_24 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc207) + %15 = arith.truncf %tmp68 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc207) + tt.store %13, %15, %14 : tensor<64x8x!tt.ptr> loc(#loc207) + %c128_i32_329 = arith.constant 128 : i32 loc(#loc208) + %c128_i32_330 = arith.constant 128 : i32 loc(#loc208) + %cst_331 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc208) + %16 = arith.muli %cst_331, %xindex_7 : tensor<64x1xi32> loc(#loc208) + %17 = tt.broadcast %r0_index_23 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc209) + %18 = tt.broadcast %16 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc209) + %19 = arith.addi %17, %18 : tensor<64x8xi32> loc(#loc209) + %20 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc210) + %21 = tt.addptr %20, %19 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc210) + %22 = tt.broadcast %r0_mask_24 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc211) + %23 = arith.truncf %tmp110 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc211) + tt.store %21, %23, %22 : tensor<64x8x!tt.ptr> loc(#loc211) + } loc(#loc44) + tt.return loc(#loc212) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S64_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x8xf32> loc("input"(#loc213))) -> tensor<64xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc214) + tt.reduce.return %2 : f32 loc(#loc214) + }) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc214) + tt.return %0 : tensor<64xf32> loc(#loc216) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64xf32> loc(#loc217) + tt.return %1 : tensor<64xf32> loc(#loc217) + } loc(#loc213) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc218)), %b: f32 loc("b"(#loc218))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc219) + tt.return %0 : f32 loc(#loc220) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc221) + tt.return %1 : f32 loc(#loc221) + } loc(#loc218) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":25:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":30:43) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":32:44) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:45) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:56) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:46) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:42) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:53) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:64) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":68:35) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":69:25) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":70:35) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:52) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:63) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":74:16) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":76:16) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:57) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:55) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:63) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:95) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":85:42) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":88:35) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":89:24) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:37) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:48) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:59) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":92:16) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":93:25) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":94:16) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":95:24) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":96:32) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:53) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:59) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:91) +#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":102:42) +#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":106:16) +#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":107:25) +#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":108:16) +#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":109:24) +#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":110:32) +#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:44) +#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc149 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:55) +#loc150 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc151 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:66) +#loc152 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc153 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc154 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:81) +#loc155 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc156 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc157 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc158 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc159 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc160 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc161 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:57) +#loc162 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:55) +#loc163 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:63) +#loc164 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc165 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:95) +#loc166 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc167 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc168 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc169 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc170 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":132:42) +#loc171 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc172 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:44) +#loc173 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc174 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:55) +#loc175 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc176 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:66) +#loc177 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc178 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc179 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:81) +#loc180 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc181 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc182 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":136:24) +#loc183 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":137:24) +#loc184 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":138:32) +#loc185 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc186 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:53) +#loc187 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:59) +#loc188 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc189 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:91) +#loc190 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc191 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc192 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc193 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":144:42) +#loc194 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc195 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc196 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":148:24) +#loc197 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":149:24) +#loc198 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":150:33) +#loc199 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc200 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc201 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc202 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc203 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc204 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc205 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc206 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc207 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc208 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:43) +#loc209 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:39) +#loc210 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc211 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc212 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc214 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc216 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11) +#loc217 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4) +#loc219 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc220 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11) +#loc221 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4) +#loc231 = loc("xnumel"(#loc1)) +#loc232 = loc("r0_numel"(#loc2)) +#loc233 = loc("xoffset"(#loc3)) +#loc234 = loc("xoffset"(#loc4)) +#loc235 = loc("xindex"(#loc5)) +#loc236 = loc("xindex"(#loc6)) +#loc237 = loc("xindex"(#loc7)) +#loc238 = loc("xmask"(#loc8)) +#loc239 = loc("r0_base"(#loc9)) +#loc240 = loc("r0_base"(#loc10)) +#loc241 = loc("x0"(#loc11)) +#loc242 = loc("x1"(#loc12)) +#loc243 = loc("_tmp4"(#loc13)) +#loc244 = loc("_tmp10"(#loc14)) +#loc245 = loc("_tmp4"(#loc15)) +#loc246 = loc("r0_index"(#loc16)) +#loc247 = loc("r0_mask"(#loc17)) +#loc248 = loc("tmp0"(#loc18)) +#loc249 = loc("tmp0"(#loc19)) +#loc250 = loc("tmp0"(#loc20)) +#loc251 = loc("tmp0"(#loc21)) +#loc252 = loc("tmp0"(#loc22)) +#loc253 = loc("tmp0"(#loc23)) +#loc254 = loc("tmp0"(#loc24)) +#loc255 = loc("tmp0"(#loc25)) +#loc256 = loc("tmp6"(#loc26)) +#loc257 = loc("tmp6"(#loc27)) +#loc258 = loc("tmp6"(#loc28)) +#loc259 = loc("tmp6"(#loc29)) +#loc260 = loc("tmp6"(#loc30)) +#loc261 = loc("tmp6"(#loc31)) +#loc262 = loc("tmp6"(#loc32)) +#loc263 = loc("tmp2"(#loc33)) +#loc264 = loc("tmp5"(#loc34)) +#loc265 = loc("_tmp4"(#loc35)) +#loc266 = loc("tmp8"(#loc36)) +#loc267 = loc("tmp11"(#loc37)) +#loc268 = loc("_tmp10"(#loc38)) +#loc269 = loc("tmp4"(#loc40)) +#loc270 = loc("tmp4"(#loc41)) +#loc271 = loc("tmp10"(#loc42)) +#loc272 = loc("tmp10"(#loc43)) +#loc273 = loc("r0_index"(#loc45)) +#loc274 = loc("r0_mask"(#loc46)) +#loc275 = loc("r0_3"(#loc47)) +#loc276 = loc("r0_4"(#loc48)) +#loc277 = loc("tmp50"(#loc49)) +#loc278 = loc("tmp50"(#loc50)) +#loc279 = loc("tmp50"(#loc51)) +#loc280 = loc("tmp50"(#loc52)) +#loc281 = loc("tmp50"(#loc53)) +#loc282 = loc("tmp50"(#loc54)) +#loc283 = loc("tmp50"(#loc55)) +#loc284 = loc("tmp58"(#loc56)) +#loc285 = loc("tmp58"(#loc57)) +#loc286 = loc("tmp58"(#loc58)) +#loc287 = loc("tmp63"(#loc59)) +#loc288 = loc("tmp63"(#loc60)) +#loc289 = loc("tmp63"(#loc61)) +#loc290 = loc("tmp63"(#loc62)) +#loc291 = loc("tmp66"(#loc63)) +#loc292 = loc("tmp66"(#loc64)) +#loc293 = loc("tmp66"(#loc65)) +#loc294 = loc("tmp66"(#loc66)) +#loc295 = loc("tmp96"(#loc67)) +#loc296 = loc("tmp96"(#loc68)) +#loc297 = loc("tmp96"(#loc69)) +#loc298 = loc("tmp96"(#loc70)) +#loc299 = loc("tmp96"(#loc71)) +#loc300 = loc("tmp96"(#loc72)) +#loc301 = loc("tmp96"(#loc73)) +#loc302 = loc("tmp96"(#loc74)) +#loc303 = loc("tmp102"(#loc75)) +#loc304 = loc("tmp102"(#loc76)) +#loc305 = loc("tmp102"(#loc77)) +#loc306 = loc("tmp13"(#loc78)) +#loc307 = loc("tmp14"(#loc79)) +#loc308 = loc("tmp15"(#loc80)) +#loc309 = loc("tmp16"(#loc81)) +#loc310 = loc("tmp17"(#loc82)) +#loc311 = loc("tmp17"(#loc83)) +#loc312 = loc("tmp17"(#loc84)) +#loc313 = loc("tmp17"(#loc85)) +#loc314 = loc("tmp17"(#loc86)) +#loc315 = loc("tmp17"(#loc87)) +#loc316 = loc("tmp17"(#loc88)) +#loc317 = loc("tmp17"(#loc89)) +#loc318 = loc("tmp17"(#loc90)) +#loc319 = loc("tmp17"(#loc91)) +#loc320 = loc("tmp19"(#loc92)) +#loc321 = loc("tmp20"(#loc93)) +#loc322 = loc("tmp21"(#loc94)) +#loc323 = loc("tmp22"(#loc95)) +#loc324 = loc("tmp23"(#loc96)) +#loc325 = loc("tmp24"(#loc97)) +#loc326 = loc("tmp25"(#loc98)) +#loc327 = loc("tmp25"(#loc99)) +#loc328 = loc("tmp25"(#loc100)) +#loc329 = loc("tmp25"(#loc101)) +#loc330 = loc("tmp25"(#loc102)) +#loc331 = loc("tmp25"(#loc103)) +#loc332 = loc("tmp25"(#loc104)) +#loc333 = loc("tmp27"(#loc105)) +#loc334 = loc("tmp29"(#loc106)) +#loc335 = loc("tmp30"(#loc107)) +#loc336 = loc("tmp31"(#loc108)) +#loc337 = loc("tmp32"(#loc109)) +#loc338 = loc("tmp33"(#loc110)) +#loc339 = loc("tmp34"(#loc111)) +#loc340 = loc("tmp35"(#loc112)) +#loc341 = loc("tmp35"(#loc113)) +#loc342 = loc("tmp35"(#loc114)) +#loc343 = loc("tmp35"(#loc115)) +#loc344 = loc("tmp35"(#loc116)) +#loc345 = loc("tmp35"(#loc117)) +#loc346 = loc("tmp35"(#loc118)) +#loc347 = loc("tmp35"(#loc119)) +#loc348 = loc("tmp35"(#loc120)) +#loc349 = loc("tmp37"(#loc121)) +#loc350 = loc("tmp38"(#loc122)) +#loc351 = loc("tmp39"(#loc123)) +#loc352 = loc("tmp40"(#loc124)) +#loc353 = loc("tmp41"(#loc125)) +#loc354 = loc("tmp42"(#loc126)) +#loc355 = loc("tmp43"(#loc127)) +#loc356 = loc("tmp43"(#loc128)) +#loc357 = loc("tmp43"(#loc129)) +#loc358 = loc("tmp43"(#loc130)) +#loc359 = loc("tmp43"(#loc131)) +#loc360 = loc("tmp43"(#loc132)) +#loc361 = loc("tmp45"(#loc133)) +#loc362 = loc("tmp47"(#loc134)) +#loc363 = loc("tmp48"(#loc135)) +#loc364 = loc("tmp49"(#loc136)) +#loc365 = loc("tmp52"(#loc137)) +#loc366 = loc("tmp53"(#loc138)) +#loc367 = loc("tmp54"(#loc139)) +#loc368 = loc("tmp55"(#loc140)) +#loc369 = loc("tmp56"(#loc141)) +#loc370 = loc("tmp57"(#loc142)) +#loc371 = loc("tmp60"(#loc143)) +#loc372 = loc("tmp64"(#loc144)) +#loc373 = loc("tmp67"(#loc145)) +#loc374 = loc("tmp68"(#loc146)) +#loc375 = loc("tmp70"(#loc147)) +#loc376 = loc("tmp70"(#loc148)) +#loc377 = loc("tmp70"(#loc149)) +#loc378 = loc("tmp70"(#loc150)) +#loc379 = loc("tmp70"(#loc151)) +#loc380 = loc("tmp70"(#loc152)) +#loc381 = loc("tmp70"(#loc153)) +#loc382 = loc("tmp70"(#loc154)) +#loc383 = loc("tmp70"(#loc155)) +#loc384 = loc("tmp70"(#loc156)) +#loc385 = loc("tmp72"(#loc157)) +#loc386 = loc("tmp73"(#loc158)) +#loc387 = loc("tmp74"(#loc159)) +#loc388 = loc("tmp75"(#loc160)) +#loc389 = loc("tmp76"(#loc161)) +#loc390 = loc("tmp76"(#loc162)) +#loc391 = loc("tmp76"(#loc163)) +#loc392 = loc("tmp76"(#loc164)) +#loc393 = loc("tmp76"(#loc165)) +#loc394 = loc("tmp76"(#loc166)) +#loc395 = loc("tmp76"(#loc167)) +#loc396 = loc("tmp78"(#loc168)) +#loc397 = loc("tmp80"(#loc169)) +#loc398 = loc("tmp81"(#loc170)) +#loc399 = loc("tmp82"(#loc171)) +#loc400 = loc("tmp83"(#loc172)) +#loc401 = loc("tmp83"(#loc173)) +#loc402 = loc("tmp83"(#loc174)) +#loc403 = loc("tmp83"(#loc175)) +#loc404 = loc("tmp83"(#loc176)) +#loc405 = loc("tmp83"(#loc177)) +#loc406 = loc("tmp83"(#loc178)) +#loc407 = loc("tmp83"(#loc179)) +#loc408 = loc("tmp83"(#loc180)) +#loc409 = loc("tmp83"(#loc181)) +#loc410 = loc("tmp85"(#loc182)) +#loc411 = loc("tmp86"(#loc183)) +#loc412 = loc("tmp87"(#loc184)) +#loc413 = loc("tmp88"(#loc185)) +#loc414 = loc("tmp89"(#loc186)) +#loc415 = loc("tmp89"(#loc187)) +#loc416 = loc("tmp89"(#loc188)) +#loc417 = loc("tmp89"(#loc189)) +#loc418 = loc("tmp89"(#loc190)) +#loc419 = loc("tmp89"(#loc191)) +#loc420 = loc("tmp91"(#loc192)) +#loc421 = loc("tmp93"(#loc193)) +#loc422 = loc("tmp94"(#loc194)) +#loc423 = loc("tmp95"(#loc195)) +#loc424 = loc("tmp98"(#loc196)) +#loc425 = loc("tmp99"(#loc197)) +#loc426 = loc("tmp100"(#loc198)) +#loc427 = loc("tmp101"(#loc199)) +#loc428 = loc("tmp104"(#loc200)) +#loc429 = loc("tmp107"(#loc201)) +#loc430 = loc("tmp109"(#loc202)) +#loc431 = loc("tmp110"(#loc203)) +#loc435 = loc("_tmp10"(#loc245)) diff --git a/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..cf51ed3e1bf1a0cb2598cdf88fcb480be31ee33b --- /dev/null +++ b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir @@ -0,0 +1,547 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc1 = loc(unknown) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc147 = loc("in_out_ptr0"(#loc)) +#loc148 = loc("in_out_ptr1"(#loc)) +#loc149 = loc("in_ptr0"(#loc)) +#loc150 = loc("in_ptr1"(#loc)) +#loc151 = loc("in_ptr2"(#loc)) +#loc152 = loc("in_ptr3"(#loc)) +#loc153 = loc("in_ptr4"(#loc)) +#loc154 = loc("xnumel"(#loc)) +#loc155 = loc("r0_numel"(#loc)) +#loc185 = loc("tmp4"(#loc33)) +#loc187 = loc("tmp10"(#loc36)) +#loc292 = loc(callsite(#loc1 at #loc185)) +#loc294 = loc(callsite(#loc1 at #loc187)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4097> : tensor<1x8xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x8xbf16, #blocked1> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1x8xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<1> : tensor<1x8xi64, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<2> : tensor<1x8xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<36864> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<36864> : tensor<64x1xi32, #blocked1> loc(#loc1) + %cst_6 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<128> : tensor<64x1xi32, #blocked1> loc(#loc1) + %cst_8 = arith.constant dense<4096> : tensor<1x8xi32, #blocked> loc(#loc1) + %cst_9 = arith.constant dense<4096> : tensor<1x8xi32, #blocked1> loc(#loc1) + %cst_10 = arith.constant dense<128> : tensor<1x8xi32, #blocked> loc(#loc1) + %cst_11 = arith.constant dense<128> : tensor<1x8xi32, #blocked1> loc(#loc1) + %cst_12 = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_13 = arith.constant dense<32> : tensor<64x1xi32, #blocked1> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %cst_14 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked1> loc(#loc1) + %cst_15 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_16 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32, #blocked1> loc(#loc1) + %cst_17 = arith.constant dense<1.280000e+02> : tensor<64x1xf32, #blocked1> loc(#loc1) + %cst_18 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked> loc(#loc1) + %cst_19 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked1> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc156) + %xoffset_20 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc157) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc158) + %xindex_21 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc158) + %xindex_22 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc158) + %xindex_23 = tt.expand_dims %xindex_21 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc158) + %xindex_24 = tt.splat %xoffset_20 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc159) + %xindex_25 = tt.splat %xoffset_20 : i32 -> tensor<64x1xi32, #blocked> loc(#loc159) + %xindex_26 = arith.addi %xindex_24, %xindex_22 : tensor<64x1xi32, #blocked1> loc(#loc159) + %xindex_27 = arith.addi %xindex_25, %xindex_23 : tensor<64x1xi32, #blocked> loc(#loc159) + %r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc160) + %r0_base_28 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc160) + %r0_base_29 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x8xi32, #blocked1> loc(#loc160) + %r0_base_30 = tt.expand_dims %r0_base_28 {axis = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x8xi32, #blocked> loc(#loc160) + %x0 = arith.remsi %xindex_26, %cst_13 : tensor<64x1xi32, #blocked1> loc(#loc161) + %x0_31 = arith.remsi %xindex_27, %cst_12 : tensor<64x1xi32, #blocked> loc(#loc161) + %x1 = arith.divsi %xindex_26, %cst_13 : tensor<64x1xi32, #blocked1> loc(#loc162) + %x1_32 = arith.divsi %xindex_27, %cst_12 : tensor<64x1xi32, #blocked> loc(#loc162) + %tmp0 = arith.muli %x0, %cst_7 : tensor<64x1xi32, #blocked1> loc(#loc163) + %tmp0_33 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked1> -> tensor<64x8xi32, #blocked1> loc(#loc164) + %tmp0_34 = arith.muli %x1, %cst_5 : tensor<64x1xi32, #blocked1> loc(#loc165) + %tmp0_35 = tt.broadcast %tmp0_34 : tensor<64x1xi32, #blocked1> -> tensor<64x8xi32, #blocked1> loc(#loc166) + %tmp0_36 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x8x!tt.ptr, #blocked1> loc(#loc167) + %_tmp10:2 = scf.for %_tmp10_51 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg10 = %cst_19, %arg11 = %cst_19) -> (tensor<64x8xf32, #blocked1>, tensor<64x8xf32, #blocked1>) : i32 { + %r0_index = tt.splat %_tmp10_51 : i32 -> tensor<1x8xi32, #blocked1> loc(#loc169) + %r0_index_52 = arith.addi %r0_index, %r0_base_29 : tensor<1x8xi32, #blocked1> loc(#loc169) + %r0_mask = arith.cmpi slt, %r0_index_52, %cst_11 : tensor<1x8xi32, #blocked1> loc(#loc170) + %tmp0_53 = arith.addi %r0_index_52, %cst_9 : tensor<1x8xi32, #blocked1> loc(#loc171) + %tmp0_54 = tt.broadcast %tmp0_53 : tensor<1x8xi32, #blocked1> -> tensor<64x8xi32, #blocked1> loc(#loc164) + %tmp0_55 = arith.addi %tmp0_54, %tmp0_33 : tensor<64x8xi32, #blocked1> loc(#loc164) + %tmp0_56 = arith.addi %tmp0_55, %tmp0_35 : tensor<64x8xi32, #blocked1> loc(#loc166) + %tmp0_57 = tt.addptr %tmp0_36, %tmp0_56 : tensor<64x8x!tt.ptr, #blocked1>, tensor<64x8xi32, #blocked1> loc(#loc167) + %tmp0_58 = tt.broadcast %r0_mask : tensor<1x8xi1, #blocked1> -> tensor<64x8xi1, #blocked1> loc(#loc172) + %tmp0_59 = tt.load %tmp0_57, %tmp0_58, %cst_14 evictionPolicy = evict_last : tensor<64x8x!tt.ptr, #blocked1> loc(#loc172) + %tmp0_60 = arith.extf %tmp0_59 : tensor<64x8xbf16, #blocked1> to tensor<64x8xf32, #blocked1> loc(#loc173) + %tmp6 = tt.broadcast %r0_index_52 : tensor<1x8xi32, #blocked1> -> tensor<64x8xi32, #blocked1> loc(#loc174) + %tmp6_61 = arith.addi %tmp6, %tmp0_33 : tensor<64x8xi32, #blocked1> loc(#loc174) + %tmp6_62 = arith.addi %tmp6_61, %tmp0_35 : tensor<64x8xi32, #blocked1> loc(#loc175) + %tmp6_63 = tt.addptr %tmp0_36, %tmp6_62 : tensor<64x8x!tt.ptr, #blocked1>, tensor<64x8xi32, #blocked1> loc(#loc176) + %tmp6_64 = tt.load %tmp6_63, %tmp0_58, %cst_14 evictionPolicy = evict_last : tensor<64x8x!tt.ptr, #blocked1> loc(#loc177) + %tmp6_65 = arith.extf %tmp6_64 : tensor<64x8xbf16, #blocked1> to tensor<64x8xf32, #blocked1> loc(#loc178) + %tmp2 = arith.mulf %tmp0_60, %tmp0_60 : tensor<64x8xf32, #blocked1> loc(#loc179) + %tmp5 = arith.addf %arg10, %tmp2 : tensor<64x8xf32, #blocked1> loc(#loc180) + %_tmp4 = arith.select %tmp0_58, %tmp5, %arg10 : tensor<64x8xi1, #blocked1>, tensor<64x8xf32, #blocked1> loc(#loc181) + %tmp8 = arith.mulf %tmp6_65, %tmp6_65 : tensor<64x8xf32, #blocked1> loc(#loc182) + %tmp11 = arith.addf %arg11, %tmp8 : tensor<64x8xf32, #blocked1> loc(#loc183) + %_tmp10_66 = arith.select %tmp0_58, %tmp11, %arg11 : tensor<64x8xi1, #blocked1>, tensor<64x8xf32, #blocked1> loc(#loc184) + scf.yield %_tmp4, %_tmp10_66 : tensor<64x8xf32, #blocked1>, tensor<64x8xf32, #blocked1> loc(#loc31) + } loc(#loc290) + %tmp4 = "tt.reduce"(%_tmp10#0) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_51: f32 loc(callsite(#loc1 at #loc185)), %tmp4_52: f32 loc(callsite(#loc1 at #loc185))): + %tmp4_53 = arith.addf %tmp4_51, %tmp4_52 : f32 loc(#loc297) + tt.reduce.return %tmp4_53 : f32 loc(#loc291) + }) : (tensor<64x8xf32, #blocked1>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc291) + %tmp4_37 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc186) + %tmp10 = "tt.reduce"(%_tmp10#1) <{axis = 1 : i32}> ({ + ^bb0(%tmp10_51: f32 loc(callsite(#loc1 at #loc187)), %tmp10_52: f32 loc(callsite(#loc1 at #loc187))): + %tmp10_53 = arith.addf %tmp10_51, %tmp10_52 : f32 loc(#loc298) + tt.reduce.return %tmp10_53 : f32 loc(#loc293) + }) : (tensor<64x8xf32, #blocked1>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc293) + %tmp10_38 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc188) + %tmp50 = arith.muli %x0_31, %cst_6 : tensor<64x1xi32, #blocked> loc(#loc189) + %tmp50_39 = tt.broadcast %tmp50 : tensor<64x1xi32, #blocked> -> tensor<64x8xi32, #blocked> loc(#loc190) + %tmp50_40 = arith.muli %x1_32, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc191) + %tmp50_41 = tt.broadcast %tmp50_40 : tensor<64x1xi32, #blocked> -> tensor<64x8xi32, #blocked> loc(#loc192) + %tmp50_42 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x8x!tt.ptr, #blocked> loc(#loc193) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x8x!tt.ptr, #blocked> loc(#loc194) + %tmp58_43 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x8x!tt.ptr, #blocked1> loc(#loc194) + %tmp63 = arith.muli %x1, %cst_7 : tensor<64x1xi32, #blocked1> loc(#loc195) + %tmp63_44 = tt.broadcast %tmp63 : tensor<64x1xi32, #blocked1> -> tensor<64x8xi32, #blocked1> loc(#loc196) + %tmp63_45 = tt.splat %in_ptr2 : !tt.ptr -> tensor<64x8x!tt.ptr, #blocked1> loc(#loc197) + %tmp66 = tt.splat %in_ptr3 : !tt.ptr -> tensor<64x8x!tt.ptr, #blocked1> loc(#loc198) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x8x!tt.ptr, #blocked> loc(#loc199) + %tmp102_46 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x8x!tt.ptr, #blocked1> loc(#loc199) + %tmp20 = arith.divf %tmp10_38, %cst_17 : tensor<64x1xf32, #blocked1> loc(#loc200) + %tmp22 = arith.addf %tmp20, %cst_16 : tensor<64x1xf32, #blocked1> loc(#loc201) + %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked1>) -> tensor<64x1xf32, #blocked1> loc(#loc202) + %tmp24 = ttg.convert_layout %tmp23 : tensor<64x1xf32, #blocked1> -> tensor<64x1xf32, #blocked> loc(#loc203) + %tmp24_47 = tt.broadcast %tmp24 : tensor<64x1xf32, #blocked> -> tensor<64x8xf32, #blocked> loc(#loc203) + %tmp24_48 = tt.broadcast %tmp23 : tensor<64x1xf32, #blocked1> -> tensor<64x8xf32, #blocked1> loc(#loc203) + %tmp72 = arith.divf %tmp4_37, %cst_17 : tensor<64x1xf32, #blocked1> loc(#loc204) + %tmp73 = arith.addf %tmp72, %cst_16 : tensor<64x1xf32, #blocked1> loc(#loc205) + %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked1>) -> tensor<64x1xf32, #blocked1> loc(#loc206) + %tmp75 = ttg.convert_layout %tmp74 : tensor<64x1xf32, #blocked1> -> tensor<64x1xf32, #blocked> loc(#loc207) + %tmp75_49 = tt.broadcast %tmp75 : tensor<64x1xf32, #blocked> -> tensor<64x8xf32, #blocked> loc(#loc207) + %tmp75_50 = tt.broadcast %tmp74 : tensor<64x1xf32, #blocked1> -> tensor<64x8xf32, #blocked1> loc(#loc207) + %0 = arith.muli %xindex_26, %cst_7 : tensor<64x1xi32, #blocked1> loc(#loc57) + %1 = tt.broadcast %0 : tensor<64x1xi32, #blocked1> -> tensor<64x8xi32, #blocked1> loc(#loc58) + %2 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<64x8x!tt.ptr, #blocked1> loc(#loc59) + %3 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<64x8x!tt.ptr, #blocked1> loc(#loc60) + scf.for %r0_offset = %c0_i32 to %c128_i32 step %c8_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x8xi32, #blocked1> loc(#loc208) + %r0_index_51 = tt.splat %r0_offset : i32 -> tensor<1x8xi32, #blocked> loc(#loc208) + %r0_index_52 = arith.addi %r0_index, %r0_base_29 : tensor<1x8xi32, #blocked1> loc(#loc208) + %r0_index_53 = arith.addi %r0_index_51, %r0_base_30 : tensor<1x8xi32, #blocked> loc(#loc208) + %r0_mask = arith.cmpi slt, %r0_index_52, %cst_11 : tensor<1x8xi32, #blocked1> loc(#loc209) + %r0_mask_54 = arith.cmpi slt, %r0_index_53, %cst_10 : tensor<1x8xi32, #blocked> loc(#loc209) + %r0_3 = arith.remsi %r0_index_53, %cst_3 : tensor<1x8xi32, #blocked> loc(#loc210) + %r0_4 = arith.divsi %r0_index_53, %cst_3 : tensor<1x8xi32, #blocked> loc(#loc211) + %tmp50_55 = tt.broadcast %r0_index_52 : tensor<1x8xi32, #blocked1> -> tensor<64x8xi32, #blocked1> loc(#loc190) + %tmp50_56 = arith.addi %tmp50_55, %tmp0_33 : tensor<64x8xi32, #blocked1> loc(#loc190) + %tmp50_57 = arith.addi %tmp50_56, %tmp0_35 : tensor<64x8xi32, #blocked1> loc(#loc192) + %tmp50_58 = tt.addptr %tmp0_36, %tmp50_57 : tensor<64x8x!tt.ptr, #blocked1>, tensor<64x8xi32, #blocked1> loc(#loc193) + %tmp50_59 = tt.broadcast %r0_mask : tensor<1x8xi1, #blocked1> -> tensor<64x8xi1, #blocked1> loc(#loc212) + %tmp50_60 = tt.load %tmp50_58, %tmp50_59, %cst_14 evictionPolicy = evict_last : tensor<64x8x!tt.ptr, #blocked1> loc(#loc212) + %tmp50_61 = arith.extf %tmp50_60 : tensor<64x8xbf16, #blocked1> to tensor<64x8xf32, #blocked1> loc(#loc213) + %tmp58_62 = tt.addptr %tmp58_43, %r0_index_52 : tensor<1x8x!tt.ptr, #blocked1>, tensor<1x8xi32, #blocked1> loc(#loc194) + %tmp58_63 = tt.load %tmp58_62, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x8x!tt.ptr, #blocked1> loc(#loc214) + %tmp58_64 = arith.extf %tmp58_63 : tensor<1x8xbf16, #blocked1> to tensor<1x8xf32, #blocked1> loc(#loc215) + %tmp63_65 = arith.addi %tmp50_55, %tmp63_44 : tensor<64x8xi32, #blocked1> loc(#loc196) + %tmp63_66 = tt.addptr %tmp63_45, %tmp63_65 : tensor<64x8x!tt.ptr, #blocked1>, tensor<64x8xi32, #blocked1> loc(#loc197) + %tmp63_67 = tt.load %tmp63_66, %tmp50_59, %cst_19 evictionPolicy = evict_last : tensor<64x8x!tt.ptr, #blocked1> loc(#loc216) + %tmp66_68 = tt.addptr %tmp66, %tmp63_65 : tensor<64x8x!tt.ptr, #blocked1>, tensor<64x8xi32, #blocked1> loc(#loc198) + %tmp66_69 = tt.load %tmp66_68, %tmp50_59, %cst_19 evictionPolicy = evict_last : tensor<64x8x!tt.ptr, #blocked1> loc(#loc217) + %tmp66_70 = ttg.convert_layout %tmp66_69 : tensor<64x8xf32, #blocked1> -> tensor<64x8xf32, #blocked> loc(#loc217) + %tmp96 = arith.addi %r0_index_52, %cst_9 : tensor<1x8xi32, #blocked1> loc(#loc218) + %tmp96_71 = tt.broadcast %tmp96 : tensor<1x8xi32, #blocked1> -> tensor<64x8xi32, #blocked1> loc(#loc219) + %tmp96_72 = arith.addi %tmp96_71, %tmp0_33 : tensor<64x8xi32, #blocked1> loc(#loc219) + %tmp96_73 = arith.addi %tmp96_72, %tmp0_35 : tensor<64x8xi32, #blocked1> loc(#loc220) + %tmp96_74 = tt.addptr %tmp0_36, %tmp96_73 : tensor<64x8x!tt.ptr, #blocked1>, tensor<64x8xi32, #blocked1> loc(#loc221) + %tmp96_75 = tt.load %tmp96_74, %tmp50_59, %cst_14 evictionPolicy = evict_first : tensor<64x8x!tt.ptr, #blocked1> loc(#loc222) + %tmp96_76 = arith.extf %tmp96_75 : tensor<64x8xbf16, #blocked1> to tensor<64x8xf32, #blocked1> loc(#loc223) + %tmp102_77 = tt.addptr %tmp102_46, %r0_index_52 : tensor<1x8x!tt.ptr, #blocked1>, tensor<1x8xi32, #blocked1> loc(#loc199) + %tmp102_78 = tt.load %tmp102_77, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x8x!tt.ptr, #blocked1> loc(#loc224) + %tmp102_79 = arith.extf %tmp102_78 : tensor<1x8xbf16, #blocked1> to tensor<1x8xf32, #blocked1> loc(#loc225) + %tmp16 = arith.extsi %r0_3 : tensor<1x8xi32, #blocked> to tensor<1x8xi64, #blocked> loc(#loc226) + %tmp16_80 = arith.cmpi slt, %tmp16, %cst_2 : tensor<1x8xi64, #blocked> loc(#loc226) + %tmp17 = arith.muli %r0_4, %cst_3 : tensor<1x8xi32, #blocked> loc(#loc227) + %tmp17_81 = arith.addi %tmp17, %cst_1 : tensor<1x8xi32, #blocked> loc(#loc228) + %tmp17_82 = tt.broadcast %tmp17_81 : tensor<1x8xi32, #blocked> -> tensor<64x8xi32, #blocked> loc(#loc229) + %tmp17_83 = arith.addi %tmp17_82, %tmp50_39 : tensor<64x8xi32, #blocked> loc(#loc229) + %tmp17_84 = arith.addi %tmp17_83, %tmp50_41 : tensor<64x8xi32, #blocked> loc(#loc230) + %tmp17_85 = tt.addptr %tmp50_42, %tmp17_84 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi32, #blocked> loc(#loc231) + %tmp17_86 = arith.andi %r0_mask_54, %tmp16_80 : tensor<1x8xi1, #blocked> loc(#loc232) + %tmp17_87 = tt.broadcast %tmp17_86 : tensor<1x8xi1, #blocked> -> tensor<64x8xi1, #blocked> loc(#loc233) + %tmp17_88 = tt.load %tmp17_85, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<64x8x!tt.ptr, #blocked> loc(#loc233) + %tmp17_89 = arith.extf %tmp17_88 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> loc(#loc234) + %tmp24_90 = arith.mulf %tmp17_89, %tmp24_47 : tensor<64x8xf32, #blocked> loc(#loc203) + %tmp25 = tt.addptr %tmp58, %tmp17_81 : tensor<1x8x!tt.ptr, #blocked>, tensor<1x8xi32, #blocked> loc(#loc235) + %tmp25_91 = tt.broadcast %tmp25 : tensor<1x8x!tt.ptr, #blocked> -> tensor<64x8x!tt.ptr, #blocked> loc(#loc235) + %tmp25_92 = tt.load %tmp25_91, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<64x8x!tt.ptr, #blocked> loc(#loc236) + %tmp25_93 = arith.extf %tmp25_92 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> loc(#loc237) + %tmp27 = arith.mulf %tmp24_90, %tmp25_93 : tensor<64x8xf32, #blocked> loc(#loc238) + %tmp29 = arith.subf %cst_18, %tmp27 : tensor<64x8xf32, #blocked> loc(#loc239) + %tmp31 = tt.broadcast %tmp16_80 : tensor<1x8xi1, #blocked> -> tensor<64x8xi1, #blocked> loc(#loc240) + %tmp32 = arith.cmpi sge, %tmp16, %cst_2 : tensor<1x8xi64, #blocked> loc(#loc241) + %tmp35 = tt.broadcast %tmp17 : tensor<1x8xi32, #blocked> -> tensor<64x8xi32, #blocked> loc(#loc242) + %tmp35_94 = arith.addi %tmp35, %tmp50_39 : tensor<64x8xi32, #blocked> loc(#loc242) + %tmp35_95 = arith.addi %tmp35_94, %tmp50_41 : tensor<64x8xi32, #blocked> loc(#loc243) + %tmp35_96 = tt.addptr %tmp50_42, %tmp35_95 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi32, #blocked> loc(#loc244) + %tmp35_97 = arith.andi %r0_mask_54, %tmp32 : tensor<1x8xi1, #blocked> loc(#loc245) + %tmp35_98 = tt.broadcast %tmp35_97 : tensor<1x8xi1, #blocked> -> tensor<64x8xi1, #blocked> loc(#loc246) + %tmp35_99 = tt.load %tmp35_96, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<64x8x!tt.ptr, #blocked> loc(#loc246) + %tmp35_100 = arith.extf %tmp35_99 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> loc(#loc247) + %tmp42 = arith.mulf %tmp35_100, %tmp24_47 : tensor<64x8xf32, #blocked> loc(#loc248) + %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x8x!tt.ptr, #blocked>, tensor<1x8xi32, #blocked> loc(#loc249) + %tmp43_101 = tt.broadcast %tmp43 : tensor<1x8x!tt.ptr, #blocked> -> tensor<64x8x!tt.ptr, #blocked> loc(#loc249) + %tmp43_102 = tt.load %tmp43_101, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<64x8x!tt.ptr, #blocked> loc(#loc250) + %tmp43_103 = arith.extf %tmp43_102 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> loc(#loc251) + %tmp45 = arith.mulf %tmp42, %tmp43_103 : tensor<64x8xf32, #blocked> loc(#loc252) + %tmp48 = tt.broadcast %tmp32 : tensor<1x8xi1, #blocked> -> tensor<64x8xi1, #blocked> loc(#loc253) + %tmp48_104 = arith.select %tmp48, %tmp45, %cst_18 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked> loc(#loc253) + %tmp49 = arith.select %tmp31, %tmp29, %tmp48_104 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked> loc(#loc295) + %tmp57 = arith.mulf %tmp50_61, %tmp24_48 : tensor<64x8xf32, #blocked1> loc(#loc255) + %tmp60 = tt.broadcast %tmp58_64 : tensor<1x8xf32, #blocked1> -> tensor<64x8xf32, #blocked1> loc(#loc256) + %tmp60_105 = arith.mulf %tmp57, %tmp60 : tensor<64x8xf32, #blocked1> loc(#loc256) + %tmp64 = arith.mulf %tmp60_105, %tmp63_67 : tensor<64x8xf32, #blocked1> loc(#loc257) + %tmp64_106 = ttg.convert_layout %tmp64 : tensor<64x8xf32, #blocked1> -> tensor<64x8xf32, #blocked> loc(#loc257) + %tmp67 = arith.mulf %tmp49, %tmp66_70 : tensor<64x8xf32, #blocked> loc(#loc258) + %tmp68 = arith.addf %tmp64_106, %tmp67 : tensor<64x8xf32, #blocked> loc(#loc259) + %tmp70 = arith.addi %tmp17, %cst : tensor<1x8xi32, #blocked> loc(#loc260) + %tmp70_107 = tt.broadcast %tmp70 : tensor<1x8xi32, #blocked> -> tensor<64x8xi32, #blocked> loc(#loc261) + %tmp70_108 = arith.addi %tmp70_107, %tmp50_39 : tensor<64x8xi32, #blocked> loc(#loc261) + %tmp70_109 = arith.addi %tmp70_108, %tmp50_41 : tensor<64x8xi32, #blocked> loc(#loc262) + %tmp70_110 = tt.addptr %tmp50_42, %tmp70_109 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi32, #blocked> loc(#loc263) + %tmp70_111 = tt.load %tmp70_110, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<64x8x!tt.ptr, #blocked> loc(#loc264) + %tmp70_112 = arith.extf %tmp70_111 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> loc(#loc265) + %tmp75_113 = arith.mulf %tmp70_112, %tmp75_49 : tensor<64x8xf32, #blocked> loc(#loc207) + %tmp76 = tt.addptr %tmp102, %tmp17_81 : tensor<1x8x!tt.ptr, #blocked>, tensor<1x8xi32, #blocked> loc(#loc266) + %tmp76_114 = tt.broadcast %tmp76 : tensor<1x8x!tt.ptr, #blocked> -> tensor<64x8x!tt.ptr, #blocked> loc(#loc266) + %tmp76_115 = tt.load %tmp76_114, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<64x8x!tt.ptr, #blocked> loc(#loc267) + %tmp76_116 = arith.extf %tmp76_115 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> loc(#loc268) + %tmp78 = arith.mulf %tmp75_113, %tmp76_116 : tensor<64x8xf32, #blocked> loc(#loc269) + %tmp80 = arith.subf %cst_18, %tmp78 : tensor<64x8xf32, #blocked> loc(#loc270) + %tmp83 = arith.addi %tmp17, %cst_8 : tensor<1x8xi32, #blocked> loc(#loc271) + %tmp83_117 = tt.broadcast %tmp83 : tensor<1x8xi32, #blocked> -> tensor<64x8xi32, #blocked> loc(#loc272) + %tmp83_118 = arith.addi %tmp83_117, %tmp50_39 : tensor<64x8xi32, #blocked> loc(#loc272) + %tmp83_119 = arith.addi %tmp83_118, %tmp50_41 : tensor<64x8xi32, #blocked> loc(#loc273) + %tmp83_120 = tt.addptr %tmp50_42, %tmp83_119 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi32, #blocked> loc(#loc274) + %tmp83_121 = tt.load %tmp83_120, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<64x8x!tt.ptr, #blocked> loc(#loc275) + %tmp83_122 = arith.extf %tmp83_121 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> loc(#loc276) + %tmp88 = arith.mulf %tmp83_122, %tmp75_49 : tensor<64x8xf32, #blocked> loc(#loc277) + %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x8x!tt.ptr, #blocked>, tensor<1x8xi32, #blocked> loc(#loc278) + %tmp89_123 = tt.broadcast %tmp89 : tensor<1x8x!tt.ptr, #blocked> -> tensor<64x8x!tt.ptr, #blocked> loc(#loc278) + %tmp89_124 = tt.load %tmp89_123, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<64x8x!tt.ptr, #blocked> loc(#loc279) + %tmp89_125 = arith.extf %tmp89_124 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> loc(#loc280) + %tmp91 = arith.mulf %tmp88, %tmp89_125 : tensor<64x8xf32, #blocked> loc(#loc281) + %tmp94 = arith.select %tmp48, %tmp91, %cst_18 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked> loc(#loc282) + %tmp95 = arith.select %tmp31, %tmp80, %tmp94 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked> loc(#loc296) + %tmp101 = arith.mulf %tmp96_76, %tmp75_50 : tensor<64x8xf32, #blocked1> loc(#loc285) + %tmp104 = tt.broadcast %tmp102_79 : tensor<1x8xf32, #blocked1> -> tensor<64x8xf32, #blocked1> loc(#loc286) + %tmp104_126 = arith.mulf %tmp101, %tmp104 : tensor<64x8xf32, #blocked1> loc(#loc286) + %tmp107 = arith.mulf %tmp104_126, %tmp63_67 : tensor<64x8xf32, #blocked1> loc(#loc287) + %tmp107_127 = ttg.convert_layout %tmp107 : tensor<64x8xf32, #blocked1> -> tensor<64x8xf32, #blocked> loc(#loc287) + %tmp109 = arith.mulf %tmp95, %tmp66_70 : tensor<64x8xf32, #blocked> loc(#loc288) + %tmp110 = arith.addf %tmp107_127, %tmp109 : tensor<64x8xf32, #blocked> loc(#loc289) + %4 = arith.addi %tmp50_55, %1 : tensor<64x8xi32, #blocked1> loc(#loc58) + %5 = tt.addptr %2, %4 : tensor<64x8x!tt.ptr, #blocked1>, tensor<64x8xi32, #blocked1> loc(#loc59) + %6 = arith.truncf %tmp68 : tensor<64x8xf32, #blocked> to tensor<64x8xbf16, #blocked> loc(#loc144) + %7 = ttg.convert_layout %6 : tensor<64x8xbf16, #blocked> -> tensor<64x8xbf16, #blocked1> loc(#loc144) + tt.store %5, %7, %tmp50_59 : tensor<64x8x!tt.ptr, #blocked1> loc(#loc144) + %8 = tt.addptr %3, %4 : tensor<64x8x!tt.ptr, #blocked1>, tensor<64x8xi32, #blocked1> loc(#loc60) + %9 = arith.truncf %tmp110 : tensor<64x8xf32, #blocked> to tensor<64x8xbf16, #blocked> loc(#loc145) + %10 = ttg.convert_layout %9 : tensor<64x8xbf16, #blocked> -> tensor<64x8xbf16, #blocked1> loc(#loc145) + tt.store %8, %10, %tmp50_59 : tensor<64x8x!tt.ptr, #blocked1> loc(#loc145) + } loc(#loc61) + tt.return loc(#loc146) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8) +#loc32 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc156 = loc("xoffset"(#loc2)) +#loc157 = loc("xoffset"(#loc3)) +#loc158 = loc("xindex"(#loc4)) +#loc159 = loc("xindex"(#loc5)) +#loc160 = loc("r0_base"(#loc6)) +#loc161 = loc("x0"(#loc7)) +#loc162 = loc("x1"(#loc8)) +#loc163 = loc("tmp0"(#loc9)) +#loc164 = loc("tmp0"(#loc10)) +#loc165 = loc("tmp0"(#loc11)) +#loc166 = loc("tmp0"(#loc12)) +#loc167 = loc("tmp0"(#loc13)) +#loc168 = loc("_tmp4"(#loc14)) +#loc169 = loc("r0_index"(#loc15)) +#loc170 = loc("r0_mask"(#loc16)) +#loc171 = loc("tmp0"(#loc17)) +#loc172 = loc("tmp0"(#loc18)) +#loc173 = loc("tmp0"(#loc19)) +#loc174 = loc("tmp6"(#loc20)) +#loc175 = loc("tmp6"(#loc21)) +#loc176 = loc("tmp6"(#loc22)) +#loc177 = loc("tmp6"(#loc23)) +#loc178 = loc("tmp6"(#loc24)) +#loc179 = loc("tmp2"(#loc25)) +#loc180 = loc("tmp5"(#loc26)) +#loc181 = loc("_tmp4"(#loc27)) +#loc182 = loc("tmp8"(#loc28)) +#loc183 = loc("tmp11"(#loc29)) +#loc184 = loc("_tmp10"(#loc30)) +#loc186 = loc("tmp4"(#loc35)) +#loc188 = loc("tmp10"(#loc37)) +#loc189 = loc("tmp50"(#loc38)) +#loc190 = loc("tmp50"(#loc39)) +#loc191 = loc("tmp50"(#loc40)) +#loc192 = loc("tmp50"(#loc41)) +#loc193 = loc("tmp50"(#loc42)) +#loc194 = loc("tmp58"(#loc43)) +#loc195 = loc("tmp63"(#loc44)) +#loc196 = loc("tmp63"(#loc45)) +#loc197 = loc("tmp63"(#loc46)) +#loc198 = loc("tmp66"(#loc47)) +#loc199 = loc("tmp102"(#loc48)) +#loc200 = loc("tmp20"(#loc49)) +#loc201 = loc("tmp22"(#loc50)) +#loc202 = loc("tmp23"(#loc51)) +#loc203 = loc("tmp24"(#loc52)) +#loc204 = loc("tmp72"(#loc53)) +#loc205 = loc("tmp73"(#loc54)) +#loc206 = loc("tmp74"(#loc55)) +#loc207 = loc("tmp75"(#loc56)) +#loc208 = loc("r0_index"(#loc62)) +#loc209 = loc("r0_mask"(#loc63)) +#loc210 = loc("r0_3"(#loc64)) +#loc211 = loc("r0_4"(#loc65)) +#loc212 = loc("tmp50"(#loc66)) +#loc213 = loc("tmp50"(#loc67)) +#loc214 = loc("tmp58"(#loc68)) +#loc215 = loc("tmp58"(#loc69)) +#loc216 = loc("tmp63"(#loc70)) +#loc217 = loc("tmp66"(#loc71)) +#loc218 = loc("tmp96"(#loc72)) +#loc219 = loc("tmp96"(#loc73)) +#loc220 = loc("tmp96"(#loc74)) +#loc221 = loc("tmp96"(#loc75)) +#loc222 = loc("tmp96"(#loc76)) +#loc223 = loc("tmp96"(#loc77)) +#loc224 = loc("tmp102"(#loc78)) +#loc225 = loc("tmp102"(#loc79)) +#loc226 = loc("tmp16"(#loc80)) +#loc227 = loc("tmp17"(#loc81)) +#loc228 = loc("tmp17"(#loc82)) +#loc229 = loc("tmp17"(#loc83)) +#loc230 = loc("tmp17"(#loc84)) +#loc231 = loc("tmp17"(#loc85)) +#loc232 = loc("tmp17"(#loc86)) +#loc233 = loc("tmp17"(#loc87)) +#loc234 = loc("tmp17"(#loc88)) +#loc235 = loc("tmp25"(#loc89)) +#loc236 = loc("tmp25"(#loc90)) +#loc237 = loc("tmp25"(#loc91)) +#loc238 = loc("tmp27"(#loc92)) +#loc239 = loc("tmp29"(#loc93)) +#loc240 = loc("tmp31"(#loc94)) +#loc241 = loc("tmp32"(#loc95)) +#loc242 = loc("tmp35"(#loc96)) +#loc243 = loc("tmp35"(#loc97)) +#loc244 = loc("tmp35"(#loc98)) +#loc245 = loc("tmp35"(#loc99)) +#loc246 = loc("tmp35"(#loc100)) +#loc247 = loc("tmp35"(#loc101)) +#loc248 = loc("tmp42"(#loc102)) +#loc249 = loc("tmp43"(#loc103)) +#loc250 = loc("tmp43"(#loc104)) +#loc251 = loc("tmp43"(#loc105)) +#loc252 = loc("tmp45"(#loc106)) +#loc253 = loc("tmp48"(#loc107)) +#loc254 = loc("tmp49"(#loc108)) +#loc255 = loc("tmp57"(#loc109)) +#loc256 = loc("tmp60"(#loc110)) +#loc257 = loc("tmp64"(#loc111)) +#loc258 = loc("tmp67"(#loc112)) +#loc259 = loc("tmp68"(#loc113)) +#loc260 = loc("tmp70"(#loc114)) +#loc261 = loc("tmp70"(#loc115)) +#loc262 = loc("tmp70"(#loc116)) +#loc263 = loc("tmp70"(#loc117)) +#loc264 = loc("tmp70"(#loc118)) +#loc265 = loc("tmp70"(#loc119)) +#loc266 = loc("tmp76"(#loc120)) +#loc267 = loc("tmp76"(#loc121)) +#loc268 = loc("tmp76"(#loc122)) +#loc269 = loc("tmp78"(#loc123)) +#loc270 = loc("tmp80"(#loc124)) +#loc271 = loc("tmp83"(#loc125)) +#loc272 = loc("tmp83"(#loc126)) +#loc273 = loc("tmp83"(#loc127)) +#loc274 = loc("tmp83"(#loc128)) +#loc275 = loc("tmp83"(#loc129)) +#loc276 = loc("tmp83"(#loc130)) +#loc277 = loc("tmp88"(#loc131)) +#loc278 = loc("tmp89"(#loc132)) +#loc279 = loc("tmp89"(#loc133)) +#loc280 = loc("tmp89"(#loc134)) +#loc281 = loc("tmp91"(#loc135)) +#loc282 = loc("tmp94"(#loc136)) +#loc283 = loc("tmp95"(#loc137)) +#loc284 = loc("tmp82"(#loc138)) +#loc285 = loc("tmp101"(#loc139)) +#loc286 = loc("tmp104"(#loc140)) +#loc287 = loc("tmp107"(#loc141)) +#loc288 = loc("tmp109"(#loc142)) +#loc289 = loc("tmp110"(#loc143)) +#loc290 = loc("_tmp10"(#loc168)) +#loc291 = loc(callsite(#loc32 at #loc185)) +#loc293 = loc(callsite(#loc32 at #loc187)) +#loc295 = loc(fused[#loc254, #loc240]) +#loc296 = loc(fused[#loc283, #loc284]) +#loc297 = loc(callsite(#loc34 at #loc291)) +#loc298 = loc(callsite(#loc34 at #loc293)) diff --git a/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..0bb7816a03bb8a7f703b5555aa6e57ba373b5ba6 --- /dev/null +++ b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir @@ -0,0 +1,520 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc1 = loc(unknown) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc149 = loc("in_out_ptr0"(#loc)) +#loc150 = loc("in_out_ptr1"(#loc)) +#loc151 = loc("in_ptr0"(#loc)) +#loc152 = loc("in_ptr1"(#loc)) +#loc153 = loc("in_ptr2"(#loc)) +#loc154 = loc("in_ptr3"(#loc)) +#loc155 = loc("in_ptr4"(#loc)) +#loc156 = loc("xnumel"(#loc)) +#loc157 = loc("r0_numel"(#loc)) +#loc189 = loc("tmp4"(#loc35)) +#loc191 = loc("tmp10"(#loc38)) +#loc296 = loc(callsite(#loc1 at #loc189)) +#loc298 = loc(callsite(#loc1 at #loc191)) +module { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<1x8xbf16> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_1 = arith.constant dense<4097> : tensor<1x8xi32> loc(#loc1) + %cst_2 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc1) + %cst_3 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<1x8xi32> loc(#loc1) + %cst_5 = arith.constant dense<1> : tensor<1x8xi64> loc(#loc1) + %cst_6 = arith.constant dense<2> : tensor<1x8xi32> loc(#loc1) + %cst_7 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc1) + %cst_8 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1) + %cst_9 = arith.constant dense<4096> : tensor<1x8xi32> loc(#loc1) + %cst_10 = arith.constant dense<128> : tensor<1x8xi32> loc(#loc1) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc1) + %cst_12 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc158) + %xoffset_13 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc159) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc160) + %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc161) + %xindex_15 = tt.splat %xoffset_13 : i32 -> tensor<64x1xi32> loc(#loc162) + %xindex_16 = arith.addi %xindex_15, %xindex_14 : tensor<64x1xi32> loc(#loc162) + %r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc163) + %r0_base_17 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32> -> tensor<1x8xi32> loc(#loc164) + %x0 = arith.remsi %xindex_16, %cst_12 : tensor<64x1xi32> loc(#loc165) + %x1 = arith.divsi %xindex_16, %cst_12 : tensor<64x1xi32> loc(#loc166) + %_tmp10:2 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%_tmp4 = %cst_11, %_tmp10_20 = %cst_11) -> (tensor<64x8xf32>, tensor<64x8xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x8xi32> loc(#loc168) + %r0_index_21 = arith.addi %r0_index, %r0_base_17 : tensor<1x8xi32> loc(#loc168) + %r0_mask = arith.cmpi slt, %r0_index_21, %cst_10 : tensor<1x8xi32> loc(#loc169) + %tmp0 = arith.addi %r0_index_21, %cst_9 : tensor<1x8xi32> loc(#loc170) + %tmp0_22 = arith.muli %x0, %cst_8 : tensor<64x1xi32> loc(#loc171) + %tmp0_23 = tt.broadcast %tmp0 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc172) + %tmp0_24 = tt.broadcast %tmp0_22 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc172) + %tmp0_25 = arith.addi %tmp0_23, %tmp0_24 : tensor<64x8xi32> loc(#loc172) + %tmp0_26 = arith.muli %x1, %cst_7 : tensor<64x1xi32> loc(#loc173) + %tmp0_27 = tt.broadcast %tmp0_26 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc174) + %tmp0_28 = arith.addi %tmp0_25, %tmp0_27 : tensor<64x8xi32> loc(#loc174) + %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc175) + %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc175) + %tmp0_31 = tt.broadcast %r0_mask : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc176) + %tmp0_32 = tt.load %tmp0_30, %tmp0_31, %cst_0 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc176) + %tmp0_33 = arith.extf %tmp0_32 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc177) + %tmp6 = tt.broadcast %r0_index_21 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc178) + %tmp6_34 = arith.addi %tmp6, %tmp0_24 : tensor<64x8xi32> loc(#loc178) + %tmp6_35 = arith.addi %tmp6_34, %tmp0_27 : tensor<64x8xi32> loc(#loc179) + %tmp6_36 = tt.addptr %tmp0_29, %tmp6_35 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc180) + %tmp6_37 = tt.load %tmp6_36, %tmp0_31, %cst_0 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc181) + %tmp6_38 = arith.extf %tmp6_37 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc182) + %tmp2 = arith.mulf %tmp0_33, %tmp0_33 : tensor<64x8xf32> loc(#loc183) + %tmp5 = arith.addf %_tmp4, %tmp2 : tensor<64x8xf32> loc(#loc184) + %_tmp4_39 = arith.select %tmp0_31, %tmp5, %_tmp4 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc185) + %tmp8 = arith.mulf %tmp6_38, %tmp6_38 : tensor<64x8xf32> loc(#loc186) + %tmp11 = arith.addf %_tmp10_20, %tmp8 : tensor<64x8xf32> loc(#loc187) + %_tmp10_40 = arith.select %tmp0_31, %tmp11, %_tmp10_20 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc188) + scf.yield %_tmp4_39, %_tmp10_40 : tensor<64x8xf32>, tensor<64x8xf32> loc(#loc33) + } loc(#loc294) + %tmp4 = "tt.reduce"(%_tmp10#0) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_20: f32 loc(callsite(#loc1 at #loc189)), %tmp4_21: f32 loc(callsite(#loc1 at #loc189))): + %tmp4_22 = arith.addf %tmp4_20, %tmp4_21 : f32 loc(#loc299) + tt.reduce.return %tmp4_22 : f32 loc(#loc295) + }) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc295) + %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc190) + %tmp10 = "tt.reduce"(%_tmp10#1) <{axis = 1 : i32}> ({ + ^bb0(%tmp10_20: f32 loc(callsite(#loc1 at #loc191)), %tmp10_21: f32 loc(callsite(#loc1 at #loc191))): + %tmp10_22 = arith.addf %tmp10_20, %tmp10_21 : f32 loc(#loc300) + tt.reduce.return %tmp10_22 : f32 loc(#loc297) + }) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc297) + %tmp10_19 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc192) + scf.for %r0_offset = %c0_i32 to %c128_i32 step %c8_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x8xi32> loc(#loc193) + %r0_index_20 = arith.addi %r0_index, %r0_base_17 : tensor<1x8xi32> loc(#loc193) + %r0_mask = arith.cmpi slt, %r0_index_20, %cst_10 : tensor<1x8xi32> loc(#loc194) + %r0_3 = arith.remsi %r0_index_20, %cst_6 : tensor<1x8xi32> loc(#loc195) + %r0_4 = arith.divsi %r0_index_20, %cst_6 : tensor<1x8xi32> loc(#loc196) + %tmp50 = arith.muli %x0, %cst_8 : tensor<64x1xi32> loc(#loc197) + %tmp50_21 = tt.broadcast %r0_index_20 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc198) + %tmp50_22 = tt.broadcast %tmp50 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc198) + %tmp50_23 = arith.addi %tmp50_21, %tmp50_22 : tensor<64x8xi32> loc(#loc198) + %tmp50_24 = arith.muli %x1, %cst_7 : tensor<64x1xi32> loc(#loc199) + %tmp50_25 = tt.broadcast %tmp50_24 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc200) + %tmp50_26 = arith.addi %tmp50_23, %tmp50_25 : tensor<64x8xi32> loc(#loc200) + %tmp50_27 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc201) + %tmp50_28 = tt.addptr %tmp50_27, %tmp50_26 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc201) + %tmp50_29 = tt.broadcast %r0_mask : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc202) + %tmp50_30 = tt.load %tmp50_28, %tmp50_29, %cst_0 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc202) + %tmp50_31 = arith.extf %tmp50_30 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc203) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x8x!tt.ptr> loc(#loc204) + %tmp58_32 = tt.addptr %tmp58, %r0_index_20 : tensor<1x8x!tt.ptr>, tensor<1x8xi32> loc(#loc204) + %tmp58_33 = tt.load %tmp58_32, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x8x!tt.ptr> loc(#loc205) + %tmp58_34 = arith.extf %tmp58_33 : tensor<1x8xbf16> to tensor<1x8xf32> loc(#loc206) + %tmp63 = arith.muli %x1, %cst_8 : tensor<64x1xi32> loc(#loc207) + %tmp63_35 = tt.broadcast %tmp63 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc208) + %tmp63_36 = arith.addi %tmp50_21, %tmp63_35 : tensor<64x8xi32> loc(#loc208) + %tmp63_37 = tt.splat %in_ptr2 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc209) + %tmp63_38 = tt.addptr %tmp63_37, %tmp63_36 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc209) + %tmp63_39 = tt.load %tmp63_38, %tmp50_29, %cst_11 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc210) + %tmp66 = tt.splat %in_ptr3 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc211) + %tmp66_40 = tt.addptr %tmp66, %tmp63_36 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc211) + %tmp66_41 = tt.load %tmp66_40, %tmp50_29, %cst_11 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc212) + %tmp96 = arith.addi %r0_index_20, %cst_9 : tensor<1x8xi32> loc(#loc213) + %tmp96_42 = tt.broadcast %tmp96 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc214) + %tmp96_43 = arith.addi %tmp96_42, %tmp50_22 : tensor<64x8xi32> loc(#loc214) + %tmp96_44 = arith.addi %tmp96_43, %tmp50_25 : tensor<64x8xi32> loc(#loc215) + %tmp96_45 = tt.addptr %tmp50_27, %tmp96_44 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc216) + %tmp96_46 = tt.load %tmp96_45, %tmp50_29, %cst_0 evictionPolicy = evict_first : tensor<64x8x!tt.ptr> loc(#loc217) + %tmp96_47 = arith.extf %tmp96_46 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc218) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x8x!tt.ptr> loc(#loc219) + %tmp102_48 = tt.addptr %tmp102, %r0_index_20 : tensor<1x8x!tt.ptr>, tensor<1x8xi32> loc(#loc219) + %tmp102_49 = tt.load %tmp102_48, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x8x!tt.ptr> loc(#loc220) + %tmp102_50 = arith.extf %tmp102_49 : tensor<1x8xbf16> to tensor<1x8xf32> loc(#loc221) + %tmp16 = arith.extsi %r0_3 : tensor<1x8xi32> to tensor<1x8xi64> loc(#loc222) + %tmp16_51 = arith.cmpi slt, %tmp16, %cst_5 : tensor<1x8xi64> loc(#loc222) + %tmp17 = arith.muli %r0_4, %cst_6 : tensor<1x8xi32> loc(#loc223) + %tmp17_52 = arith.addi %tmp17, %cst_4 : tensor<1x8xi32> loc(#loc224) + %tmp17_53 = tt.broadcast %tmp17_52 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc225) + %tmp17_54 = arith.addi %tmp17_53, %tmp50_22 : tensor<64x8xi32> loc(#loc225) + %tmp17_55 = arith.addi %tmp17_54, %tmp50_25 : tensor<64x8xi32> loc(#loc226) + %tmp17_56 = tt.addptr %tmp50_27, %tmp17_55 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc227) + %tmp17_57 = arith.andi %r0_mask, %tmp16_51 : tensor<1x8xi1> loc(#loc228) + %tmp17_58 = tt.broadcast %tmp17_57 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc229) + %tmp17_59 = tt.load %tmp17_56, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc229) + %tmp17_60 = arith.extf %tmp17_59 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc230) + %tmp20 = arith.divf %tmp10_19, %cst_3 : tensor<64x1xf32> loc(#loc231) + %tmp22 = arith.addf %tmp20, %cst_2 : tensor<64x1xf32> loc(#loc232) + %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc233) + %tmp24 = tt.broadcast %tmp23 : tensor<64x1xf32> -> tensor<64x8xf32> loc(#loc234) + %tmp24_61 = arith.mulf %tmp17_60, %tmp24 : tensor<64x8xf32> loc(#loc234) + %tmp25 = tt.addptr %tmp58, %tmp17_52 : tensor<1x8x!tt.ptr>, tensor<1x8xi32> loc(#loc235) + %tmp25_62 = tt.broadcast %tmp25 : tensor<1x8x!tt.ptr> -> tensor<64x8x!tt.ptr> loc(#loc235) + %tmp25_63 = tt.load %tmp25_62, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc236) + %tmp25_64 = arith.extf %tmp25_63 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc237) + %tmp27 = arith.mulf %tmp24_61, %tmp25_64 : tensor<64x8xf32> loc(#loc238) + %tmp29 = arith.subf %cst_11, %tmp27 : tensor<64x8xf32> loc(#loc239) + %tmp31 = tt.broadcast %tmp16_51 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc240) + %tmp31_65 = arith.select %tmp31, %tmp29, %cst_11 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc240) + %tmp32 = arith.cmpi sge, %tmp16, %cst_5 : tensor<1x8xi64> loc(#loc241) + %tmp35 = tt.broadcast %tmp17 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc242) + %tmp35_66 = arith.addi %tmp35, %tmp50_22 : tensor<64x8xi32> loc(#loc242) + %tmp35_67 = arith.addi %tmp35_66, %tmp50_25 : tensor<64x8xi32> loc(#loc243) + %tmp35_68 = tt.addptr %tmp50_27, %tmp35_67 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc244) + %tmp35_69 = arith.andi %r0_mask, %tmp32 : tensor<1x8xi1> loc(#loc245) + %tmp35_70 = tt.broadcast %tmp35_69 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc246) + %tmp35_71 = tt.load %tmp35_68, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc246) + %tmp35_72 = arith.extf %tmp35_71 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc247) + %tmp42 = arith.mulf %tmp35_72, %tmp24 : tensor<64x8xf32> loc(#loc248) + %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x8x!tt.ptr>, tensor<1x8xi32> loc(#loc249) + %tmp43_73 = tt.broadcast %tmp43 : tensor<1x8x!tt.ptr> -> tensor<64x8x!tt.ptr> loc(#loc249) + %tmp43_74 = tt.load %tmp43_73, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc250) + %tmp43_75 = arith.extf %tmp43_74 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc251) + %tmp45 = arith.mulf %tmp42, %tmp43_75 : tensor<64x8xf32> loc(#loc252) + %tmp48 = tt.broadcast %tmp32 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc253) + %tmp48_76 = arith.select %tmp48, %tmp45, %cst_11 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc253) + %tmp49 = arith.select %tmp31, %tmp31_65, %tmp48_76 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc254) + %tmp57 = arith.mulf %tmp50_31, %tmp24 : tensor<64x8xf32> loc(#loc255) + %tmp60 = tt.broadcast %tmp58_34 : tensor<1x8xf32> -> tensor<64x8xf32> loc(#loc256) + %tmp60_77 = arith.mulf %tmp57, %tmp60 : tensor<64x8xf32> loc(#loc256) + %tmp64 = arith.mulf %tmp60_77, %tmp63_39 : tensor<64x8xf32> loc(#loc257) + %tmp67 = arith.mulf %tmp49, %tmp66_41 : tensor<64x8xf32> loc(#loc258) + %tmp68 = arith.addf %tmp64, %tmp67 : tensor<64x8xf32> loc(#loc259) + %tmp70 = arith.addi %tmp17, %cst_1 : tensor<1x8xi32> loc(#loc260) + %tmp70_78 = tt.broadcast %tmp70 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc261) + %tmp70_79 = arith.addi %tmp70_78, %tmp50_22 : tensor<64x8xi32> loc(#loc261) + %tmp70_80 = arith.addi %tmp70_79, %tmp50_25 : tensor<64x8xi32> loc(#loc262) + %tmp70_81 = tt.addptr %tmp50_27, %tmp70_80 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc263) + %tmp70_82 = tt.load %tmp70_81, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc264) + %tmp70_83 = arith.extf %tmp70_82 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc265) + %tmp72 = arith.divf %tmp4_18, %cst_3 : tensor<64x1xf32> loc(#loc266) + %tmp73 = arith.addf %tmp72, %cst_2 : tensor<64x1xf32> loc(#loc267) + %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc268) + %tmp75 = tt.broadcast %tmp74 : tensor<64x1xf32> -> tensor<64x8xf32> loc(#loc269) + %tmp75_84 = arith.mulf %tmp70_83, %tmp75 : tensor<64x8xf32> loc(#loc269) + %tmp76 = tt.addptr %tmp102, %tmp17_52 : tensor<1x8x!tt.ptr>, tensor<1x8xi32> loc(#loc270) + %tmp76_85 = tt.broadcast %tmp76 : tensor<1x8x!tt.ptr> -> tensor<64x8x!tt.ptr> loc(#loc270) + %tmp76_86 = tt.load %tmp76_85, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc271) + %tmp76_87 = arith.extf %tmp76_86 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc272) + %tmp78 = arith.mulf %tmp75_84, %tmp76_87 : tensor<64x8xf32> loc(#loc273) + %tmp80 = arith.subf %cst_11, %tmp78 : tensor<64x8xf32> loc(#loc274) + %tmp82 = arith.select %tmp31, %tmp80, %cst_11 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc275) + %tmp83 = arith.addi %tmp17, %cst_9 : tensor<1x8xi32> loc(#loc276) + %tmp83_88 = tt.broadcast %tmp83 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc277) + %tmp83_89 = arith.addi %tmp83_88, %tmp50_22 : tensor<64x8xi32> loc(#loc277) + %tmp83_90 = arith.addi %tmp83_89, %tmp50_25 : tensor<64x8xi32> loc(#loc278) + %tmp83_91 = tt.addptr %tmp50_27, %tmp83_90 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc279) + %tmp83_92 = tt.load %tmp83_91, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc280) + %tmp83_93 = arith.extf %tmp83_92 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc281) + %tmp88 = arith.mulf %tmp83_93, %tmp75 : tensor<64x8xf32> loc(#loc282) + %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x8x!tt.ptr>, tensor<1x8xi32> loc(#loc283) + %tmp89_94 = tt.broadcast %tmp89 : tensor<1x8x!tt.ptr> -> tensor<64x8x!tt.ptr> loc(#loc283) + %tmp89_95 = tt.load %tmp89_94, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc284) + %tmp89_96 = arith.extf %tmp89_95 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc285) + %tmp91 = arith.mulf %tmp88, %tmp89_96 : tensor<64x8xf32> loc(#loc286) + %tmp94 = arith.select %tmp48, %tmp91, %cst_11 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc287) + %tmp95 = arith.select %tmp31, %tmp82, %tmp94 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc288) + %tmp101 = arith.mulf %tmp96_47, %tmp75 : tensor<64x8xf32> loc(#loc289) + %tmp104 = tt.broadcast %tmp102_50 : tensor<1x8xf32> -> tensor<64x8xf32> loc(#loc290) + %tmp104_97 = arith.mulf %tmp101, %tmp104 : tensor<64x8xf32> loc(#loc290) + %tmp107 = arith.mulf %tmp104_97, %tmp63_39 : tensor<64x8xf32> loc(#loc291) + %tmp109 = arith.mulf %tmp95, %tmp66_41 : tensor<64x8xf32> loc(#loc292) + %tmp110 = arith.addf %tmp107, %tmp109 : tensor<64x8xf32> loc(#loc293) + %0 = arith.muli %xindex_16, %cst_8 : tensor<64x1xi32> loc(#loc142) + %1 = tt.broadcast %0 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc143) + %2 = arith.addi %tmp50_21, %1 : tensor<64x8xi32> loc(#loc143) + %3 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc144) + %4 = tt.addptr %3, %2 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc144) + %5 = arith.truncf %tmp68 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc145) + tt.store %4, %5, %tmp50_29 : tensor<64x8x!tt.ptr> loc(#loc145) + %6 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc146) + %7 = tt.addptr %6, %2 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> loc(#loc146) + %8 = arith.truncf %tmp110 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc147) + tt.store %7, %8, %tmp50_29 : tensor<64x8x!tt.ptr> loc(#loc147) + } loc(#loc40) + tt.return loc(#loc148) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc158 = loc("xoffset"(#loc2)) +#loc159 = loc("xoffset"(#loc3)) +#loc160 = loc("xindex"(#loc4)) +#loc161 = loc("xindex"(#loc5)) +#loc162 = loc("xindex"(#loc6)) +#loc163 = loc("r0_base"(#loc7)) +#loc164 = loc("r0_base"(#loc8)) +#loc165 = loc("x0"(#loc9)) +#loc166 = loc("x1"(#loc10)) +#loc167 = loc("_tmp4"(#loc11)) +#loc168 = loc("r0_index"(#loc12)) +#loc169 = loc("r0_mask"(#loc13)) +#loc170 = loc("tmp0"(#loc14)) +#loc171 = loc("tmp0"(#loc15)) +#loc172 = loc("tmp0"(#loc16)) +#loc173 = loc("tmp0"(#loc17)) +#loc174 = loc("tmp0"(#loc18)) +#loc175 = loc("tmp0"(#loc19)) +#loc176 = loc("tmp0"(#loc20)) +#loc177 = loc("tmp0"(#loc21)) +#loc178 = loc("tmp6"(#loc22)) +#loc179 = loc("tmp6"(#loc23)) +#loc180 = loc("tmp6"(#loc24)) +#loc181 = loc("tmp6"(#loc25)) +#loc182 = loc("tmp6"(#loc26)) +#loc183 = loc("tmp2"(#loc27)) +#loc184 = loc("tmp5"(#loc28)) +#loc185 = loc("_tmp4"(#loc29)) +#loc186 = loc("tmp8"(#loc30)) +#loc187 = loc("tmp11"(#loc31)) +#loc188 = loc("_tmp10"(#loc32)) +#loc190 = loc("tmp4"(#loc37)) +#loc192 = loc("tmp10"(#loc39)) +#loc193 = loc("r0_index"(#loc41)) +#loc194 = loc("r0_mask"(#loc42)) +#loc195 = loc("r0_3"(#loc43)) +#loc196 = loc("r0_4"(#loc44)) +#loc197 = loc("tmp50"(#loc45)) +#loc198 = loc("tmp50"(#loc46)) +#loc199 = loc("tmp50"(#loc47)) +#loc200 = loc("tmp50"(#loc48)) +#loc201 = loc("tmp50"(#loc49)) +#loc202 = loc("tmp50"(#loc50)) +#loc203 = loc("tmp50"(#loc51)) +#loc204 = loc("tmp58"(#loc52)) +#loc205 = loc("tmp58"(#loc53)) +#loc206 = loc("tmp58"(#loc54)) +#loc207 = loc("tmp63"(#loc55)) +#loc208 = loc("tmp63"(#loc56)) +#loc209 = loc("tmp63"(#loc57)) +#loc210 = loc("tmp63"(#loc58)) +#loc211 = loc("tmp66"(#loc59)) +#loc212 = loc("tmp66"(#loc60)) +#loc213 = loc("tmp96"(#loc61)) +#loc214 = loc("tmp96"(#loc62)) +#loc215 = loc("tmp96"(#loc63)) +#loc216 = loc("tmp96"(#loc64)) +#loc217 = loc("tmp96"(#loc65)) +#loc218 = loc("tmp96"(#loc66)) +#loc219 = loc("tmp102"(#loc67)) +#loc220 = loc("tmp102"(#loc68)) +#loc221 = loc("tmp102"(#loc69)) +#loc222 = loc("tmp16"(#loc70)) +#loc223 = loc("tmp17"(#loc71)) +#loc224 = loc("tmp17"(#loc72)) +#loc225 = loc("tmp17"(#loc73)) +#loc226 = loc("tmp17"(#loc74)) +#loc227 = loc("tmp17"(#loc75)) +#loc228 = loc("tmp17"(#loc76)) +#loc229 = loc("tmp17"(#loc77)) +#loc230 = loc("tmp17"(#loc78)) +#loc231 = loc("tmp20"(#loc79)) +#loc232 = loc("tmp22"(#loc80)) +#loc233 = loc("tmp23"(#loc81)) +#loc234 = loc("tmp24"(#loc82)) +#loc235 = loc("tmp25"(#loc83)) +#loc236 = loc("tmp25"(#loc84)) +#loc237 = loc("tmp25"(#loc85)) +#loc238 = loc("tmp27"(#loc86)) +#loc239 = loc("tmp29"(#loc87)) +#loc240 = loc("tmp31"(#loc88)) +#loc241 = loc("tmp32"(#loc89)) +#loc242 = loc("tmp35"(#loc90)) +#loc243 = loc("tmp35"(#loc91)) +#loc244 = loc("tmp35"(#loc92)) +#loc245 = loc("tmp35"(#loc93)) +#loc246 = loc("tmp35"(#loc94)) +#loc247 = loc("tmp35"(#loc95)) +#loc248 = loc("tmp42"(#loc96)) +#loc249 = loc("tmp43"(#loc97)) +#loc250 = loc("tmp43"(#loc98)) +#loc251 = loc("tmp43"(#loc99)) +#loc252 = loc("tmp45"(#loc100)) +#loc253 = loc("tmp48"(#loc101)) +#loc254 = loc("tmp49"(#loc102)) +#loc255 = loc("tmp57"(#loc103)) +#loc256 = loc("tmp60"(#loc104)) +#loc257 = loc("tmp64"(#loc105)) +#loc258 = loc("tmp67"(#loc106)) +#loc259 = loc("tmp68"(#loc107)) +#loc260 = loc("tmp70"(#loc108)) +#loc261 = loc("tmp70"(#loc109)) +#loc262 = loc("tmp70"(#loc110)) +#loc263 = loc("tmp70"(#loc111)) +#loc264 = loc("tmp70"(#loc112)) +#loc265 = loc("tmp70"(#loc113)) +#loc266 = loc("tmp72"(#loc114)) +#loc267 = loc("tmp73"(#loc115)) +#loc268 = loc("tmp74"(#loc116)) +#loc269 = loc("tmp75"(#loc117)) +#loc270 = loc("tmp76"(#loc118)) +#loc271 = loc("tmp76"(#loc119)) +#loc272 = loc("tmp76"(#loc120)) +#loc273 = loc("tmp78"(#loc121)) +#loc274 = loc("tmp80"(#loc122)) +#loc275 = loc("tmp82"(#loc123)) +#loc276 = loc("tmp83"(#loc124)) +#loc277 = loc("tmp83"(#loc125)) +#loc278 = loc("tmp83"(#loc126)) +#loc279 = loc("tmp83"(#loc127)) +#loc280 = loc("tmp83"(#loc128)) +#loc281 = loc("tmp83"(#loc129)) +#loc282 = loc("tmp88"(#loc130)) +#loc283 = loc("tmp89"(#loc131)) +#loc284 = loc("tmp89"(#loc132)) +#loc285 = loc("tmp89"(#loc133)) +#loc286 = loc("tmp91"(#loc134)) +#loc287 = loc("tmp94"(#loc135)) +#loc288 = loc("tmp95"(#loc136)) +#loc289 = loc("tmp101"(#loc137)) +#loc290 = loc("tmp104"(#loc138)) +#loc291 = loc("tmp107"(#loc139)) +#loc292 = loc("tmp109"(#loc140)) +#loc293 = loc("tmp110"(#loc141)) +#loc294 = loc("_tmp10"(#loc167)) +#loc295 = loc(callsite(#loc34 at #loc189)) +#loc297 = loc(callsite(#loc34 at #loc191)) +#loc299 = loc(callsite(#loc36 at #loc295)) +#loc300 = loc(callsite(#loc36 at #loc297)) diff --git a/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/__grp__triton_red_fused_add_mul_native_layer_norm_1.json b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/__grp__triton_red_fused_add_mul_native_layer_norm_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5da8c70ec4563eaad58dd39c369f70778d094060 --- /dev/null +++ b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/__grp__triton_red_fused_add_mul_native_layer_norm_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_add_mul_native_layer_norm_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.source", "triton_red_fused_add_mul_native_layer_norm_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.ttir", "triton_red_fused_add_mul_native_layer_norm_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.ttgir", "triton_red_fused_add_mul_native_layer_norm_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.llir", "triton_red_fused_add_mul_native_layer_norm_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.ptx", "triton_red_fused_add_mul_native_layer_norm_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.cubin", "triton_red_fused_add_mul_native_layer_norm_1.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.json"}} \ No newline at end of file diff --git a/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.cubin b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..b32ea6cb835745ced51fd66e3138059d7fcb0051 Binary files /dev/null and b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.cubin differ diff --git a/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.json b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c796febb9bed30f3999a3c4067529a2757119d6c --- /dev/null +++ b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.json @@ -0,0 +1 @@ +{"hash": "0a6a6fb84bfa69db7f7e1fbe544fcf86c6c18bef5d2113b0846f572ecd2865ee", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 192, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_add_mul_native_layer_norm_1"} \ No newline at end of file diff --git a/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.llir b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..d6b74e653c2151463c36afcc2ceea324a05a08a2 --- /dev/null +++ b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.llir @@ -0,0 +1,565 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_add_mul_native_layer_norm_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !5 { +__nv_rsqrtf.exit: + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %9 = icmp samesign ult i32 %8, 256, !dbg !9 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %11 = shl nuw nsw i32 %10, 2, !dbg !10 + %12 = and i32 %11, 2044, !dbg !10 + %13 = shl i32 %8, 12, !dbg !11 + %14 = or disjoint i32 %12, %13 + %15 = sext i32 %14 to i64, !dbg !12 + %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !13 + %17 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %18 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %16, i64 %17, i1 %9) #6, !dbg !14 + %19 = extractvalue { i32, i32 } %18, 1, !dbg !14 + %20 = bitcast i32 %19 to <2 x bfloat>, !dbg !14 + %21 = extractelement <2 x bfloat> %20, i64 1, !dbg !14 + %22 = fpext bfloat %21 to float, !dbg !15 + %23 = extractelement <2 x bfloat> %20, i64 0, !dbg !14 + %24 = fpext bfloat %23 to float, !dbg !15 + %25 = extractvalue { i32, i32 } %18, 0, !dbg !14 + %26 = bitcast i32 %25 to <2 x bfloat>, !dbg !14 + %27 = extractelement <2 x bfloat> %26, i64 1, !dbg !14 + %28 = fpext bfloat %27 to float, !dbg !15 + %29 = extractelement <2 x bfloat> %26, i64 0, !dbg !14 + %30 = fpext bfloat %29 to float, !dbg !15 + %31 = select i1 %9, float %30, float 0.000000e+00, !dbg !16 + %32 = select i1 %9, float %28, float 0.000000e+00, !dbg !16 + %33 = select i1 %9, float %24, float 0.000000e+00, !dbg !16 + %34 = select i1 %9, float %22, float 0.000000e+00, !dbg !16 + %35 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !13 + %36 = getelementptr i8, ptr addrspace(1) %35, i64 4096, !dbg !13 + %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %38 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %36, i64 %37, i1 %9) #6, !dbg !14 + %39 = extractvalue { i32, i32 } %38, 0, !dbg !14 + %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !14 + %41 = extractelement <2 x bfloat> %40, i64 0, !dbg !14 + %42 = fpext bfloat %41 to float, !dbg !15 + %43 = fsub float %42, %31, !dbg !17 + %44 = select i1 %9, float 2.000000e+00, float 1.000000e+00, !dbg !22 + %45 = tail call float @llvm.nvvm.div.full(float %43, float %44), !dbg !23 + %46 = fadd float %31, %45, !dbg !24 + %47 = fsub float %42, %46, !dbg !25 + %48 = fmul float %43, %47, !dbg !26 + %49 = fadd float %48, 0.000000e+00, !dbg !27 + %50 = extractelement <2 x bfloat> %40, i64 1, !dbg !14 + %51 = fpext bfloat %50 to float, !dbg !15 + %52 = fsub float %51, %32, !dbg !17 + %53 = tail call float @llvm.nvvm.div.full(float %52, float %44), !dbg !23 + %54 = fadd float %32, %53, !dbg !24 + %55 = fsub float %51, %54, !dbg !25 + %56 = fmul float %52, %55, !dbg !26 + %57 = fadd float %56, 0.000000e+00, !dbg !27 + %58 = extractvalue { i32, i32 } %38, 1, !dbg !14 + %59 = bitcast i32 %58 to <2 x bfloat>, !dbg !14 + %60 = extractelement <2 x bfloat> %59, i64 0, !dbg !14 + %61 = fpext bfloat %60 to float, !dbg !15 + %62 = fsub float %61, %33, !dbg !17 + %63 = tail call float @llvm.nvvm.div.full(float %62, float %44), !dbg !23 + %64 = fadd float %33, %63, !dbg !24 + %65 = fsub float %61, %64, !dbg !25 + %66 = fmul float %62, %65, !dbg !26 + %67 = fadd float %66, 0.000000e+00, !dbg !27 + %68 = extractelement <2 x bfloat> %59, i64 1, !dbg !14 + %69 = fpext bfloat %68 to float, !dbg !15 + %70 = fsub float %69, %34, !dbg !17 + %71 = tail call float @llvm.nvvm.div.full(float %70, float %44), !dbg !23 + %72 = fadd float %34, %71, !dbg !24 + %73 = fsub float %69, %72, !dbg !25 + %74 = fmul float %70, %73, !dbg !26 + %75 = fadd float %74, 0.000000e+00, !dbg !27 + %76 = select i1 %9, float %46, float 0.000000e+00, !dbg !16 + %77 = select i1 %9, float %54, float 0.000000e+00, !dbg !16 + %78 = select i1 %9, float %64, float 0.000000e+00, !dbg !16 + %79 = select i1 %9, float %72, float 0.000000e+00, !dbg !16 + %80 = select i1 %9, float %67, float 0.000000e+00, !dbg !28 + %81 = select i1 %9, float %75, float 0.000000e+00, !dbg !28 + %82 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !22 + %83 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !22 + %84 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !22 + %85 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !22 + %86 = and i32 %10, 511, !dbg !10 + %87 = and i32 %10, 31, !dbg !10 + %88 = lshr i32 %86, 5, !dbg !10 + %89 = fsub float %77, %76, !dbg !29 + %90 = select i1 %9, float 4.000000e+00, float 0.000000e+00, !dbg !32 + %91 = fcmp oeq float %90, 0.000000e+00, !dbg !33 + %92 = tail call float @llvm.nvvm.div.full(float %83, float %90), !dbg !34 + %93 = select i1 %91, float 0.000000e+00, float %92, !dbg !35 + %94 = fmul float %89, %93, !dbg !36 + %95 = fadd float %76, %94, !dbg !37 + %96 = fadd float %49, %57, !dbg !38 + %97 = select i1 %9, float %96, float 0.000000e+00, !dbg !38 + %98 = fmul float %89, %89, !dbg !39 + %99 = fmul float %98, %82, !dbg !40 + %100 = fmul float %99, %93, !dbg !41 + %101 = fadd float %97, %100, !dbg !42 + %102 = fsub float %78, %95, !dbg !29 + %103 = select i1 %9, float 6.000000e+00, float 0.000000e+00, !dbg !32 + %104 = fcmp oeq float %103, 0.000000e+00, !dbg !33 + %105 = tail call float @llvm.nvvm.div.full(float %84, float %103), !dbg !34 + %106 = select i1 %104, float 0.000000e+00, float %105, !dbg !35 + %107 = fmul float %106, %102, !dbg !36 + %108 = fadd float %95, %107, !dbg !37 + %109 = fadd float %80, %101, !dbg !38 + %110 = fmul float %102, %102, !dbg !39 + %111 = fmul float %90, %110, !dbg !40 + %112 = fmul float %106, %111, !dbg !41 + %113 = fadd float %109, %112, !dbg !42 + %114 = fsub float %79, %108, !dbg !29 + %115 = select i1 %9, float 8.000000e+00, float 0.000000e+00, !dbg !32 + %116 = fcmp oeq float %115, 0.000000e+00, !dbg !33 + %117 = tail call float @llvm.nvvm.div.full(float %85, float %115), !dbg !34 + %118 = select i1 %116, float 0.000000e+00, float %117, !dbg !35 + %119 = fmul float %118, %114, !dbg !36 + %120 = fadd float %108, %119, !dbg !37 + %121 = fadd float %81, %113, !dbg !38 + %122 = fmul float %114, %114, !dbg !39 + %123 = fmul float %103, %122, !dbg !40 + %124 = fmul float %118, %123, !dbg !41 + %125 = fadd float %121, %124, !dbg !42 + %126 = bitcast float %120 to i32, !dbg !30 + %127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 16, i32 31), !dbg !30 + %128 = bitcast i32 %127 to float, !dbg !30 + %129 = bitcast float %125 to i32, !dbg !30 + %130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 16, i32 31), !dbg !30 + %131 = bitcast i32 %130 to float, !dbg !30 + %132 = bitcast float %115 to i32, !dbg !30 + %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 16, i32 31), !dbg !30 + %134 = bitcast i32 %133 to float, !dbg !30 + %135 = fsub float %128, %120, !dbg !29 + %136 = fadd float %115, %134, !dbg !32 + %137 = fcmp oeq float %136, 0.000000e+00, !dbg !33 + %138 = tail call float @llvm.nvvm.div.full(float %134, float %136), !dbg !34 + %139 = select i1 %137, float 0.000000e+00, float %138, !dbg !35 + %140 = fmul float %139, %135, !dbg !36 + %141 = fadd float %120, %140, !dbg !37 + %142 = fadd float %125, %131, !dbg !38 + %143 = fmul float %135, %135, !dbg !39 + %144 = fmul float %115, %143, !dbg !40 + %145 = fmul float %139, %144, !dbg !41 + %146 = fadd float %142, %145, !dbg !42 + %147 = bitcast float %141 to i32, !dbg !30 + %148 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %147, i32 8, i32 31), !dbg !30 + %149 = bitcast i32 %148 to float, !dbg !30 + %150 = bitcast float %146 to i32, !dbg !30 + %151 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %150, i32 8, i32 31), !dbg !30 + %152 = bitcast i32 %151 to float, !dbg !30 + %153 = bitcast float %136 to i32, !dbg !30 + %154 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 8, i32 31), !dbg !30 + %155 = bitcast i32 %154 to float, !dbg !30 + %156 = fsub float %149, %141, !dbg !29 + %157 = fadd float %136, %155, !dbg !32 + %158 = fcmp oeq float %157, 0.000000e+00, !dbg !33 + %159 = tail call float @llvm.nvvm.div.full(float %155, float %157), !dbg !34 + %160 = select i1 %158, float 0.000000e+00, float %159, !dbg !35 + %161 = fmul float %156, %160, !dbg !36 + %162 = fadd float %141, %161, !dbg !37 + %163 = fadd float %146, %152, !dbg !38 + %164 = fmul float %156, %156, !dbg !39 + %165 = fmul float %136, %164, !dbg !40 + %166 = fmul float %160, %165, !dbg !41 + %167 = fadd float %163, %166, !dbg !42 + %168 = bitcast float %162 to i32, !dbg !30 + %169 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %168, i32 4, i32 31), !dbg !30 + %170 = bitcast i32 %169 to float, !dbg !30 + %171 = bitcast float %167 to i32, !dbg !30 + %172 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %171, i32 4, i32 31), !dbg !30 + %173 = bitcast i32 %172 to float, !dbg !30 + %174 = bitcast float %157 to i32, !dbg !30 + %175 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %174, i32 4, i32 31), !dbg !30 + %176 = bitcast i32 %175 to float, !dbg !30 + %177 = fsub float %170, %162, !dbg !29 + %178 = fadd float %157, %176, !dbg !32 + %179 = fcmp oeq float %178, 0.000000e+00, !dbg !33 + %180 = tail call float @llvm.nvvm.div.full(float %176, float %178), !dbg !34 + %181 = select i1 %179, float 0.000000e+00, float %180, !dbg !35 + %182 = fmul float %177, %181, !dbg !36 + %183 = fadd float %162, %182, !dbg !37 + %184 = fadd float %167, %173, !dbg !38 + %185 = fmul float %177, %177, !dbg !39 + %186 = fmul float %157, %185, !dbg !40 + %187 = fmul float %181, %186, !dbg !41 + %188 = fadd float %184, %187, !dbg !42 + %189 = bitcast float %183 to i32, !dbg !30 + %190 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %189, i32 2, i32 31), !dbg !30 + %191 = bitcast i32 %190 to float, !dbg !30 + %192 = bitcast float %188 to i32, !dbg !30 + %193 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %192, i32 2, i32 31), !dbg !30 + %194 = bitcast i32 %193 to float, !dbg !30 + %195 = bitcast float %178 to i32, !dbg !30 + %196 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %195, i32 2, i32 31), !dbg !30 + %197 = bitcast i32 %196 to float, !dbg !30 + %198 = fsub float %191, %183, !dbg !29 + %199 = fadd float %178, %197, !dbg !32 + %200 = fcmp oeq float %199, 0.000000e+00, !dbg !33 + %201 = tail call float @llvm.nvvm.div.full(float %197, float %199), !dbg !34 + %202 = select i1 %200, float 0.000000e+00, float %201, !dbg !35 + %203 = fmul float %198, %202, !dbg !36 + %204 = fadd float %183, %203, !dbg !37 + %205 = fadd float %188, %194, !dbg !38 + %206 = fmul float %198, %198, !dbg !39 + %207 = fmul float %178, %206, !dbg !40 + %208 = fmul float %202, %207, !dbg !41 + %209 = fadd float %205, %208, !dbg !42 + %210 = bitcast float %204 to i32, !dbg !30 + %211 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %210, i32 1, i32 31), !dbg !30 + %212 = bitcast i32 %211 to float, !dbg !30 + %213 = bitcast float %209 to i32, !dbg !30 + %214 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %213, i32 1, i32 31), !dbg !30 + %215 = bitcast i32 %214 to float, !dbg !30 + %216 = bitcast float %199 to i32, !dbg !30 + %217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 1, i32 31), !dbg !30 + %218 = bitcast i32 %217 to float, !dbg !30 + %219 = fsub float %212, %204, !dbg !29 + %220 = fadd float %199, %218, !dbg !32 + %221 = fcmp oeq float %220, 0.000000e+00, !dbg !33 + %222 = tail call float @llvm.nvvm.div.full(float %218, float %220), !dbg !34 + %223 = select i1 %221, float 0.000000e+00, float %222, !dbg !35 + %224 = fmul float %219, %223, !dbg !36 + %225 = fadd float %204, %224, !dbg !37 + %226 = fadd float %209, %215, !dbg !38 + %227 = fmul float %219, %219, !dbg !39 + %228 = fmul float %199, %227, !dbg !40 + %229 = fmul float %223, %228, !dbg !41 + %230 = fadd float %226, %229, !dbg !42 + %231 = icmp eq i32 %87, 0, !dbg !30 + %232 = getelementptr float, ptr addrspace(3) @global_smem, i32 %88, !dbg !30 + %233 = bitcast float %225 to <1 x i32>, !dbg !30 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %232, <1 x i32> %233, i1 %231) #6, !dbg !30 + %234 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %88, !dbg !30 + %235 = bitcast float %230 to <1 x i32>, !dbg !30 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %234, <1 x i32> %235, i1 %231) #6, !dbg !30 + %236 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %88, !dbg !30 + %237 = bitcast float %220 to <1 x i32>, !dbg !30 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %236, <1 x i32> %237, i1 %231) #6, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !30 + %238 = icmp samesign ult i32 %86, 16, !dbg !30 + %239 = getelementptr float, ptr addrspace(3) @global_smem, i32 %86, !dbg !30 + %240 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %239, i1 %238) #6, !dbg !30 + %241 = bitcast i32 %240 to float, !dbg !30 + %242 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %86, !dbg !30 + %243 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %242, i1 %238) #6, !dbg !30 + %244 = bitcast i32 %243 to float, !dbg !30 + %245 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %86, !dbg !30 + %246 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %245, i1 %238) #6, !dbg !30 + %247 = bitcast i32 %246 to float, !dbg !30 + %248 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %240, i32 8, i32 31), !dbg !30 + %249 = bitcast i32 %248 to float, !dbg !30 + %250 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %243, i32 8, i32 31), !dbg !30 + %251 = bitcast i32 %250 to float, !dbg !30 + %252 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %246, i32 8, i32 31), !dbg !30 + %253 = bitcast i32 %252 to float, !dbg !30 + %254 = fsub float %249, %241, !dbg !29 + %255 = fadd float %247, %253, !dbg !32 + %256 = fcmp oeq float %255, 0.000000e+00, !dbg !33 + %257 = tail call float @llvm.nvvm.div.full(float %253, float %255), !dbg !34 + %258 = select i1 %256, float 0.000000e+00, float %257, !dbg !35 + %259 = fmul float %254, %258, !dbg !36 + %260 = fadd float %259, %241, !dbg !37 + %261 = fadd float %244, %251, !dbg !38 + %262 = fmul float %254, %254, !dbg !39 + %263 = fmul float %262, %247, !dbg !40 + %264 = fmul float %263, %258, !dbg !41 + %265 = fadd float %261, %264, !dbg !42 + %266 = bitcast float %260 to i32, !dbg !30 + %267 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %266, i32 4, i32 31), !dbg !30 + %268 = bitcast i32 %267 to float, !dbg !30 + %269 = bitcast float %265 to i32, !dbg !30 + %270 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %269, i32 4, i32 31), !dbg !30 + %271 = bitcast i32 %270 to float, !dbg !30 + %272 = bitcast float %255 to i32, !dbg !30 + %273 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 4, i32 31), !dbg !30 + %274 = bitcast i32 %273 to float, !dbg !30 + %275 = fsub float %268, %260, !dbg !29 + %276 = fadd float %255, %274, !dbg !32 + %277 = fcmp oeq float %276, 0.000000e+00, !dbg !33 + %278 = tail call float @llvm.nvvm.div.full(float %274, float %276), !dbg !34 + %279 = select i1 %277, float 0.000000e+00, float %278, !dbg !35 + %280 = fmul float %275, %279, !dbg !36 + %281 = fadd float %260, %280, !dbg !37 + %282 = fadd float %265, %271, !dbg !38 + %283 = fmul float %275, %275, !dbg !39 + %284 = fmul float %255, %283, !dbg !40 + %285 = fmul float %279, %284, !dbg !41 + %286 = fadd float %282, %285, !dbg !42 + %287 = bitcast float %281 to i32, !dbg !30 + %288 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %287, i32 2, i32 31), !dbg !30 + %289 = bitcast i32 %288 to float, !dbg !30 + %290 = bitcast float %286 to i32, !dbg !30 + %291 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %290, i32 2, i32 31), !dbg !30 + %292 = bitcast i32 %291 to float, !dbg !30 + %293 = bitcast float %276 to i32, !dbg !30 + %294 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %293, i32 2, i32 31), !dbg !30 + %295 = bitcast i32 %294 to float, !dbg !30 + %296 = fsub float %289, %281, !dbg !29 + %297 = fadd float %276, %295, !dbg !32 + %298 = fcmp oeq float %297, 0.000000e+00, !dbg !33 + %299 = tail call float @llvm.nvvm.div.full(float %295, float %297), !dbg !34 + %300 = select i1 %298, float 0.000000e+00, float %299, !dbg !35 + %301 = fmul float %296, %300, !dbg !36 + %302 = fadd float %281, %301, !dbg !37 + %303 = fadd float %286, %292, !dbg !38 + %304 = fmul float %296, %296, !dbg !39 + %305 = fmul float %276, %304, !dbg !40 + %306 = fmul float %300, %305, !dbg !41 + %307 = fadd float %303, %306, !dbg !42 + %308 = bitcast float %302 to i32, !dbg !30 + %309 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %308, i32 1, i32 31), !dbg !30 + %310 = bitcast i32 %309 to float, !dbg !30 + %311 = bitcast float %307 to i32, !dbg !30 + %312 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %311, i32 1, i32 31), !dbg !30 + %313 = bitcast i32 %312 to float, !dbg !30 + %314 = bitcast float %297 to i32, !dbg !30 + %315 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %314, i32 1, i32 31), !dbg !30 + %316 = bitcast i32 %315 to float, !dbg !30 + %317 = fsub float %310, %302, !dbg !29 + %318 = fadd float %297, %316, !dbg !32 + %319 = fcmp oeq float %318, 0.000000e+00, !dbg !33 + %320 = tail call float @llvm.nvvm.div.full(float %316, float %318), !dbg !34 + %321 = select i1 %319, float 0.000000e+00, float %320, !dbg !35 + %322 = fmul float %317, %321, !dbg !36 + %323 = fadd float %302, %322, !dbg !37 + %324 = fadd float %307, %313, !dbg !38 + %325 = fmul float %317, %317, !dbg !39 + %326 = fmul float %297, %325, !dbg !40 + %327 = fmul float %321, %326, !dbg !41 + %328 = fadd float %324, %327, !dbg !42 + %329 = and i32 %10, 15, !dbg !30 + %330 = icmp eq i32 %329, 0, !dbg !30 + %331 = and i1 %238, %330, !dbg !30 + %332 = bitcast float %323 to <1 x i32>, !dbg !30 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %239, <1 x i32> %332, i1 %331) #6, !dbg !30 + %333 = bitcast float %328 to <1 x i32>, !dbg !30 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %242, <1 x i32> %333, i1 %331) #6, !dbg !30 + %334 = bitcast float %318 to <1 x i32>, !dbg !30 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %245, <1 x i32> %334, i1 %331) #6, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !30 + %335 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !30 + %336 = load float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !30 + %337 = tail call float @llvm.nvvm.div.full(float %336, float 4.096000e+03), !dbg !43 + %338 = fadd float %337, 0x3EB0C6F7A0000000, !dbg !44 + %339 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %340 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %341 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %342 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i15 = icmp eq i32 %342, 0, !dbg !45 + br i1 %.not.i15, label %345, label %343, !dbg !45 + +343: ; preds = %__nv_rsqrtf.exit + %344 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %338), !dbg !45 + br label %__nv_rsqrtf.exit17, !dbg !45 + +345: ; preds = %__nv_rsqrtf.exit + %346 = tail call float @llvm.nvvm.rsqrt.approx.f(float %338), !dbg !45 + br label %__nv_rsqrtf.exit17, !dbg !45 + +__nv_rsqrtf.exit17: ; preds = %343, %345 + %.0.i16 = phi float [ %344, %343 ], [ %346, %345 ], !dbg !45 + %347 = zext nneg i32 %12 to i64, !dbg !46 + %348 = sext i32 %13 to i64, !dbg !46 + %349 = getelementptr bfloat, ptr addrspace(1) %1, i64 %347, !dbg !47 + %350 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !48 + %351 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %349, i64 %350, i1 true) #6, !dbg !48 + %352 = extractvalue { i32, i32 } %351, 0, !dbg !48 + %353 = bitcast i32 %352 to <2 x bfloat>, !dbg !48 + %354 = extractvalue { i32, i32 } %351, 1, !dbg !48 + %355 = bitcast i32 %354 to <2 x bfloat>, !dbg !48 + %356 = or disjoint i64 %347, %348, !dbg !49 + %357 = getelementptr bfloat, ptr addrspace(1) %0, i64 %356, !dbg !50 + %358 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !51 + %359 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %357, i64 %358, i1 %9) #6, !dbg !51 + %360 = extractvalue { i32, i32 } %359, 0, !dbg !51 + %361 = bitcast i32 %360 to <2 x bfloat>, !dbg !51 + %362 = extractvalue { i32, i32 } %359, 1, !dbg !51 + %363 = bitcast i32 %362 to <2 x bfloat>, !dbg !51 + %364 = getelementptr bfloat, ptr addrspace(1) %2, i64 %347, !dbg !52 + %365 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !53 + %366 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %364, i64 %365, i1 true) #6, !dbg !53 + %367 = extractvalue { i32, i32 } %366, 0, !dbg !53 + %368 = bitcast i32 %367 to <2 x bfloat>, !dbg !53 + %369 = extractvalue { i32, i32 } %366, 1, !dbg !53 + %370 = bitcast i32 %369 to <2 x bfloat>, !dbg !53 + %371 = getelementptr bfloat, ptr addrspace(1) %3, i64 %356, !dbg !54 + %372 = fpext <2 x bfloat> %353 to <2 x float>, !dbg !55 + %373 = fpext <2 x bfloat> %361 to <2 x float>, !dbg !56 + %374 = fpext <2 x bfloat> %368 to <2 x float>, !dbg !57 + %375 = fadd <2 x float> %372, splat (float 1.000000e+00), !dbg !58 + %376 = insertelement <2 x float> poison, float %335, i64 0, !dbg !59 + %377 = shufflevector <2 x float> %376, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !59 + %378 = fsub <2 x float> %373, %377, !dbg !59 + %379 = insertelement <2 x float> poison, float %.0.i16, i64 0, !dbg !60 + %380 = shufflevector <2 x float> %379, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !60 + %381 = fmul <2 x float> %380, %378, !dbg !60 + %382 = fmul <2 x float> %375, %381, !dbg !61 + %383 = fadd <2 x float> %382, %374, !dbg !62 + %384 = fptrunc <2 x float> %383 to <2 x bfloat>, !dbg !63 + %385 = fpext <2 x bfloat> %355 to <2 x float>, !dbg !55 + %386 = fpext <2 x bfloat> %363 to <2 x float>, !dbg !56 + %387 = fpext <2 x bfloat> %370 to <2 x float>, !dbg !57 + %388 = fadd <2 x float> %385, splat (float 1.000000e+00), !dbg !58 + %389 = fsub <2 x float> %386, %377, !dbg !59 + %390 = fmul <2 x float> %380, %389, !dbg !60 + %391 = fmul <2 x float> %388, %390, !dbg !61 + %392 = fadd <2 x float> %391, %387, !dbg !62 + %393 = fptrunc <2 x float> %392 to <2 x bfloat>, !dbg !63 + %394 = bitcast <2 x bfloat> %384 to i32, !dbg !63 + %395 = bitcast <2 x bfloat> %393 to i32, !dbg !63 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %394, i32 %395, ptr addrspace(1) %371, i1 %9) #6, !dbg !63 + %396 = or disjoint i64 %347, 2048, !dbg !64 + %397 = getelementptr bfloat, ptr addrspace(1) %1, i64 %396, !dbg !47 + %398 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !48 + %399 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %397, i64 %398, i1 true) #6, !dbg !48 + %400 = extractvalue { i32, i32 } %399, 0, !dbg !48 + %401 = bitcast i32 %400 to <2 x bfloat>, !dbg !48 + %402 = extractvalue { i32, i32 } %399, 1, !dbg !48 + %403 = bitcast i32 %402 to <2 x bfloat>, !dbg !48 + %404 = or disjoint i64 %396, %348, !dbg !49 + %405 = getelementptr bfloat, ptr addrspace(1) %0, i64 %404, !dbg !50 + %406 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !51 + %407 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %405, i64 %406, i1 %9) #6, !dbg !51 + %408 = extractvalue { i32, i32 } %407, 0, !dbg !51 + %409 = bitcast i32 %408 to <2 x bfloat>, !dbg !51 + %410 = extractvalue { i32, i32 } %407, 1, !dbg !51 + %411 = bitcast i32 %410 to <2 x bfloat>, !dbg !51 + %412 = getelementptr bfloat, ptr addrspace(1) %2, i64 %396, !dbg !52 + %413 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !53 + %414 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %412, i64 %413, i1 true) #6, !dbg !53 + %415 = extractvalue { i32, i32 } %414, 0, !dbg !53 + %416 = bitcast i32 %415 to <2 x bfloat>, !dbg !53 + %417 = extractvalue { i32, i32 } %414, 1, !dbg !53 + %418 = bitcast i32 %417 to <2 x bfloat>, !dbg !53 + %419 = getelementptr bfloat, ptr addrspace(1) %3, i64 %404, !dbg !54 + %420 = fpext <2 x bfloat> %401 to <2 x float>, !dbg !55 + %421 = fpext <2 x bfloat> %409 to <2 x float>, !dbg !56 + %422 = fpext <2 x bfloat> %416 to <2 x float>, !dbg !57 + %423 = fadd <2 x float> %420, splat (float 1.000000e+00), !dbg !58 + %424 = fsub <2 x float> %421, %377, !dbg !59 + %425 = fmul <2 x float> %380, %424, !dbg !60 + %426 = fmul <2 x float> %423, %425, !dbg !61 + %427 = fadd <2 x float> %426, %422, !dbg !62 + %428 = fptrunc <2 x float> %427 to <2 x bfloat>, !dbg !63 + %429 = fpext <2 x bfloat> %403 to <2 x float>, !dbg !55 + %430 = fpext <2 x bfloat> %411 to <2 x float>, !dbg !56 + %431 = fpext <2 x bfloat> %418 to <2 x float>, !dbg !57 + %432 = fadd <2 x float> %429, splat (float 1.000000e+00), !dbg !58 + %433 = fsub <2 x float> %430, %377, !dbg !59 + %434 = fmul <2 x float> %380, %433, !dbg !60 + %435 = fmul <2 x float> %432, %434, !dbg !61 + %436 = fadd <2 x float> %435, %431, !dbg !62 + %437 = fptrunc <2 x float> %436 to <2 x bfloat>, !dbg !63 + %438 = bitcast <2 x bfloat> %428 to i32, !dbg !63 + %439 = bitcast <2 x bfloat> %437 to i32, !dbg !63 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %438, i32 %439, ptr addrspace(1) %419, i1 %9) #6, !dbg !63 + ret void, !dbg !65 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused_add_mul_native_layer_norm_1", linkageName: "triton_red_fused_add_mul_native_layer_norm_1", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 25, column: 21, scope: !5) +!10 = !DILocation(line: 26, column: 37, scope: !5) +!11 = !DILocation(line: 38, column: 46, scope: !5) +!12 = !DILocation(line: 32, column: 43, scope: !5) +!13 = !DILocation(line: 38, column: 34, scope: !5) +!14 = !DILocation(line: 38, column: 51, scope: !5) +!15 = !DILocation(line: 38, column: 112, scope: !5) +!16 = !DILocation(line: 44, column: 62, scope: !5) +!17 = !DILocation(line: 222, column: 24, scope: !18, inlinedAt: !20) +!18 = distinct !DILexicalBlockFile(scope: !5, file: !19, discriminator: 0) +!19 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime") +!20 = !DILocation(line: 42, column: 51, scope: !21) +!21 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!22 = !DILocation(line: 46, column: 66, scope: !5) +!23 = !DILocation(line: 224, column: 34, scope: !18, inlinedAt: !20) +!24 = !DILocation(line: 224, column: 26, scope: !18, inlinedAt: !20) +!25 = !DILocation(line: 225, column: 39, scope: !18, inlinedAt: !20) +!26 = !DILocation(line: 225, column: 31, scope: !18, inlinedAt: !20) +!27 = !DILocation(line: 225, column: 22, scope: !18, inlinedAt: !20) +!28 = !DILocation(line: 45, column: 58, scope: !5) +!29 = !DILocation(line: 231, column: 21, scope: !18, inlinedAt: !30) +!30 = !DILocation(line: 243, column: 46, scope: !18, inlinedAt: !31) +!31 = !DILocation(line: 47, column: 79, scope: !21) +!32 = !DILocation(line: 232, column: 28, scope: !18, inlinedAt: !30) +!33 = !DILocation(line: 233, column: 39, scope: !18, inlinedAt: !30) +!34 = !DILocation(line: 233, column: 60, scope: !18, inlinedAt: !30) +!35 = !DILocation(line: 233, column: 49, scope: !18, inlinedAt: !30) +!36 = !DILocation(line: 235, column: 25, scope: !18, inlinedAt: !30) +!37 = !DILocation(line: 235, column: 17, scope: !18, inlinedAt: !30) +!38 = !DILocation(line: 236, column: 15, scope: !18, inlinedAt: !30) +!39 = !DILocation(line: 236, column: 30, scope: !18, inlinedAt: !30) +!40 = !DILocation(line: 236, column: 38, scope: !18, inlinedAt: !30) +!41 = !DILocation(line: 236, column: 49, scope: !18, inlinedAt: !30) +!42 = !DILocation(line: 236, column: 22, scope: !18, inlinedAt: !30) +!43 = !DILocation(line: 65, column: 24, scope: !5) +!44 = !DILocation(line: 67, column: 24, scope: !5) +!45 = !DILocation(line: 68, column: 32, scope: !5) +!46 = !DILocation(line: 51, column: 43, scope: !5) +!47 = !DILocation(line: 57, column: 34, scope: !5) +!48 = !DILocation(line: 57, column: 41, scope: !5) +!49 = !DILocation(line: 58, column: 42, scope: !5) +!50 = !DILocation(line: 58, column: 35, scope: !5) +!51 = !DILocation(line: 58, column: 52, scope: !5) +!52 = !DILocation(line: 59, column: 35, scope: !5) +!53 = !DILocation(line: 59, column: 42, scope: !5) +!54 = !DILocation(line: 73, column: 29, scope: !5) +!55 = !DILocation(line: 57, column: 94, scope: !5) +!56 = !DILocation(line: 58, column: 114, scope: !5) +!57 = !DILocation(line: 59, column: 95, scope: !5) +!58 = !DILocation(line: 61, column: 23, scope: !5) +!59 = !DILocation(line: 63, column: 24, scope: !5) +!60 = !DILocation(line: 69, column: 24, scope: !5) +!61 = !DILocation(line: 71, column: 24, scope: !5) +!62 = !DILocation(line: 72, column: 24, scope: !5) +!63 = !DILocation(line: 73, column: 53, scope: !5) +!64 = !DILocation(line: 52, column: 31, scope: !5) +!65 = !DILocation(line: 51, column: 4, scope: !5) diff --git a/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.ptx b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..6c2edda7459d25df6b9e0e416bf2f6dea1092073 --- /dev/null +++ b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.ptx @@ -0,0 +1,1089 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused_add_mul_native_layer_norm_1 // -- Begin function triton_red_fused_add_mul_native_layer_norm_1 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused_add_mul_native_layer_norm_1 +.visible .entry triton_red_fused_add_mul_native_layer_norm_1( + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_1_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_1_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_1_param_3, + .param .u32 triton_red_fused_add_mul_native_layer_norm_1_param_4, + .param .u32 triton_red_fused_add_mul_native_layer_norm_1_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_1_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_1_param_7 +) +.reqntid 512 +{ + .reg .pred %p<19>; + .reg .b16 %rs<33>; + .reg .b32 %r<282>; + .reg .b64 %rd<28>; + .loc 1 18 0 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:18:0 + +// %bb.0: // %__nv_rsqrtf.exit + ld.param.b64 %rd19, [triton_red_fused_add_mul_native_layer_norm_1_param_0]; + ld.param.b64 %rd20, [triton_red_fused_add_mul_native_layer_norm_1_param_1]; +$L__tmp0: + .loc 1 23 28 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:23:28 + mov.u32 %r37, %ctaid.x; + .loc 1 25 21 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:25:21 + setp.lt.u32 %p1, %r37, 256; + ld.param.b64 %rd21, [triton_red_fused_add_mul_native_layer_norm_1_param_2]; + ld.param.b64 %rd22, [triton_red_fused_add_mul_native_layer_norm_1_param_3]; + .loc 1 26 37 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:26:37 + mov.u32 %r38, %tid.x; + shl.b32 %r39, %r38, 2; + and.b32 %r40, %r39, 2044; + .loc 1 38 46 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:46 + shl.b32 %r41, %r37, 12; + or.b32 %r42, %r40, %r41; + .loc 1 38 34 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:34 + mad.wide.s32 %rd1, %r42, 2, %rd19; + .loc 1 38 51 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:51 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r3, 0; + // begin inline asm + mov.u32 %r1, %r3; + mov.u32 %r2, %r3; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2; + // end inline asm + mov.b32 {%rs1, %rs2}, %r2; + .loc 1 38 112 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:112 + cvt.f32.bf16 %r43, %rs2; + cvt.f32.bf16 %r44, %rs1; + .loc 1 38 51 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:51 + mov.b32 {%rs3, %rs4}, %r1; + .loc 1 38 112 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:112 + cvt.f32.bf16 %r45, %rs4; + cvt.f32.bf16 %r46, %rs3; + .loc 1 44 62 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:44:62 + selp.f32 %r47, %r46, 0f00000000, %p1; + selp.f32 %r48, %r45, 0f00000000, %p1; + selp.f32 %r49, %r44, 0f00000000, %p1; + selp.f32 %r50, %r43, 0f00000000, %p1; + .loc 1 38 34 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:34 + add.s64 %rd3, %rd1, 4096; + .loc 1 38 51 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:51 + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r4, %r3; + mov.u32 %r5, %r3; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r4, %r5 }, [ %rd3 + 0 ], %rd4; + // end inline asm + mov.b32 {%rs5, %rs6}, %r4; + .loc 1 38 112 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:112 + cvt.f32.bf16 %r51, %rs5; +$L__tmp1: + .loc 2 222 24 // triton_helpers.py:222:24 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ] + sub.f32 %r52, %r51, %r47; +$L__tmp2: + .loc 1 46 66 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:46:66 + selp.f32 %r53, 0f40000000, 0f3F800000, %p1; +$L__tmp3: + .loc 2 224 34 // triton_helpers.py:224:34 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ] + div.full.f32 %r54, %r52, %r53; + .loc 2 224 26 // triton_helpers.py:224:26 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ] + add.f32 %r55, %r47, %r54; + .loc 2 225 39 // triton_helpers.py:225:39 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ] + sub.f32 %r56, %r51, %r55; + .loc 2 225 22 // triton_helpers.py:225:22 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ] + fma.rn.f32 %r57, %r52, %r56, 0f00000000; +$L__tmp4: + .loc 1 38 112 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:112 + cvt.f32.bf16 %r58, %rs6; +$L__tmp5: + .loc 2 222 24 // triton_helpers.py:222:24 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ] + sub.f32 %r59, %r58, %r48; + .loc 2 224 34 // triton_helpers.py:224:34 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ] + div.full.f32 %r60, %r59, %r53; + .loc 2 224 26 // triton_helpers.py:224:26 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ] + add.f32 %r61, %r48, %r60; + .loc 2 225 39 // triton_helpers.py:225:39 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ] + sub.f32 %r62, %r58, %r61; + .loc 2 225 22 // triton_helpers.py:225:22 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ] + fma.rn.f32 %r63, %r59, %r62, 0f00000000; +$L__tmp6: + .loc 1 38 51 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:51 + mov.b32 {%rs7, %rs8}, %r5; + .loc 1 38 112 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:112 + cvt.f32.bf16 %r64, %rs7; +$L__tmp7: + .loc 2 222 24 // triton_helpers.py:222:24 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ] + sub.f32 %r65, %r64, %r49; + .loc 2 224 34 // triton_helpers.py:224:34 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ] + div.full.f32 %r66, %r65, %r53; + .loc 2 224 26 // triton_helpers.py:224:26 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ] + add.f32 %r67, %r49, %r66; + .loc 2 225 39 // triton_helpers.py:225:39 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ] + sub.f32 %r68, %r64, %r67; + .loc 2 225 22 // triton_helpers.py:225:22 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ] + fma.rn.f32 %r69, %r65, %r68, 0f00000000; +$L__tmp8: + .loc 1 38 112 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:112 + cvt.f32.bf16 %r70, %rs8; +$L__tmp9: + .loc 2 222 24 // triton_helpers.py:222:24 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ] + sub.f32 %r71, %r70, %r50; + .loc 2 224 34 // triton_helpers.py:224:34 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ] + div.full.f32 %r72, %r71, %r53; + .loc 2 224 26 // triton_helpers.py:224:26 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ] + add.f32 %r73, %r50, %r72; + .loc 2 225 39 // triton_helpers.py:225:39 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ] + sub.f32 %r74, %r70, %r73; + .loc 2 225 22 // triton_helpers.py:225:22 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ] + fma.rn.f32 %r75, %r71, %r74, 0f00000000; +$L__tmp10: + .loc 1 44 62 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:44:62 + selp.f32 %r76, %r55, 0f00000000, %p1; + selp.f32 %r77, %r61, 0f00000000, %p1; + selp.f32 %r78, %r67, 0f00000000, %p1; + selp.f32 %r79, %r73, 0f00000000, %p1; + .loc 1 45 58 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:45:58 + selp.f32 %r80, %r69, 0f00000000, %p1; + selp.f32 %r81, %r75, 0f00000000, %p1; + .loc 1 46 66 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:46:66 + selp.f32 %r82, 0f40000000, 0f00000000, %p1; + .loc 1 26 37 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:26:37 + and.b32 %r83, %r38, 511; + and.b32 %r84, %r38, 31; +$L__tmp11: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r85, %r77, %r76; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r86, 0f40800000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p6, %r86, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r87, %r82, %r86; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r88, 0f00000000, %r87, %p6; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r89, %r85, %r88, %r76; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r90, %r57, %r63; + selp.f32 %r91, %r90, 0f00000000, %p1; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r92, %r85, %r85; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r93, %r92, %r82; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r94, %r93, %r88, %r91; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r95, %r78, %r89; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r96, 0f40C00000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p7, %r96, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r97, %r82, %r96; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r98, 0f00000000, %r97, %p7; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r99, %r98, %r95, %r89; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r100, %r80, %r94; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r101, %r95, %r95; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r102, %r86, %r101; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r103, %r98, %r102, %r100; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r104, %r79, %r99; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r105, 0f41000000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p8, %r105, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r106, %r82, %r105; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r107, 0f00000000, %r106, %p8; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r108, %r107, %r104, %r99; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r109, %r81, %r103; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r110, %r104, %r104; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r111, %r96, %r110; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r112, %r107, %r111, %r109; +$L__tmp12: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] + shfl.sync.bfly.b32 %r113, %r108, 16, 31, -1; + shfl.sync.bfly.b32 %r114, %r112, 16, 31, -1; + shfl.sync.bfly.b32 %r115, %r105, 16, 31, -1; +$L__tmp13: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r116, %r113, %r108; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r117, %r105, %r115; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p9, %r117, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r118, %r115, %r117; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r119, 0f00000000, %r118, %p9; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r120, %r119, %r116, %r108; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r121, %r112, %r114; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r122, %r116, %r116; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r123, %r105, %r122; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r124, %r119, %r123, %r121; +$L__tmp14: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] + shfl.sync.bfly.b32 %r125, %r120, 8, 31, -1; + shfl.sync.bfly.b32 %r126, %r124, 8, 31, -1; + shfl.sync.bfly.b32 %r127, %r117, 8, 31, -1; +$L__tmp15: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r128, %r125, %r120; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r129, %r117, %r127; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p10, %r129, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r130, %r127, %r129; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r131, 0f00000000, %r130, %p10; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r132, %r128, %r131, %r120; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r133, %r124, %r126; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r134, %r128, %r128; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r135, %r117, %r134; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r136, %r131, %r135, %r133; +$L__tmp16: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] + shfl.sync.bfly.b32 %r137, %r132, 4, 31, -1; + shfl.sync.bfly.b32 %r138, %r136, 4, 31, -1; + shfl.sync.bfly.b32 %r139, %r129, 4, 31, -1; +$L__tmp17: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r140, %r137, %r132; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r141, %r129, %r139; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p11, %r141, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r142, %r139, %r141; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r143, 0f00000000, %r142, %p11; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r144, %r140, %r143, %r132; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r145, %r136, %r138; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r146, %r140, %r140; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r147, %r129, %r146; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r148, %r143, %r147, %r145; +$L__tmp18: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] + shfl.sync.bfly.b32 %r149, %r144, 2, 31, -1; + shfl.sync.bfly.b32 %r150, %r148, 2, 31, -1; + shfl.sync.bfly.b32 %r151, %r141, 2, 31, -1; +$L__tmp19: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r152, %r149, %r144; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r153, %r141, %r151; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p12, %r153, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r154, %r151, %r153; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r155, 0f00000000, %r154, %p12; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r156, %r152, %r155, %r144; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r157, %r148, %r150; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r158, %r152, %r152; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r159, %r141, %r158; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r160, %r155, %r159, %r157; +$L__tmp20: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] + shfl.sync.bfly.b32 %r161, %r156, 1, 31, -1; + shfl.sync.bfly.b32 %r162, %r160, 1, 31, -1; + shfl.sync.bfly.b32 %r163, %r153, 1, 31, -1; +$L__tmp21: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r164, %r161, %r156; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r11, %r153, %r163; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p13, %r11, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r165, %r163, %r11; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r166, 0f00000000, %r165, %p13; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r7, %r164, %r166, %r156; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r167, %r160, %r162; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r168, %r164, %r164; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r169, %r153, %r168; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r9, %r166, %r169, %r167; +$L__tmp22: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] + setp.eq.b32 %p2, %r84, 0; + shr.u32 %r170, %r38, 3; + and.b32 %r171, %r170, 60; + mov.b32 %r172, global_smem; + add.s32 %r6, %r172, %r171; + // begin inline asm + @%p2 st.shared.b32 [ %r6 + 0 ], %r7; + // end inline asm + add.s32 %r8, %r6, 64; + // begin inline asm + @%p2 st.shared.b32 [ %r8 + 0 ], %r9; + // end inline asm + add.s32 %r10, %r6, 128; + // begin inline asm + @%p2 st.shared.b32 [ %r10 + 0 ], %r11; + // end inline asm + bar.sync 0; + setp.lt.u32 %p3, %r83, 16; + shl.b32 %r173, %r83, 2; + add.s32 %r13, %r172, %r173; + // begin inline asm + @%p3 ld.shared.b32 %r12, [ %r13 + 0 ]; + // end inline asm + add.s32 %r15, %r13, 64; + // begin inline asm + @%p3 ld.shared.b32 %r14, [ %r15 + 0 ]; + // end inline asm + add.s32 %r17, %r13, 128; + // begin inline asm + @%p3 ld.shared.b32 %r16, [ %r17 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r174, %r12, 8, 31, -1; + shfl.sync.bfly.b32 %r175, %r14, 8, 31, -1; + shfl.sync.bfly.b32 %r176, %r16, 8, 31, -1; +$L__tmp23: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r177, %r174, %r12; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r178, %r16, %r176; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p14, %r178, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r179, %r176, %r178; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r180, 0f00000000, %r179, %p14; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r181, %r177, %r180, %r12; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r182, %r14, %r175; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r183, %r177, %r177; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r184, %r183, %r16; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r185, %r184, %r180, %r182; +$L__tmp24: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] + shfl.sync.bfly.b32 %r186, %r181, 4, 31, -1; + shfl.sync.bfly.b32 %r187, %r185, 4, 31, -1; + shfl.sync.bfly.b32 %r188, %r178, 4, 31, -1; +$L__tmp25: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r189, %r186, %r181; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r190, %r178, %r188; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p15, %r190, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r191, %r188, %r190; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r192, 0f00000000, %r191, %p15; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r193, %r189, %r192, %r181; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r194, %r185, %r187; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r195, %r189, %r189; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r196, %r178, %r195; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r197, %r192, %r196, %r194; +$L__tmp26: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] + shfl.sync.bfly.b32 %r198, %r193, 2, 31, -1; + shfl.sync.bfly.b32 %r199, %r197, 2, 31, -1; + shfl.sync.bfly.b32 %r200, %r190, 2, 31, -1; +$L__tmp27: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r201, %r198, %r193; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r202, %r190, %r200; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p16, %r202, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r203, %r200, %r202; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r204, 0f00000000, %r203, %p16; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r205, %r201, %r204, %r193; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r206, %r197, %r199; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r207, %r201, %r201; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r208, %r190, %r207; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r209, %r204, %r208, %r206; +$L__tmp28: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] + shfl.sync.bfly.b32 %r210, %r205, 1, 31, -1; + shfl.sync.bfly.b32 %r211, %r209, 1, 31, -1; + shfl.sync.bfly.b32 %r212, %r202, 1, 31, -1; +$L__tmp29: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r213, %r210, %r205; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r20, %r202, %r212; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p17, %r20, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r214, %r212, %r20; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r215, 0f00000000, %r214, %p17; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r18, %r213, %r215, %r205; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r216, %r209, %r211; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r217, %r213, %r213; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r218, %r202, %r217; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r19, %r215, %r218, %r216; +$L__tmp30: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] + and.b32 %r219, %r38, 15; + setp.eq.b32 %p18, %r219, 0; + and.pred %p4, %p3, %p18; + // begin inline asm + @%p4 st.shared.b32 [ %r13 + 0 ], %r18; + // end inline asm + // begin inline asm + @%p4 st.shared.b32 [ %r15 + 0 ], %r19; + // end inline asm + // begin inline asm + @%p4 st.shared.b32 [ %r17 + 0 ], %r20; + // end inline asm + bar.sync 0; + ld.shared.b32 %r220, [global_smem]; + ld.shared.b32 %r221, [global_smem+64]; + mov.b32 %r222, 0f45800000; +$L__tmp31: + .loc 1 65 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:65:24 + div.full.f32 %r223, %r221, %r222; + .loc 1 67 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:67:24 + add.f32 %r224, %r223, 0f358637BD; + .loc 1 68 32 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:68:32 + rsqrt.approx.ftz.f32 %r225, %r224; + .loc 1 51 43 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:51:43 + cvt.u64.u32 %rd23, %r40; + cvt.s64.s32 %rd24, %r41; + .loc 1 57 34 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:34 + mul.wide.u32 %rd25, %r40, 2; + add.s64 %rd5, %rd20, %rd25; + .loc 1 57 41 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:41 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0; + // end inline asm + mov.pred %p5, -1; + // begin inline asm + mov.u32 %r21, %r3; + mov.u32 %r22, %r3; + @%p5 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r21, %r22 }, [ %rd5 + 0 ], %rd6; + // end inline asm + .loc 1 58 42 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:42 + or.b64 %rd26, %rd23, %rd24; + .loc 1 58 35 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:35 + shl.b64 %rd27, %rd26, 1; + add.s64 %rd7, %rd19, %rd27; + .loc 1 58 52 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:52 + // begin inline asm + mov.u64 %rd8, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd8, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r23, %r3; + mov.u32 %r24, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r23, %r24 }, [ %rd7 + 0 ], %rd8; + // end inline asm + .loc 1 59 35 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:35 + add.s64 %rd9, %rd21, %rd25; + .loc 1 59 42 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:42 + // begin inline asm + mov.u64 %rd10, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd10, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r25, %r3; + mov.u32 %r26, %r3; + @%p5 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r25, %r26 }, [ %rd9 + 0 ], %rd10; + // end inline asm + .loc 1 73 29 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:73:29 + add.s64 %rd11, %rd22, %rd27; + .loc 1 57 94 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:94 + mov.b32 {%rs9, %rs10}, %r21; + cvt.f32.bf16 %r226, %rs9; + cvt.f32.bf16 %r227, %rs10; + .loc 1 58 114 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:114 + mov.b32 {%rs11, %rs12}, %r23; + cvt.f32.bf16 %r228, %rs12; + cvt.f32.bf16 %r229, %rs11; + .loc 1 59 95 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:95 + mov.b32 {%rs13, %rs14}, %r25; + cvt.f32.bf16 %r230, %rs14; + cvt.f32.bf16 %r231, %rs13; + .loc 1 61 23 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:61:23 + add.f32 %r232, %r227, 0f3F800000; + add.f32 %r233, %r226, 0f3F800000; + .loc 1 63 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:63:24 + sub.f32 %r234, %r229, %r220; + sub.f32 %r235, %r228, %r220; + .loc 1 69 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:69:24 + mul.f32 %r236, %r225, %r235; + mul.f32 %r237, %r225, %r234; + .loc 1 72 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:72:24 + fma.rn.f32 %r238, %r233, %r237, %r231; + fma.rn.f32 %r239, %r232, %r236, %r230; + .loc 1 73 53 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:73:53 + cvt.rn.bf16x2.f32 %r27, %r239, %r238; + .loc 1 57 94 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:94 + mov.b32 {%rs15, %rs16}, %r22; + cvt.f32.bf16 %r240, %rs15; + cvt.f32.bf16 %r241, %rs16; + .loc 1 58 114 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:114 + mov.b32 {%rs17, %rs18}, %r24; + cvt.f32.bf16 %r242, %rs18; + cvt.f32.bf16 %r243, %rs17; + .loc 1 59 95 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:95 + mov.b32 {%rs19, %rs20}, %r26; + cvt.f32.bf16 %r244, %rs20; + cvt.f32.bf16 %r245, %rs19; + .loc 1 61 23 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:61:23 + add.f32 %r246, %r241, 0f3F800000; + add.f32 %r247, %r240, 0f3F800000; + .loc 1 63 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:63:24 + sub.f32 %r248, %r243, %r220; + sub.f32 %r249, %r242, %r220; + .loc 1 69 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:69:24 + mul.f32 %r250, %r225, %r249; + mul.f32 %r251, %r225, %r248; + .loc 1 72 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:72:24 + fma.rn.f32 %r252, %r247, %r251, %r245; + fma.rn.f32 %r253, %r246, %r250, %r244; + .loc 1 73 53 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:73:53 + cvt.rn.bf16x2.f32 %r28, %r253, %r252; + // begin inline asm + @%p1 st.global.v2.b32 [ %rd11 + 0 ], { %r27, %r28 }; + // end inline asm + .loc 1 57 34 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:34 + add.s64 %rd12, %rd5, 4096; + .loc 1 57 41 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:41 + // begin inline asm + mov.u64 %rd13, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd13, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r29, %r3; + mov.u32 %r30, %r3; + @%p5 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r29, %r30 }, [ %rd12 + 0 ], %rd13; + // end inline asm + .loc 1 58 35 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:35 + add.s64 %rd14, %rd7, 4096; + .loc 1 58 52 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:52 + // begin inline asm + mov.u64 %rd15, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd15, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r31, %r3; + mov.u32 %r32, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r31, %r32 }, [ %rd14 + 0 ], %rd15; + // end inline asm + .loc 1 59 35 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:35 + add.s64 %rd16, %rd9, 4096; + .loc 1 59 42 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:42 + // begin inline asm + mov.u64 %rd17, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r33, %r3; + mov.u32 %r34, %r3; + @%p5 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r33, %r34 }, [ %rd16 + 0 ], %rd17; + // end inline asm + .loc 1 73 29 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:73:29 + add.s64 %rd18, %rd11, 4096; + .loc 1 57 94 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:94 + mov.b32 {%rs21, %rs22}, %r29; + cvt.f32.bf16 %r254, %rs21; + cvt.f32.bf16 %r255, %rs22; + .loc 1 58 114 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:114 + mov.b32 {%rs23, %rs24}, %r31; + cvt.f32.bf16 %r256, %rs24; + cvt.f32.bf16 %r257, %rs23; + .loc 1 59 95 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:95 + mov.b32 {%rs25, %rs26}, %r33; + cvt.f32.bf16 %r258, %rs26; + cvt.f32.bf16 %r259, %rs25; + .loc 1 61 23 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:61:23 + add.f32 %r260, %r255, 0f3F800000; + add.f32 %r261, %r254, 0f3F800000; + .loc 1 63 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:63:24 + sub.f32 %r262, %r257, %r220; + sub.f32 %r263, %r256, %r220; + .loc 1 69 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:69:24 + mul.f32 %r264, %r225, %r263; + mul.f32 %r265, %r225, %r262; + .loc 1 72 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:72:24 + fma.rn.f32 %r266, %r261, %r265, %r259; + fma.rn.f32 %r267, %r260, %r264, %r258; + .loc 1 73 53 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:73:53 + cvt.rn.bf16x2.f32 %r35, %r267, %r266; + .loc 1 57 94 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:94 + mov.b32 {%rs27, %rs28}, %r30; + cvt.f32.bf16 %r268, %rs27; + cvt.f32.bf16 %r269, %rs28; + .loc 1 58 114 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:114 + mov.b32 {%rs29, %rs30}, %r32; + cvt.f32.bf16 %r270, %rs30; + cvt.f32.bf16 %r271, %rs29; + .loc 1 59 95 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:95 + mov.b32 {%rs31, %rs32}, %r34; + cvt.f32.bf16 %r272, %rs32; + cvt.f32.bf16 %r273, %rs31; + .loc 1 61 23 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:61:23 + add.f32 %r274, %r269, 0f3F800000; + add.f32 %r275, %r268, 0f3F800000; + .loc 1 63 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:63:24 + sub.f32 %r276, %r271, %r220; + sub.f32 %r277, %r270, %r220; + .loc 1 69 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:69:24 + mul.f32 %r278, %r225, %r277; + mul.f32 %r279, %r225, %r276; + .loc 1 72 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:72:24 + fma.rn.f32 %r280, %r275, %r279, %r273; + fma.rn.f32 %r281, %r274, %r278, %r272; + .loc 1 73 53 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:73:53 + cvt.rn.bf16x2.f32 %r36, %r281, %r280; + // begin inline asm + @%p1 st.global.v2.b32 [ %rd18 + 0 ], { %r35, %r36 }; + // end inline asm + .loc 1 51 4 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:51:4 + ret; +$L__tmp32: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 367 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x168 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 97 +.b8 118 +.b8 111 +.b8 97 +.b8 122 +.b8 54 +.b8 101 +.b8 55 +.b8 107 +.b8 98 +.b8 107 +.b8 53 +.b8 119 +.b8 113 +.b8 50 +.b8 110 +.b8 55 +.b8 118 +.b8 122 +.b8 54 +.b8 114 +.b8 120 +.b8 104 +.b8 99 +.b8 114 +.b8 119 +.b8 100 +.b8 117 +.b8 50 +.b8 116 +.b8 114 +.b8 97 +.b8 122 +.b8 101 +.b8 120 +.b8 117 +.b8 98 +.b8 100 +.b8 113 +.b8 53 +.b8 113 +.b8 119 +.b8 121 +.b8 118 +.b8 50 +.b8 97 +.b8 106 +.b8 109 +.b8 98 +.b8 107 +.b8 122 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 97 +.b8 118 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x2f DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 100 +.b8 100 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 110 +.b8 97 +.b8 116 +.b8 105 +.b8 118 +.b8 101 +.b8 95 +.b8 108 +.b8 97 +.b8 121 +.b8 101 +.b8 114 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x113:0x5f DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x128:0x18 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp10 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 42 // DW_AT_call_line +.b8 51 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x140:0x31 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp11 // DW_AT_low_pc +.b64 $L__tmp31 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 47 // DW_AT_call_line +.b8 79 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x158:0x18 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp11 // DW_AT_low_pc +.b64 $L__tmp30 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 243 // DW_AT_call_line +.b8 46 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.source b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.source new file mode 100644 index 0000000000000000000000000000000000000000..d1abbaf09e8deef29466e387f45a33e0f7e1eb3b --- /dev/null +++ b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.source @@ -0,0 +1,420 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":18:0) +#loc72 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":216:0) +#loc85 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":133:0) +#loc89 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":242:0) +#loc91 = loc(unknown) +#loc94 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":230:0) +#loc109 = loc("in_ptr0"(#loc)) +#loc110 = loc("in_ptr1"(#loc)) +#loc111 = loc("in_ptr2"(#loc)) +#loc112 = loc("out_ptr2"(#loc)) +#loc113 = loc("xnumel"(#loc)) +#loc114 = loc("r0_numel"(#loc)) +#loc171 = loc("value"(#loc72)) +#loc172 = loc("mean"(#loc72)) +#loc173 = loc("m2"(#loc72)) +#loc174 = loc("weight"(#loc72)) +#loc175 = loc("first_iteration"(#loc72)) +#loc185 = loc("input"(#loc85)) +#loc186 = loc("mean"(#loc89)) +#loc187 = loc("m2"(#loc89)) +#loc188 = loc("weight"(#loc89)) +#loc189 = loc("mean_1"(#loc94)) +#loc190 = loc("m2_1"(#loc94)) +#loc191 = loc("weight_1"(#loc94)) +#loc192 = loc("mean_2"(#loc94)) +#loc193 = loc("m2_2"(#loc94)) +#loc194 = loc("weight_2"(#loc94)) +#loc201 = loc("new_mean"(#loc171)) +module { + tt.func public @triton_red_fused_add_mul_native_layer_norm_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 256 : i32 loc(#loc115) + %r0_numel_1 = arith.constant 4096 : i32 loc(#loc116) + %xoffset = tt.get_program_id x : i32 loc(#loc117) + %xoffset_2 = arith.constant 1 : i32 loc(#loc118) + %xoffset_3 = arith.constant 1 : i32 loc(#loc118) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc118) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc119) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc120) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc121) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc121) + %xmask = arith.constant dense<256> : tensor<1x1xi32> loc(#loc122) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc122) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc123) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc124) + %tmp3_mean = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc125) + %tmp3_m2 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc126) + %tmp3_weight = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc127) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %tmp3_weight_10:3 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%tmp3_mean_13 = %tmp3_mean, %tmp3_m2_14 = %tmp3_m2, %tmp3_weight_15 = %tmp3_weight) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc129) + %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc129) + %r0_mask = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc130) + %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x2048xi32> loc(#loc130) + %tmp0 = arith.constant 4096 : i32 loc(#loc131) + %tmp0_18 = arith.constant 4096 : i32 loc(#loc131) + %tmp0_19 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc131) + %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc131) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc132) + %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x2048xi32> loc(#loc132) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc133) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc133) + %tmp0_25 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc134) + %tmp0_26 = arith.andi %r0_mask_17, %tmp0_25 : tensor<1x2048xi1> loc(#loc134) + %tmp0_27 = arith.constant 0.000000e+00 : f32 loc(#loc135) + %tmp0_28 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc135) + %tmp0_29 = arith.truncf %tmp0_28 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc135) + %tmp0_30 = tt.load %tmp0_24, %tmp0_26, %tmp0_29 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc135) + %tmp0_31 = arith.extf %tmp0_30 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc136) + %c0_i32_32 = arith.constant 0 : i32 loc(#loc23) + %9 = arith.cmpi eq, %r0_offset, %c0_i32_32 : i32 loc(#loc23) + %10:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_u1__(%tmp0_31, %tmp3_mean_13, %tmp3_m2_14, %tmp3_weight_15, %9) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>, i1) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) loc(#loc24) + %tmp3_mean_33 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc137) + %tmp3_mean_34 = arith.andi %r0_mask_17, %tmp3_mean_33 : tensor<1x2048xi1> loc(#loc137) + %tmp3_mean_35 = arith.select %tmp3_mean_34, %10#0, %tmp3_mean_13 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc138) + %tmp3_m2_36 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc139) + %tmp3_m2_37 = arith.andi %r0_mask_17, %tmp3_m2_36 : tensor<1x2048xi1> loc(#loc139) + %tmp3_m2_38 = arith.select %tmp3_m2_37, %10#1, %tmp3_m2_14 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc140) + %tmp3_weight_39 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc141) + %tmp3_weight_40 = arith.andi %r0_mask_17, %tmp3_weight_39 : tensor<1x2048xi1> loc(#loc141) + %tmp3_weight_41 = arith.select %tmp3_weight_40, %10#2, %tmp3_weight_15 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc142) + scf.yield %tmp3_mean_35, %tmp3_m2_38, %tmp3_weight_41 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc31) + } loc(#loc207) + %4:3 = tt.call @"torch._inductor.runtime.triton_helpers.welford__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_1_"(%tmp3_weight_10#0, %tmp3_weight_10#1, %tmp3_weight_10#2) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc32) + %tmp3 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc143) + %tmp7 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc144) + %tmp8 = tt.expand_dims %4#2 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc145) + %c0_i32_11 = arith.constant 0 : i32 loc(#loc36) + %c2048_i32_12 = arith.constant 2048 : i32 loc(#loc36) + %5 = arith.bitcast %c0_i32_11 : i32 to i32 loc(#loc36) + %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc36) + %7 = arith.bitcast %c2048_i32_12 : i32 to i32 loc(#loc36) + %8 = ub.poison : i32 loc(#loc36) + scf.for %r0_offset = %5 to %6 step %7 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc146) + %r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc146) + %r0_mask = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc147) + %r0_mask_14 = arith.cmpi slt, %r0_index_13, %r0_mask : tensor<1x2048xi32> loc(#loc147) + %tmp9 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc148) + %tmp9_15 = tt.addptr %tmp9, %r0_index_13 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc148) + %tmp9_16 = arith.constant 0.000000e+00 : f32 loc(#loc149) + %tmp9_17 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc149) + %tmp9_18 = arith.truncf %tmp9_17 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc149) + %tmp9_19 = tt.load %tmp9_15, %r0_mask_14, %tmp9_18 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc149) + %tmp9_20 = arith.extf %tmp9_19 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc150) + %tmp12 = arith.constant 4096 : i32 loc(#loc151) + %tmp12_21 = arith.constant 4096 : i32 loc(#loc151) + %tmp12_22 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc151) + %tmp12_23 = arith.muli %tmp12_22, %xindex_7 : tensor<1x1xi32> loc(#loc151) + %tmp12_24 = tt.broadcast %tmp12_23 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc152) + %tmp12_25 = arith.addi %r0_index_13, %tmp12_24 : tensor<1x2048xi32> loc(#loc152) + %tmp12_26 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc153) + %tmp12_27 = tt.addptr %tmp12_26, %tmp12_25 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc153) + %tmp12_28 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc154) + %tmp12_29 = arith.andi %r0_mask_14, %tmp12_28 : tensor<1x2048xi1> loc(#loc154) + %tmp12_30 = arith.constant 0.000000e+00 : f32 loc(#loc155) + %tmp12_31 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc155) + %tmp12_32 = arith.truncf %tmp12_31 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc155) + %tmp12_33 = tt.load %tmp12_27, %tmp12_29, %tmp12_32 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc155) + %tmp12_34 = arith.extf %tmp12_33 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc156) + %tmp23 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc157) + %tmp23_35 = tt.addptr %tmp23, %r0_index_13 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc157) + %tmp23_36 = arith.constant 0.000000e+00 : f32 loc(#loc158) + %tmp23_37 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc158) + %tmp23_38 = arith.truncf %tmp23_37 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc158) + %tmp23_39 = tt.load %tmp23_35, %r0_mask_14, %tmp23_38 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc158) + %tmp23_40 = arith.extf %tmp23_39 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc159) + %tmp10 = arith.constant 1.000000e+00 : f32 loc(#loc160) + %tmp11 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc161) + %tmp11_41 = arith.addf %tmp9_20, %tmp11 : tensor<1x2048xf32> loc(#loc161) + %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc162) + %tmp14_42 = arith.subf %tmp12_34, %tmp14 : tensor<1x2048xf32> loc(#loc162) + %tmp15 = arith.constant 4.096000e+03 : f32 loc(#loc163) + %tmp16 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc164) + %tmp16_43 = arith.divf %tmp7, %tmp16 : tensor<1x1xf32> loc(#loc164) + %tmp17 = arith.constant 9.99999997E-7 : f32 loc(#loc165) + %tmp18 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc166) + %tmp18_44 = arith.addf %tmp16_43, %tmp18 : tensor<1x1xf32> loc(#loc166) + %tmp19 = tt.extern_elementwise %tmp18_44 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc167) + %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc168) + %tmp20_45 = arith.mulf %tmp14_42, %tmp20 : tensor<1x2048xf32> loc(#loc168) + %tmp22 = arith.mulf %tmp11_41, %tmp20_45 : tensor<1x2048xf32> loc(#loc169) + %tmp24 = arith.addf %tmp22, %tmp23_40 : tensor<1x2048xf32> loc(#loc170) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc62) + %c4096_i32_46 = arith.constant 4096 : i32 loc(#loc62) + %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc62) + %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc62) + %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc63) + %11 = arith.addi %r0_index_13, %10 : tensor<1x2048xi32> loc(#loc63) + %12 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc64) + %13 = tt.addptr %12, %11 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc64) + %14 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc65) + %15 = arith.andi %r0_mask_14, %14 : tensor<1x2048xi1> loc(#loc65) + %16 = arith.truncf %tmp24 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc66) + tt.store %13, %16, %15 : tensor<1x2048x!tt.ptr> loc(#loc66) + } loc(#loc36) + tt.return loc(#loc67) + } loc(#loc) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() -> tensor<1x2048xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc69) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc69) + tt.return %cst_0 : tensor<1x2048xf32> loc(#loc70) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x2048xf32> loc(#loc71) + tt.return %0 : tensor<1x2048xf32> loc(#loc71) + } loc(#loc68) + tt.func private @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_u1__(%new_mean: tensor<1x2048xf32> loc("new_mean"(#loc171)), %mean: tensor<1x2048xf32> loc("mean"(#loc72)), %m2: tensor<1x2048xf32> loc("m2"(#loc72)), %weight: tensor<1x2048xf32> loc("weight"(#loc72)), %first_iteration: i1 loc("first_iteration"(#loc72))) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) attributes {noinline = false} { + %0:3 = scf.if %first_iteration -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) { + %new_weight = arith.constant 1.000000e+00 : f32 loc(#loc176) + %new_weight_0 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc202) + %new_m2 = tt.call @triton.language.standard.zeros_like__fp32S1_2048S__(%m2) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc203) + scf.yield %new_m2, %new_mean, %new_weight_0 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc203) + } else { + %delta = arith.subf %new_mean, %mean : tensor<1x2048xf32> loc(#loc178) + %new_weight = arith.constant 1 : i32 loc(#loc179) + %new_weight_0 = arith.constant 1.000000e+00 : f32 loc(#loc179) + %new_weight_1 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc179) + %new_weight_2 = arith.addf %weight, %new_weight_1 : tensor<1x2048xf32> loc(#loc204) + %new_mean_3 = arith.divf %delta, %new_weight_2 : tensor<1x2048xf32> loc(#loc180) + %new_mean_4 = arith.addf %mean, %new_mean_3 : tensor<1x2048xf32> loc(#loc205) + %new_m2 = arith.subf %new_mean, %new_mean_4 : tensor<1x2048xf32> loc(#loc182) + %new_m2_5 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32> loc(#loc183) + %new_m2_6 = arith.addf %m2, %new_m2_5 : tensor<1x2048xf32> loc(#loc206) + scf.yield %new_m2_6, %new_mean_4, %new_weight_2 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc184) + } loc(#loc73) + tt.return %0#1, %0#0, %0#2 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc83) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x2048xf32> loc(#loc84) + %2 = ub.poison : tensor<1x2048xf32> loc(#loc84) + %3 = ub.poison : tensor<1x2048xf32> loc(#loc84) + tt.return %1, %2, %3 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc84) + } loc(#loc72) + tt.func private @triton.language.standard.zeros_like__fp32S1_2048S__(%input: tensor<1x2048xf32> loc("input"(#loc85))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc86) + tt.return %0 : tensor<1x2048xf32> loc(#loc87) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x2048xf32> loc(#loc88) + tt.return %1 : tensor<1x2048xf32> loc(#loc88) + } loc(#loc85) + tt.func private @"torch._inductor.runtime.triton_helpers.welford__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_1_"(%mean: tensor<1x2048xf32> loc("mean"(#loc89)), %m2: tensor<1x2048xf32> loc("m2"(#loc89)), %weight: tensor<1x2048xf32> loc("weight"(#loc89))) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} { + %0:3 = "tt.reduce"(%mean, %m2, %weight) <{axis = 1 : i32}> ({ + ^bb0(%arg3: f32 loc(unknown), %arg4: f32 loc(unknown), %arg5: f32 loc(unknown), %arg6: f32 loc(unknown), %arg7: f32 loc(unknown), %arg8: f32 loc(unknown)): + %4:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (f32, f32, f32, f32, f32, f32) -> (f32, f32, f32) loc(#loc90) + tt.reduce.return %4#0, %4#1, %4#2 : f32, f32, f32 loc(#loc90) + }) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc90) + tt.return %0#0, %0#1, %0#2 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc92) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc93) + %2 = ub.poison : tensor<1xf32> loc(#loc93) + %3 = ub.poison : tensor<1xf32> loc(#loc93) + tt.return %1, %2, %3 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc93) + } loc(#loc89) + tt.func private @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%mean_1: f32 loc("mean_1"(#loc94)), %m2_1: f32 loc("m2_1"(#loc94)), %weight_1: f32 loc("weight_1"(#loc94)), %mean_2: f32 loc("mean_2"(#loc94)), %m2_2: f32 loc("m2_2"(#loc94)), %weight_2: f32 loc("weight_2"(#loc94))) -> (f32, f32, f32) attributes {noinline = false} { + %delta = arith.subf %mean_2, %mean_1 : f32 loc(#loc195) + %new_weight = arith.addf %weight_1, %weight_2 : f32 loc(#loc196) + %w2_over_w = arith.constant 0.000000e+00 : f32 loc(#loc197) + %w2_over_w_0 = arith.cmpf oeq, %new_weight, %w2_over_w : f32 loc(#loc197) + %w2_over_w_1 = arith.divf %weight_2, %new_weight : f32 loc(#loc198) + %w2_over_w_2 = arith.constant 0.000000e+00 : f32 loc(#loc199) + %w2_over_w_3 = arith.constant 0.000000e+00 : f32 loc(#loc199) + %w2_over_w_4 = arith.select %w2_over_w_0, %w2_over_w_3, %w2_over_w_1 : f32 loc(#loc199) + %0 = arith.mulf %delta, %w2_over_w_4 : f32 loc(#loc100) + %1 = arith.addf %mean_1, %0 : f32 loc(#loc101) + %2 = arith.addf %m2_1, %m2_2 : f32 loc(#loc102) + %3 = arith.mulf %delta, %delta : f32 loc(#loc103) + %4 = arith.mulf %3, %weight_1 : f32 loc(#loc104) + %5 = arith.mulf %4, %w2_over_w_4 : f32 loc(#loc105) + %6 = arith.addf %2, %5 : f32 loc(#loc106) + tt.return %1, %6, %new_weight : f32, f32, f32 loc(#loc107) + ^bb1: // no predecessors + %7 = ub.poison : f32 loc(#loc108) + %8 = ub.poison : f32 loc(#loc108) + %9 = ub.poison : f32 loc(#loc108) + tt.return %7, %8, %9 : f32, f32, f32 loc(#loc108) + } loc(#loc94) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":25:21) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":29:45) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":30:43) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":31:47) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:46) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:34) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:61) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:51) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:112) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":42:62) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":42:51) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":44:39) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":44:62) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":45:37) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":45:58) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":46:41) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":46:66) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":46:8) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":47:79) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":48:16) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":49:16) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":50:16) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":51:43) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":52:31) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":53:29) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:34) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:41) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:94) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:47) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:42) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:35) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:62) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:52) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:114) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:35) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:42) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:95) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":60:16) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":61:23) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":63:24) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":64:16) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":65:24) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":66:16) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":67:24) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":68:32) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":69:24) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":71:24) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":72:24) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:41) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:36) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:29) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:63) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:53) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":51:4) +#loc68 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":120:0) +#loc69 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:31) +#loc70 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:11) +#loc71 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:4) +#loc73 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7) +#loc74 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":218:46) +#loc75 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31) +#loc76 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24) +#loc77 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30) +#loc78 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34) +#loc79 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26) +#loc80 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39) +#loc81 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31) +#loc82 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22) +#loc83 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:11) +#loc84 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:4) +#loc86 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:30) +#loc87 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:11) +#loc88 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:4) +#loc90 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc92 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:11) +#loc93 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:4) +#loc95 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc96 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc97 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc98 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc99 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc100 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc101 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc102 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc103 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc104 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc105 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc106 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc107 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:11) +#loc108 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:4) +#loc115 = loc("xnumel"(#loc1)) +#loc116 = loc("r0_numel"(#loc2)) +#loc117 = loc("xoffset"(#loc3)) +#loc118 = loc("xoffset"(#loc4)) +#loc119 = loc("xindex"(#loc5)) +#loc120 = loc("xindex"(#loc6)) +#loc121 = loc("xindex"(#loc7)) +#loc122 = loc("xmask"(#loc8)) +#loc123 = loc("r0_base"(#loc9)) +#loc124 = loc("r0_base"(#loc10)) +#loc125 = loc("tmp3_mean"(#loc11)) +#loc126 = loc("tmp3_m2"(#loc12)) +#loc127 = loc("tmp3_weight"(#loc13)) +#loc128 = loc("tmp3_mean"(#loc14)) +#loc129 = loc("r0_index"(#loc15)) +#loc130 = loc("r0_mask"(#loc16)) +#loc131 = loc("tmp0"(#loc17)) +#loc132 = loc("tmp0"(#loc18)) +#loc133 = loc("tmp0"(#loc19)) +#loc134 = loc("tmp0"(#loc20)) +#loc135 = loc("tmp0"(#loc21)) +#loc136 = loc("tmp0"(#loc22)) +#loc137 = loc("tmp3_mean"(#loc25)) +#loc138 = loc("tmp3_mean"(#loc26)) +#loc139 = loc("tmp3_m2"(#loc27)) +#loc140 = loc("tmp3_m2"(#loc28)) +#loc141 = loc("tmp3_weight"(#loc29)) +#loc142 = loc("tmp3_weight"(#loc30)) +#loc143 = loc("tmp3"(#loc33)) +#loc144 = loc("tmp7"(#loc34)) +#loc145 = loc("tmp8"(#loc35)) +#loc146 = loc("r0_index"(#loc37)) +#loc147 = loc("r0_mask"(#loc38)) +#loc148 = loc("tmp9"(#loc39)) +#loc149 = loc("tmp9"(#loc40)) +#loc150 = loc("tmp9"(#loc41)) +#loc151 = loc("tmp12"(#loc42)) +#loc152 = loc("tmp12"(#loc43)) +#loc153 = loc("tmp12"(#loc44)) +#loc154 = loc("tmp12"(#loc45)) +#loc155 = loc("tmp12"(#loc46)) +#loc156 = loc("tmp12"(#loc47)) +#loc157 = loc("tmp23"(#loc48)) +#loc158 = loc("tmp23"(#loc49)) +#loc159 = loc("tmp23"(#loc50)) +#loc160 = loc("tmp10"(#loc51)) +#loc161 = loc("tmp11"(#loc52)) +#loc162 = loc("tmp14"(#loc53)) +#loc163 = loc("tmp15"(#loc54)) +#loc164 = loc("tmp16"(#loc55)) +#loc165 = loc("tmp17"(#loc56)) +#loc166 = loc("tmp18"(#loc57)) +#loc167 = loc("tmp19"(#loc58)) +#loc168 = loc("tmp20"(#loc59)) +#loc169 = loc("tmp22"(#loc60)) +#loc170 = loc("tmp24"(#loc61)) +#loc176 = loc("new_weight"(#loc74)) +#loc177 = loc("new_m2"(#loc75)) +#loc178 = loc("delta"(#loc76)) +#loc179 = loc("new_weight"(#loc77)) +#loc180 = loc("new_mean"(#loc78)) +#loc181 = loc("new_mean"(#loc79)) +#loc182 = loc("new_m2"(#loc80)) +#loc183 = loc("new_m2"(#loc81)) +#loc184 = loc("new_m2"(#loc82)) +#loc195 = loc("delta"(#loc95)) +#loc196 = loc("new_weight"(#loc96)) +#loc197 = loc("w2_over_w"(#loc97)) +#loc198 = loc("w2_over_w"(#loc98)) +#loc199 = loc("w2_over_w"(#loc99)) +#loc200 = loc("tmp3_m2"(#loc128)) +#loc202 = loc("new_weight"(#loc176)) +#loc203 = loc("new_m2"(#loc177)) +#loc204 = loc("new_weight"(#loc179)) +#loc205 = loc("new_mean"(#loc181)) +#loc206 = loc("new_m2"(#loc184)) +#loc207 = loc("tmp3_weight"(#loc200)) diff --git a/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.ttgir b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..e83df533d7c29132d181b171d0529d27df449610 --- /dev/null +++ b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.ttgir @@ -0,0 +1,261 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":18:0) +#loc1 = loc(unknown) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":47:79) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("in_ptr1"(#loc)) +#loc72 = loc("in_ptr2"(#loc)) +#loc73 = loc("out_ptr2"(#loc)) +#loc74 = loc("xnumel"(#loc)) +#loc75 = loc("r0_numel"(#loc)) +#loc101 = loc(callsite(#loc1 at #loc30)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_add_mul_native_layer_norm_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4096> : tensor<1x2048xi32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked> loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c256_i32 = arith.constant 256 : i32 loc(#loc1) + %cst_1 = arith.constant 0.000000e+00 : f32 loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<4.096000e+03> : tensor<1x1xf32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc76) + %xmask = arith.cmpi slt, %xoffset, %c256_i32 : i32 loc(#loc77) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc78) + %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc78) + %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc79) + %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc130) + %tmp0_8 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc81) + %tmp0_9 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc131) + %tmp3_weight:3 = scf.for %tmp3_weight_10 = %c0_i32 to %c4096_i32 step %c2048_i32 iter_args(%arg7 = %cst_2, %arg8 = %cst_2, %arg9 = %cst_2) -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) : i32 { + %r0_index = tt.splat %tmp3_weight_10 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc84) + %r0_index_11 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32, #blocked> loc(#loc84) + %r0_mask = arith.cmpi slt, %r0_index_11, %cst : tensor<1x2048xi32, #blocked> loc(#loc85) + %tmp0_12 = arith.addi %r0_index_11, %tmp0_7 : tensor<1x2048xi32, #blocked> loc(#loc80) + %tmp0_13 = tt.addptr %tmp0_8, %tmp0_12 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc81) + %tmp0_14 = arith.andi %r0_mask, %tmp0_9 : tensor<1x2048xi1, #blocked> loc(#loc82) + %tmp0_15 = tt.load %tmp0_13, %tmp0_14, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc86) + %tmp0_16 = arith.extf %tmp0_15 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc87) + %2 = arith.cmpi eq, %tmp3_weight_10, %c0_i32 : i32 loc(#loc14) + %3:3 = scf.if %2 -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) { + scf.yield %cst_2, %tmp0_16, %cst_5 : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc155) + } else { + %delta = arith.subf %tmp0_16, %arg7 : tensor<1x2048xf32, #blocked> loc(#loc134) + %new_weight = arith.addf %arg9, %cst_5 : tensor<1x2048xf32, #blocked> loc(#loc156) + %new_mean = arith.divf %delta, %new_weight : tensor<1x2048xf32, #blocked> loc(#loc136) + %new_mean_18 = arith.addf %arg7, %new_mean : tensor<1x2048xf32, #blocked> loc(#loc157) + %new_m2 = arith.subf %tmp0_16, %new_mean_18 : tensor<1x2048xf32, #blocked> loc(#loc138) + %new_m2_19 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32, #blocked> loc(#loc139) + %new_m2_20 = arith.addf %arg8, %new_m2_19 : tensor<1x2048xf32, #blocked> loc(#loc158) + scf.yield %new_m2_20, %new_mean_18, %new_weight : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc141) + } loc(#loc88) + %tmp3_mean = arith.select %tmp0_14, %3#1, %arg7 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc97) + %tmp3_m2 = arith.select %tmp0_14, %3#0, %arg8 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc98) + %tmp3_weight_17 = arith.select %tmp0_14, %3#2, %arg9 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc99) + scf.yield %tmp3_mean, %tmp3_m2, %tmp3_weight_17 : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc28) + } loc(#loc154) + %0:3 = "tt.reduce"(%tmp3_weight#0, %tmp3_weight#1, %tmp3_weight#2) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc30)), %arg7: f32 loc(callsite(#loc1 at #loc30)), %arg8: f32 loc(callsite(#loc1 at #loc30)), %arg9: f32 loc(callsite(#loc1 at #loc30)), %arg10: f32 loc(callsite(#loc1 at #loc30)), %arg11: f32 loc(callsite(#loc1 at #loc30))): + %delta = arith.subf %arg9, %arg6 : f32 loc(#loc142) + %new_weight = arith.addf %arg8, %arg11 : f32 loc(#loc143) + %w2_over_w = arith.cmpf oeq, %new_weight, %cst_1 : f32 loc(#loc144) + %w2_over_w_10 = arith.divf %arg11, %new_weight : f32 loc(#loc145) + %w2_over_w_11 = arith.select %w2_over_w, %cst_1, %w2_over_w_10 : f32 loc(#loc146) + %2 = arith.mulf %delta, %w2_over_w_11 : f32 loc(#loc147) + %3 = arith.addf %arg6, %2 : f32 loc(#loc148) + %4 = arith.addf %arg7, %arg10 : f32 loc(#loc149) + %5 = arith.mulf %delta, %delta : f32 loc(#loc150) + %6 = arith.mulf %5, %arg8 : f32 loc(#loc151) + %7 = arith.mulf %6, %w2_over_w_11 : f32 loc(#loc152) + %8 = arith.addf %4, %7 : f32 loc(#loc153) + tt.reduce.return %3, %8, %new_weight : f32, f32, f32 loc(#loc100) + }) : (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc100) + %tmp3 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc107) + %tmp7 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc108) + %tmp9 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc109) + %tmp23 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc110) + %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc111) + %tmp16 = arith.divf %tmp7, %cst_4 : tensor<1x1xf32, #blocked> loc(#loc112) + %tmp18 = arith.addf %tmp16, %cst_3 : tensor<1x1xf32, #blocked> loc(#loc113) + %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked>) -> tensor<1x1xf32, #blocked> loc(#loc114) + %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc115) + %1 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc52) + scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32, #blocked> loc(#loc116) + %r0_index_10 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32, #blocked> loc(#loc116) + %r0_mask = arith.cmpi slt, %r0_index_10, %cst : tensor<1x2048xi32, #blocked> loc(#loc117) + %tmp9_11 = tt.addptr %tmp9, %r0_index_10 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc109) + %tmp9_12 = tt.load %tmp9_11, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc118) + %tmp9_13 = arith.extf %tmp9_12 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc119) + %tmp12 = arith.addi %r0_index_10, %tmp0_7 : tensor<1x2048xi32, #blocked> loc(#loc120) + %tmp12_14 = tt.addptr %tmp0_8, %tmp12 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc121) + %tmp12_15 = arith.andi %r0_mask, %tmp0_9 : tensor<1x2048xi1, #blocked> loc(#loc122) + %tmp12_16 = tt.load %tmp12_14, %tmp12_15, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc123) + %tmp12_17 = arith.extf %tmp12_16 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc124) + %tmp23_18 = tt.addptr %tmp23, %r0_index_10 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc110) + %tmp23_19 = tt.load %tmp23_18, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc125) + %tmp23_20 = arith.extf %tmp23_19 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc126) + %tmp11 = arith.addf %tmp9_13, %cst_5 : tensor<1x2048xf32, #blocked> loc(#loc127) + %tmp14_21 = arith.subf %tmp12_17, %tmp14 : tensor<1x2048xf32, #blocked> loc(#loc111) + %tmp20_22 = arith.mulf %tmp14_21, %tmp20 : tensor<1x2048xf32, #blocked> loc(#loc115) + %tmp22 = arith.mulf %tmp11, %tmp20_22 : tensor<1x2048xf32, #blocked> loc(#loc128) + %tmp24 = arith.addf %tmp22, %tmp23_20 : tensor<1x2048xf32, #blocked> loc(#loc129) + %2 = tt.addptr %1, %tmp12 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc52) + %3 = arith.truncf %tmp24 : tensor<1x2048xf32, #blocked> to tensor<1x2048xbf16, #blocked> loc(#loc68) + tt.store %2, %3, %tmp12_15 : tensor<1x2048x!tt.ptr, #blocked> loc(#loc68) + } loc(#loc53) + tt.return loc(#loc69) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":25:21) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":26:37) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:46) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:41) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:34) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:61) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":32:43) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":33:31) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":34:29) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:51) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:112) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":42:62) +#loc15 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":42:51) +#loc17 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31) +#loc18 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24) +#loc19 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34) +#loc21 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39) +#loc23 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":44:62) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":45:58) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":46:66) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":46:8) +#loc29 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc35 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc37 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc38 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc39 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc40 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc41 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc42 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":48:16) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":49:16) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:34) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:35) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":63:24) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":65:24) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":67:24) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":68:32) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":69:24) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:29) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":51:43) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":52:31) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":53:29) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:41) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:94) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:42) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:35) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:62) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:52) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:114) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:42) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:95) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":61:23) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":71:24) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":72:24) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:53) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":51:4) +#loc76 = loc("xoffset"(#loc2)) +#loc77 = loc("xmask"(#loc3)) +#loc78 = loc("r0_base"(#loc4)) +#loc79 = loc("tmp0"(#loc5)) +#loc80 = loc("tmp0"(#loc6)) +#loc81 = loc("tmp0"(#loc7)) +#loc82 = loc("tmp0"(#loc8)) +#loc83 = loc("tmp3_mean"(#loc9)) +#loc84 = loc("r0_index"(#loc10)) +#loc85 = loc("r0_mask"(#loc11)) +#loc86 = loc("tmp0"(#loc12)) +#loc87 = loc("tmp0"(#loc13)) +#loc88 = loc(callsite(#loc15 at #loc16)) +#loc89 = loc("new_m2"(#loc17)) +#loc90 = loc("delta"(#loc18)) +#loc91 = loc("new_weight"(#loc19)) +#loc92 = loc("new_mean"(#loc20)) +#loc93 = loc("new_mean"(#loc21)) +#loc94 = loc("new_m2"(#loc22)) +#loc95 = loc("new_m2"(#loc23)) +#loc96 = loc("new_m2"(#loc24)) +#loc97 = loc("tmp3_mean"(#loc25)) +#loc98 = loc("tmp3_m2"(#loc26)) +#loc99 = loc("tmp3_weight"(#loc27)) +#loc100 = loc(callsite(#loc29 at #loc30)) +#loc102 = loc("delta"(#loc31)) +#loc103 = loc("new_weight"(#loc32)) +#loc104 = loc("w2_over_w"(#loc33)) +#loc105 = loc("w2_over_w"(#loc34)) +#loc106 = loc("w2_over_w"(#loc35)) +#loc107 = loc("tmp3"(#loc43)) +#loc108 = loc("tmp7"(#loc44)) +#loc109 = loc("tmp9"(#loc45)) +#loc110 = loc("tmp23"(#loc46)) +#loc111 = loc("tmp14"(#loc47)) +#loc112 = loc("tmp16"(#loc48)) +#loc113 = loc("tmp18"(#loc49)) +#loc114 = loc("tmp19"(#loc50)) +#loc115 = loc("tmp20"(#loc51)) +#loc116 = loc("r0_index"(#loc54)) +#loc117 = loc("r0_mask"(#loc55)) +#loc118 = loc("tmp9"(#loc56)) +#loc119 = loc("tmp9"(#loc57)) +#loc120 = loc("tmp12"(#loc58)) +#loc121 = loc("tmp12"(#loc59)) +#loc122 = loc("tmp12"(#loc60)) +#loc123 = loc("tmp12"(#loc61)) +#loc124 = loc("tmp12"(#loc62)) +#loc125 = loc("tmp23"(#loc63)) +#loc126 = loc("tmp23"(#loc64)) +#loc127 = loc("tmp11"(#loc65)) +#loc128 = loc("tmp22"(#loc66)) +#loc129 = loc("tmp24"(#loc67)) +#loc130 = loc(fused[#loc80, #loc79]) +#loc131 = loc(fused[#loc82, #loc77]) +#loc132 = loc("tmp3_m2"(#loc83)) +#loc133 = loc("new_m2"(#loc89)) +#loc134 = loc(callsite(#loc90 at #loc16)) +#loc135 = loc("new_weight"(#loc91)) +#loc136 = loc(callsite(#loc92 at #loc16)) +#loc137 = loc("new_mean"(#loc93)) +#loc138 = loc(callsite(#loc94 at #loc16)) +#loc139 = loc(callsite(#loc95 at #loc16)) +#loc140 = loc("new_m2"(#loc96)) +#loc141 = loc(callsite(#loc96 at #loc16)) +#loc142 = loc(callsite(#loc102 at #loc100)) +#loc143 = loc(callsite(#loc103 at #loc100)) +#loc144 = loc(callsite(#loc104 at #loc100)) +#loc145 = loc(callsite(#loc105 at #loc100)) +#loc146 = loc(callsite(#loc106 at #loc100)) +#loc147 = loc(callsite(#loc36 at #loc100)) +#loc148 = loc(callsite(#loc37 at #loc100)) +#loc149 = loc(callsite(#loc38 at #loc100)) +#loc150 = loc(callsite(#loc39 at #loc100)) +#loc151 = loc(callsite(#loc40 at #loc100)) +#loc152 = loc(callsite(#loc41 at #loc100)) +#loc153 = loc(callsite(#loc42 at #loc100)) +#loc154 = loc("tmp3_weight"(#loc132)) +#loc155 = loc(callsite(#loc133 at #loc16)) +#loc156 = loc(callsite(#loc135 at #loc16)) +#loc157 = loc(callsite(#loc137 at #loc16)) +#loc158 = loc(callsite(#loc140 at #loc16)) diff --git a/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.ttir b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..e722b4e9c4228271a37fc17a289b6f1fb69c5e8a --- /dev/null +++ b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.ttir @@ -0,0 +1,270 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":18:0) +#loc2 = loc(unknown) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":47:79) +#loc72 = loc("in_ptr0"(#loc)) +#loc73 = loc("in_ptr1"(#loc)) +#loc74 = loc("in_ptr2"(#loc)) +#loc75 = loc("out_ptr2"(#loc)) +#loc76 = loc("xnumel"(#loc)) +#loc77 = loc("r0_numel"(#loc)) +#loc79 = loc(callsite(#loc2 at #loc3)) +module { + tt.func public @triton_red_fused_add_mul_native_layer_norm_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xmask = arith.constant 256 : i32 loc(#loc78) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc79) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc2) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16> loc(#loc2) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc2) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc2) + %cst_2 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc2) + %cst_3 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc2) + %cst_4 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc2) + %cst_5 = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %xoffset = tt.get_program_id x : i32 loc(#loc80) + %xmask_6 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc78) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc81) + %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc82) + %tmp3_weight:3 = scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32 iter_args(%tmp3_mean = %cst_0, %tmp3_m2 = %cst_0, %tmp3_weight_8 = %cst_0) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc84) + %r0_index_9 = arith.addi %r0_index, %r0_base_7 : tensor<1x2048xi32> loc(#loc84) + %r0_mask = arith.cmpi slt, %r0_index_9, %cst_5 : tensor<1x2048xi32> loc(#loc85) + %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc86) + %tmp0_10 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32> loc(#loc135) + %tmp0_11 = arith.addi %r0_index_9, %tmp0_10 : tensor<1x2048xi32> loc(#loc87) + %tmp0_12 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc88) + %tmp0_13 = tt.addptr %tmp0_12, %tmp0_11 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc88) + %tmp0_14 = tt.splat %xmask_6 : i1 -> tensor<1x2048xi1> loc(#loc136) + %tmp0_15 = arith.andi %r0_mask, %tmp0_14 : tensor<1x2048xi1> loc(#loc89) + %tmp0_16 = tt.load %tmp0_13, %tmp0_15, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc90) + %tmp0_17 = arith.extf %tmp0_16 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc91) + %1 = arith.cmpi eq, %r0_offset, %c0_i32 : i32 loc(#loc16) + %2:3 = scf.if %1 -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) { + scf.yield %cst_0, %tmp0_17, %cst_4 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc161) + } else { + %delta = arith.subf %tmp0_17, %tmp3_mean : tensor<1x2048xf32> loc(#loc138) + %new_weight = arith.addf %tmp3_weight_8, %cst_4 : tensor<1x2048xf32> loc(#loc162) + %new_mean = arith.divf %delta, %new_weight : tensor<1x2048xf32> loc(#loc140) + %new_mean_21 = arith.addf %tmp3_mean, %new_mean : tensor<1x2048xf32> loc(#loc163) + %new_m2 = arith.subf %tmp0_17, %new_mean_21 : tensor<1x2048xf32> loc(#loc142) + %new_m2_22 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32> loc(#loc143) + %new_m2_23 = arith.addf %tmp3_m2, %new_m2_22 : tensor<1x2048xf32> loc(#loc164) + scf.yield %new_m2_23, %new_mean_21, %new_weight : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc145) + } loc(#loc92) + %tmp3_mean_18 = arith.select %tmp0_15, %2#1, %tmp3_mean : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc101) + %tmp3_m2_19 = arith.select %tmp0_15, %2#0, %tmp3_m2 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc102) + %tmp3_weight_20 = arith.select %tmp0_15, %2#2, %tmp3_weight_8 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc103) + scf.yield %tmp3_mean_18, %tmp3_m2_19, %tmp3_weight_20 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc30) + } loc(#loc160) + %0:3 = "tt.reduce"(%tmp3_weight#0, %tmp3_weight#1, %tmp3_weight#2) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc2 at #loc3)), %arg7: f32 loc(callsite(#loc2 at #loc3)), %arg8: f32 loc(callsite(#loc2 at #loc3)), %arg9: f32 loc(callsite(#loc2 at #loc3)), %arg10: f32 loc(callsite(#loc2 at #loc3)), %arg11: f32 loc(callsite(#loc2 at #loc3))): + %delta = arith.subf %arg9, %arg6 : f32 loc(#loc146) + %new_weight = arith.addf %arg8, %arg11 : f32 loc(#loc147) + %w2_over_w = arith.cmpf oeq, %new_weight, %cst : f32 loc(#loc148) + %w2_over_w_8 = arith.divf %arg11, %new_weight : f32 loc(#loc149) + %w2_over_w_9 = arith.select %w2_over_w, %cst, %w2_over_w_8 : f32 loc(#loc150) + %1 = arith.mulf %delta, %w2_over_w_9 : f32 loc(#loc151) + %2 = arith.addf %arg6, %1 : f32 loc(#loc152) + %3 = arith.addf %arg7, %arg10 : f32 loc(#loc153) + %4 = arith.mulf %delta, %delta : f32 loc(#loc154) + %5 = arith.mulf %4, %arg8 : f32 loc(#loc155) + %6 = arith.mulf %5, %w2_over_w_9 : f32 loc(#loc156) + %7 = arith.addf %3, %6 : f32 loc(#loc157) + tt.reduce.return %2, %7, %new_weight : f32, f32, f32 loc(#loc104) + }) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc104) + %tmp3 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc110) + %tmp7 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc111) + scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc112) + %r0_index_8 = arith.addi %r0_index, %r0_base_7 : tensor<1x2048xi32> loc(#loc112) + %r0_mask = arith.cmpi slt, %r0_index_8, %cst_5 : tensor<1x2048xi32> loc(#loc113) + %tmp9 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc114) + %tmp9_9 = tt.addptr %tmp9, %r0_index_8 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc114) + %tmp9_10 = tt.load %tmp9_9, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc115) + %tmp9_11 = arith.extf %tmp9_10 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc116) + %tmp12 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc117) + %tmp12_12 = tt.splat %tmp12 : i32 -> tensor<1x2048xi32> loc(#loc158) + %tmp12_13 = arith.addi %r0_index_8, %tmp12_12 : tensor<1x2048xi32> loc(#loc118) + %tmp12_14 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc119) + %tmp12_15 = tt.addptr %tmp12_14, %tmp12_13 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc119) + %tmp12_16 = tt.splat %xmask_6 : i1 -> tensor<1x2048xi1> loc(#loc159) + %tmp12_17 = arith.andi %r0_mask, %tmp12_16 : tensor<1x2048xi1> loc(#loc120) + %tmp12_18 = tt.load %tmp12_15, %tmp12_17, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc121) + %tmp12_19 = arith.extf %tmp12_18 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc122) + %tmp23 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc123) + %tmp23_20 = tt.addptr %tmp23, %r0_index_8 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc123) + %tmp23_21 = tt.load %tmp23_20, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc124) + %tmp23_22 = arith.extf %tmp23_21 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc125) + %tmp11 = arith.addf %tmp9_11, %cst_4 : tensor<1x2048xf32> loc(#loc126) + %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc127) + %tmp14_23 = arith.subf %tmp12_19, %tmp14 : tensor<1x2048xf32> loc(#loc127) + %tmp16 = arith.divf %tmp7, %cst_3 : tensor<1x1xf32> loc(#loc128) + %tmp18 = arith.addf %tmp16, %cst_2 : tensor<1x1xf32> loc(#loc129) + %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc130) + %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc131) + %tmp20_24 = arith.mulf %tmp14_23, %tmp20 : tensor<1x2048xf32> loc(#loc131) + %tmp22 = arith.mulf %tmp11, %tmp20_24 : tensor<1x2048xf32> loc(#loc132) + %tmp24 = arith.addf %tmp22, %tmp23_22 : tensor<1x2048xf32> loc(#loc133) + %1 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc69) + %2 = tt.addptr %1, %tmp12_13 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc69) + %3 = arith.truncf %tmp24 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc70) + tt.store %2, %3, %tmp12_17 : tensor<1x2048x!tt.ptr> loc(#loc70) + } loc(#loc46) + tt.return loc(#loc71) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":25:21) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":23:28) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":26:27) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":32:43) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":33:31) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":34:29) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:46) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:41) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:34) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:61) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:51) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:112) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":42:62) +#loc17 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":42:51) +#loc19 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24) +#loc21 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34) +#loc23 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31) +#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":44:62) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":45:58) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":46:66) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":46:8) +#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc35 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc37 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc38 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc39 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc40 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc41 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc42 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc43 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":48:16) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":49:16) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":51:43) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":52:31) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":53:29) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:34) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:41) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:94) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:47) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:42) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:35) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:62) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:52) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:114) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:35) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:42) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:95) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":61:23) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":63:24) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":65:24) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":67:24) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":68:32) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":69:24) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":71:24) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":72:24) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:29) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:53) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":51:4) +#loc78 = loc("xmask"(#loc1)) +#loc80 = loc("xoffset"(#loc4)) +#loc81 = loc("r0_base"(#loc5)) +#loc82 = loc("r0_base"(#loc6)) +#loc83 = loc("tmp3_mean"(#loc7)) +#loc84 = loc("r0_index"(#loc8)) +#loc85 = loc("r0_mask"(#loc9)) +#loc86 = loc("tmp0"(#loc10)) +#loc87 = loc("tmp0"(#loc11)) +#loc88 = loc("tmp0"(#loc12)) +#loc89 = loc("tmp0"(#loc13)) +#loc90 = loc("tmp0"(#loc14)) +#loc91 = loc("tmp0"(#loc15)) +#loc92 = loc(callsite(#loc17 at #loc18)) +#loc93 = loc("new_m2"(#loc19)) +#loc94 = loc("delta"(#loc20)) +#loc95 = loc("new_weight"(#loc21)) +#loc96 = loc("new_mean"(#loc22)) +#loc97 = loc("new_mean"(#loc23)) +#loc98 = loc("new_m2"(#loc24)) +#loc99 = loc("new_m2"(#loc25)) +#loc100 = loc("new_m2"(#loc26)) +#loc101 = loc("tmp3_mean"(#loc27)) +#loc102 = loc("tmp3_m2"(#loc28)) +#loc103 = loc("tmp3_weight"(#loc29)) +#loc104 = loc(callsite(#loc31 at #loc3)) +#loc105 = loc("delta"(#loc32)) +#loc106 = loc("new_weight"(#loc33)) +#loc107 = loc("w2_over_w"(#loc34)) +#loc108 = loc("w2_over_w"(#loc35)) +#loc109 = loc("w2_over_w"(#loc36)) +#loc110 = loc("tmp3"(#loc44)) +#loc111 = loc("tmp7"(#loc45)) +#loc112 = loc("r0_index"(#loc47)) +#loc113 = loc("r0_mask"(#loc48)) +#loc114 = loc("tmp9"(#loc49)) +#loc115 = loc("tmp9"(#loc50)) +#loc116 = loc("tmp9"(#loc51)) +#loc117 = loc("tmp12"(#loc52)) +#loc118 = loc("tmp12"(#loc53)) +#loc119 = loc("tmp12"(#loc54)) +#loc120 = loc("tmp12"(#loc55)) +#loc121 = loc("tmp12"(#loc56)) +#loc122 = loc("tmp12"(#loc57)) +#loc123 = loc("tmp23"(#loc58)) +#loc124 = loc("tmp23"(#loc59)) +#loc125 = loc("tmp23"(#loc60)) +#loc126 = loc("tmp11"(#loc61)) +#loc127 = loc("tmp14"(#loc62)) +#loc128 = loc("tmp16"(#loc63)) +#loc129 = loc("tmp18"(#loc64)) +#loc130 = loc("tmp19"(#loc65)) +#loc131 = loc("tmp20"(#loc66)) +#loc132 = loc("tmp22"(#loc67)) +#loc133 = loc("tmp24"(#loc68)) +#loc134 = loc("tmp3_m2"(#loc83)) +#loc135 = loc(fused[#loc87, #loc86]) +#loc136 = loc(fused[#loc89, #loc78]) +#loc137 = loc("new_m2"(#loc93)) +#loc138 = loc(callsite(#loc94 at #loc18)) +#loc139 = loc("new_weight"(#loc95)) +#loc140 = loc(callsite(#loc96 at #loc18)) +#loc141 = loc("new_mean"(#loc97)) +#loc142 = loc(callsite(#loc98 at #loc18)) +#loc143 = loc(callsite(#loc99 at #loc18)) +#loc144 = loc("new_m2"(#loc100)) +#loc145 = loc(callsite(#loc100 at #loc18)) +#loc146 = loc(callsite(#loc105 at #loc104)) +#loc147 = loc(callsite(#loc106 at #loc104)) +#loc148 = loc(callsite(#loc107 at #loc104)) +#loc149 = loc(callsite(#loc108 at #loc104)) +#loc150 = loc(callsite(#loc109 at #loc104)) +#loc151 = loc(callsite(#loc37 at #loc104)) +#loc152 = loc(callsite(#loc38 at #loc104)) +#loc153 = loc(callsite(#loc39 at #loc104)) +#loc154 = loc(callsite(#loc40 at #loc104)) +#loc155 = loc(callsite(#loc41 at #loc104)) +#loc156 = loc(callsite(#loc42 at #loc104)) +#loc157 = loc(callsite(#loc43 at #loc104)) +#loc158 = loc(fused[#loc118, #loc117]) +#loc159 = loc(fused[#loc120, #loc78]) +#loc160 = loc("tmp3_weight"(#loc134)) +#loc161 = loc(callsite(#loc137 at #loc18)) +#loc162 = loc(callsite(#loc139 at #loc18)) +#loc163 = loc(callsite(#loc141 at #loc18)) +#loc164 = loc(callsite(#loc144 at #loc18)) diff --git a/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ae5254f06ac9f186cb062a38646ab9e0bd960c61 --- /dev/null +++ b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused__fused_rms_norm_cat_view_2.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.source", "triton_poi_fused__fused_rms_norm_cat_view_2.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.ttir", "triton_poi_fused__fused_rms_norm_cat_view_2.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir", "triton_poi_fused__fused_rms_norm_cat_view_2.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.llir", "triton_poi_fused__fused_rms_norm_cat_view_2.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.ptx", "triton_poi_fused__fused_rms_norm_cat_view_2.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.cubin", "triton_poi_fused__fused_rms_norm_cat_view_2.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.json"}} \ No newline at end of file diff --git a/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.cubin b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..a4544a8fecd0787b5b68e585fe88b0d99c133d5b Binary files /dev/null and b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.cubin differ diff --git a/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.json b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a3bd8cf92daf3a29a1ddc54bfb67c7d6c8e4c0af --- /dev/null +++ b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.json @@ -0,0 +1 @@ +{"hash": "0b08e56f903fce8944ebc8ac39910229dacc369441ab2a54d627e30cd636ae5e", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 4096, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__fused_rms_norm_cat_view_2"} \ No newline at end of file diff --git a/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.llir b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..6b266600e9d26ce4bd1e6bc1befefc354adba5ac --- /dev/null +++ b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.llir @@ -0,0 +1,789 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused__fused_rms_norm_cat_view_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !dbg !8 + %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z(), !dbg !9 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y(), !dbg !10 + %15 = mul nuw i32 %13, %14, !dbg !11 + %16 = add nuw i32 %15, %12, !dbg !12 + %17 = shl i32 %16, 3, !dbg !13 + %18 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !14 + %19 = and i32 %18, 112, !dbg !14 + %20 = lshr exact i32 %19, 4, !dbg !14 + %21 = and i32 %18, 1, !dbg !14 + %22 = shl nuw nsw i32 %21, 2, !dbg !14 + %23 = or disjoint i32 %17, %20, !dbg !15 + %24 = or disjoint i32 %17, %22, !dbg !15 + %25 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !16 + %26 = shl i32 %25, 7, !dbg !17 + %27 = shl nuw nsw i32 %18, 3, !dbg !18 + %28 = and i32 %27, 120, !dbg !18 + %29 = lshr i32 %18, 1, !dbg !18 + %30 = and i32 %29, 63, !dbg !18 + %31 = or disjoint i32 %28, %26, !dbg !19 + %32 = or disjoint i32 %30, %26, !dbg !19 + %33 = icmp slt i32 %31, 128, !dbg !20 + %34 = icmp slt i32 %32, 128, !dbg !20 + %35 = sdiv i32 %23, 32, !dbg !21 + %36 = sdiv i32 %24, 32, !dbg !21 + %37 = mul i32 %35, 32, !dbg !22 + %.decomposed = sub i32 %23, %37, !dbg !22 + %38 = mul i32 %36, 32, !dbg !22 + %.decomposed72 = sub i32 %24, %38, !dbg !22 + %39 = icmp slt i32 %23, 8192, !dbg !23 + %40 = icmp slt i32 %24, 8192, !dbg !23 + %41 = shl nsw i32 %.decomposed, 7, !dbg !24 + %42 = add i32 %41, %31, !dbg !25 + %43 = mul i32 %35, 12288, !dbg !26 + %44 = add i32 %42, %43, !dbg !27 + %45 = sext i32 %44 to i64, !dbg !28 + %46 = getelementptr bfloat, ptr addrspace(1) %0, i64 %45, !dbg !28 + %47 = and i1 %33, %39, !dbg !29 + %48 = and i1 %34, %40, !dbg !29 + %49 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !30 + %50 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %46, i64 %49, i1 %47) #5, !dbg !30 + %51 = extractvalue { i32, i32, i32, i32 } %50, 0, !dbg !30 + %52 = extractvalue { i32, i32, i32, i32 } %50, 1, !dbg !30 + %53 = extractvalue { i32, i32, i32, i32 } %50, 2, !dbg !30 + %54 = extractvalue { i32, i32, i32, i32 } %50, 3, !dbg !30 + %extelt.offset = lshr i32 %51, 16, !dbg !30 + %55 = trunc nuw i32 %extelt.offset to i16, !dbg !30 + %extelt.offset1 = lshr i32 %52, 16, !dbg !30 + %56 = trunc nuw i32 %extelt.offset1 to i16, !dbg !30 + %extelt.offset2 = lshr i32 %53, 16, !dbg !30 + %57 = trunc nuw i32 %extelt.offset2 to i16, !dbg !30 + %extelt.offset3 = lshr i32 %54, 16, !dbg !30 + %58 = trunc nuw i32 %extelt.offset3 to i16, !dbg !30 + %59 = shl nuw nsw i32 %18, 4, !dbg !31 + %60 = and i32 %59, 112, !dbg !31 + %61 = and i32 %18, 8, !dbg !31 + %62 = icmp eq i32 %61, 0, !dbg !31 + %63 = lshr exact i32 %61, 1, !dbg !31 + %64 = and i32 %18, 16, !dbg !31 + %65 = icmp eq i32 %64, 0, !dbg !31 + %66 = select i1 %65, i32 0, i32 136, !dbg !31 + %67 = and i32 %18, 32, !dbg !31 + %68 = lshr exact i32 %67, 4, !dbg !31 + %69 = lshr i32 %18, 3, !dbg !31 + %70 = and i32 %69, 8, !dbg !31 + %71 = or disjoint i32 %63, %68, !dbg !31 + %72 = or disjoint i32 %66, %60, !dbg !31 + %73 = xor i32 %72, %70, !dbg !31 + %74 = or disjoint i32 %71, %73, !dbg !31 + %75 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %74, !dbg !31 + %76 = trunc i32 %51 to i16, !dbg !31 + %77 = insertelement <1 x i16> poison, i16 %76, i64 0, !dbg !31 + store <1 x i16> %77, ptr addrspace(3) %75, align 2, !dbg !31 + %78 = xor i32 %74, 288, !dbg !31 + %79 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %78, !dbg !31 + %80 = insertelement <1 x i16> poison, i16 %55, i64 0, !dbg !31 + store <1 x i16> %80, ptr addrspace(3) %79, align 2, !dbg !31 + %81 = xor i32 %74, 576, !dbg !31 + %82 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %81, !dbg !31 + %83 = trunc i32 %52 to i16, !dbg !31 + %84 = insertelement <1 x i16> poison, i16 %83, i64 0, !dbg !31 + store <1 x i16> %84, ptr addrspace(3) %82, align 2, !dbg !31 + %85 = xor i32 %74, 864, !dbg !31 + %86 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %85, !dbg !31 + %87 = insertelement <1 x i16> poison, i16 %56, i64 0, !dbg !31 + store <1 x i16> %87, ptr addrspace(3) %86, align 2, !dbg !31 + %88 = xor i32 %74, 1028, !dbg !31 + %89 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %88, !dbg !31 + %90 = trunc i32 %53 to i16, !dbg !31 + %91 = insertelement <1 x i16> poison, i16 %90, i64 0, !dbg !31 + store <1 x i16> %91, ptr addrspace(3) %89, align 2, !dbg !31 + %92 = xor i32 %74, 1316, !dbg !31 + %93 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %92, !dbg !31 + %94 = insertelement <1 x i16> poison, i16 %57, i64 0, !dbg !31 + store <1 x i16> %94, ptr addrspace(3) %93, align 2, !dbg !31 + %95 = xor i32 %74, 1604, !dbg !31 + %96 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %95, !dbg !31 + %97 = trunc i32 %54 to i16, !dbg !31 + %98 = insertelement <1 x i16> poison, i16 %97, i64 0, !dbg !31 + store <1 x i16> %98, ptr addrspace(3) %96, align 2, !dbg !31 + %99 = xor i32 %74, 1892, !dbg !31 + %100 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %99, !dbg !31 + %101 = insertelement <1 x i16> poison, i16 %58, i64 0, !dbg !31 + store <1 x i16> %101, ptr addrspace(3) %100, align 2, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !31 + %102 = and i32 %18, 6, !dbg !31 + %103 = shl nuw nsw i32 %21, 3, !dbg !31 + %104 = select i1 %62, i32 0, i32 1028, !dbg !31 + %105 = mul nuw nsw i32 %102, 144, !dbg !31 + %106 = xor i32 %105, %19, !dbg !31 + %107 = or disjoint i32 %104, %106, !dbg !31 + %108 = or disjoint i32 %107, %103, !dbg !31 + %109 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %108, !dbg !31 + %110 = load bfloat, ptr addrspace(3) %109, align 4, !dbg !31 + %111 = getelementptr inbounds nuw i8, ptr addrspace(3) %109, i32 2, !dbg !31 + %112 = load bfloat, ptr addrspace(3) %111, align 2, !dbg !31 + %113 = xor i32 %108, 136, !dbg !31 + %114 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %113, !dbg !31 + %115 = load bfloat, ptr addrspace(3) %114, align 4, !dbg !31 + %116 = getelementptr inbounds nuw i8, ptr addrspace(3) %114, i32 2, !dbg !31 + %117 = load bfloat, ptr addrspace(3) %116, align 2, !dbg !31 + %118 = xor i32 %108, 4, !dbg !31 + %119 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %118, !dbg !31 + %120 = load bfloat, ptr addrspace(3) %119, align 4, !dbg !31 + %121 = getelementptr inbounds nuw i8, ptr addrspace(3) %119, i32 2, !dbg !31 + %122 = load bfloat, ptr addrspace(3) %121, align 2, !dbg !31 + %123 = xor i32 %108, 140, !dbg !31 + %124 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %123, !dbg !31 + %125 = load bfloat, ptr addrspace(3) %124, align 4, !dbg !31 + %126 = getelementptr inbounds nuw i8, ptr addrspace(3) %124, i32 2, !dbg !31 + %127 = load bfloat, ptr addrspace(3) %126, align 2, !dbg !31 + %128 = fpext bfloat %110 to float, !dbg !31 + %129 = fpext bfloat %115 to float, !dbg !31 + %130 = fpext bfloat %112 to float, !dbg !31 + %131 = fpext bfloat %117 to float, !dbg !31 + %132 = fpext bfloat %120 to float, !dbg !31 + %133 = fpext bfloat %125 to float, !dbg !31 + %134 = fpext bfloat %122 to float, !dbg !31 + %135 = fpext bfloat %127 to float, !dbg !31 + %136 = sext i32 %24 to i64, !dbg !32 + %137 = getelementptr float, ptr addrspace(1) %1, i64 %136, !dbg !32 + %138 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !33 + %139 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %137, i64 %138, i1 %48) #5, !dbg !33 + %140 = extractvalue { i32, i32, i32, i32 } %139, 0, !dbg !33 + %141 = extractvalue { i32, i32, i32, i32 } %139, 1, !dbg !33 + %142 = extractvalue { i32, i32, i32, i32 } %139, 2, !dbg !33 + %143 = extractvalue { i32, i32, i32, i32 } %139, 3, !dbg !33 + %144 = bitcast i32 %140 to float, !dbg !33 + %145 = bitcast i32 %141 to float, !dbg !33 + %146 = bitcast i32 %142 to float, !dbg !33 + %147 = bitcast i32 %143 to float, !dbg !33 + %148 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !33 + %149 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %137, i64 %148, i1 %48) #5, !dbg !33 + %150 = extractvalue { i32, i32, i32, i32 } %149, 0, !dbg !33 + %151 = extractvalue { i32, i32, i32, i32 } %149, 1, !dbg !33 + %152 = extractvalue { i32, i32, i32, i32 } %149, 2, !dbg !33 + %153 = extractvalue { i32, i32, i32, i32 } %149, 3, !dbg !33 + %154 = bitcast i32 %150 to float, !dbg !33 + %155 = bitcast i32 %151 to float, !dbg !33 + %156 = bitcast i32 %152 to float, !dbg !33 + %157 = bitcast i32 %153 to float, !dbg !33 + %158 = tail call float @llvm.nvvm.div.full(float %144, float 1.280000e+02), !dbg !34 + %159 = tail call float @llvm.nvvm.div.full(float %145, float 1.280000e+02), !dbg !34 + %160 = tail call float @llvm.nvvm.div.full(float %146, float 1.280000e+02), !dbg !34 + %161 = tail call float @llvm.nvvm.div.full(float %147, float 1.280000e+02), !dbg !34 + %162 = tail call float @llvm.nvvm.div.full(float %154, float 1.280000e+02), !dbg !34 + %163 = tail call float @llvm.nvvm.div.full(float %155, float 1.280000e+02), !dbg !34 + %164 = tail call float @llvm.nvvm.div.full(float %156, float 1.280000e+02), !dbg !34 + %165 = tail call float @llvm.nvvm.div.full(float %157, float 1.280000e+02), !dbg !34 + %166 = fadd float %158, 0x3EB0C6F7A0000000, !dbg !35 + %167 = fadd float %159, 0x3EB0C6F7A0000000, !dbg !35 + %168 = fadd float %160, 0x3EB0C6F7A0000000, !dbg !35 + %169 = fadd float %161, 0x3EB0C6F7A0000000, !dbg !35 + %170 = fadd float %162, 0x3EB0C6F7A0000000, !dbg !35 + %171 = fadd float %163, 0x3EB0C6F7A0000000, !dbg !35 + %172 = fadd float %164, 0x3EB0C6F7A0000000, !dbg !35 + %173 = fadd float %165, 0x3EB0C6F7A0000000, !dbg !35 + %174 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36 + %.not.i = icmp eq i32 %174, 0, !dbg !36 + br i1 %.not.i, label %177, label %175, !dbg !36 + +175: ; preds = %11 + %176 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %166), !dbg !36 + br label %__nv_rsqrtf.exit, !dbg !36 + +177: ; preds = %11 + %178 = tail call float @llvm.nvvm.rsqrt.approx.f(float %166), !dbg !36 + br label %__nv_rsqrtf.exit, !dbg !36 + +__nv_rsqrtf.exit: ; preds = %175, %177 + %.0.i = phi float [ %176, %175 ], [ %178, %177 ], !dbg !36 + %179 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36 + %.not.i27 = icmp eq i32 %179, 0, !dbg !36 + br i1 %.not.i27, label %182, label %180, !dbg !36 + +180: ; preds = %__nv_rsqrtf.exit + %181 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %167), !dbg !36 + br label %__nv_rsqrtf.exit29, !dbg !36 + +182: ; preds = %__nv_rsqrtf.exit + %183 = tail call float @llvm.nvvm.rsqrt.approx.f(float %167), !dbg !36 + br label %__nv_rsqrtf.exit29, !dbg !36 + +__nv_rsqrtf.exit29: ; preds = %180, %182 + %.0.i28 = phi float [ %181, %180 ], [ %183, %182 ], !dbg !36 + %184 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36 + %.not.i30 = icmp eq i32 %184, 0, !dbg !36 + br i1 %.not.i30, label %187, label %185, !dbg !36 + +185: ; preds = %__nv_rsqrtf.exit29 + %186 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %168), !dbg !36 + br label %__nv_rsqrtf.exit32, !dbg !36 + +187: ; preds = %__nv_rsqrtf.exit29 + %188 = tail call float @llvm.nvvm.rsqrt.approx.f(float %168), !dbg !36 + br label %__nv_rsqrtf.exit32, !dbg !36 + +__nv_rsqrtf.exit32: ; preds = %185, %187 + %.0.i31 = phi float [ %186, %185 ], [ %188, %187 ], !dbg !36 + %189 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36 + %.not.i33 = icmp eq i32 %189, 0, !dbg !36 + br i1 %.not.i33, label %192, label %190, !dbg !36 + +190: ; preds = %__nv_rsqrtf.exit32 + %191 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %169), !dbg !36 + br label %__nv_rsqrtf.exit35, !dbg !36 + +192: ; preds = %__nv_rsqrtf.exit32 + %193 = tail call float @llvm.nvvm.rsqrt.approx.f(float %169), !dbg !36 + br label %__nv_rsqrtf.exit35, !dbg !36 + +__nv_rsqrtf.exit35: ; preds = %190, %192 + %.0.i34 = phi float [ %191, %190 ], [ %193, %192 ], !dbg !36 + %194 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36 + %.not.i36 = icmp eq i32 %194, 0, !dbg !36 + br i1 %.not.i36, label %197, label %195, !dbg !36 + +195: ; preds = %__nv_rsqrtf.exit35 + %196 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %170), !dbg !36 + br label %__nv_rsqrtf.exit38, !dbg !36 + +197: ; preds = %__nv_rsqrtf.exit35 + %198 = tail call float @llvm.nvvm.rsqrt.approx.f(float %170), !dbg !36 + br label %__nv_rsqrtf.exit38, !dbg !36 + +__nv_rsqrtf.exit38: ; preds = %195, %197 + %.0.i37 = phi float [ %196, %195 ], [ %198, %197 ], !dbg !36 + %199 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36 + %.not.i39 = icmp eq i32 %199, 0, !dbg !36 + br i1 %.not.i39, label %202, label %200, !dbg !36 + +200: ; preds = %__nv_rsqrtf.exit38 + %201 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %171), !dbg !36 + br label %__nv_rsqrtf.exit41, !dbg !36 + +202: ; preds = %__nv_rsqrtf.exit38 + %203 = tail call float @llvm.nvvm.rsqrt.approx.f(float %171), !dbg !36 + br label %__nv_rsqrtf.exit41, !dbg !36 + +__nv_rsqrtf.exit41: ; preds = %200, %202 + %.0.i40 = phi float [ %201, %200 ], [ %203, %202 ], !dbg !36 + %204 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36 + %.not.i42 = icmp eq i32 %204, 0, !dbg !36 + br i1 %.not.i42, label %207, label %205, !dbg !36 + +205: ; preds = %__nv_rsqrtf.exit41 + %206 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %172), !dbg !36 + br label %__nv_rsqrtf.exit44, !dbg !36 + +207: ; preds = %__nv_rsqrtf.exit41 + %208 = tail call float @llvm.nvvm.rsqrt.approx.f(float %172), !dbg !36 + br label %__nv_rsqrtf.exit44, !dbg !36 + +__nv_rsqrtf.exit44: ; preds = %205, %207 + %.0.i43 = phi float [ %206, %205 ], [ %208, %207 ], !dbg !36 + %209 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36 + %.not.i45 = icmp eq i32 %209, 0, !dbg !36 + br i1 %.not.i45, label %212, label %210, !dbg !36 + +210: ; preds = %__nv_rsqrtf.exit44 + %211 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %173), !dbg !36 + br label %__nv_rsqrtf.exit47, !dbg !36 + +212: ; preds = %__nv_rsqrtf.exit44 + %213 = tail call float @llvm.nvvm.rsqrt.approx.f(float %173), !dbg !36 + br label %__nv_rsqrtf.exit47, !dbg !36 + +__nv_rsqrtf.exit47: ; preds = %210, %212 + %.0.i46 = phi float [ %211, %210 ], [ %213, %212 ], !dbg !36 + %214 = fmul float %.0.i, %128, !dbg !37 + %215 = fmul float %.0.i28, %129, !dbg !37 + %216 = fmul float %.0.i31, %130, !dbg !37 + %217 = fmul float %.0.i34, %131, !dbg !37 + %218 = fmul float %.0.i37, %132, !dbg !37 + %219 = fmul float %.0.i40, %133, !dbg !37 + %220 = fmul float %.0.i43, %134, !dbg !37 + %221 = fmul float %.0.i46, %135, !dbg !37 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37 + %222 = select i1 %62, i32 0, i32 2052, !dbg !37 + %223 = mul nuw nsw i32 %102, 272, !dbg !37 + %224 = xor i32 %223, %19, !dbg !37 + %225 = or disjoint i32 %222, %224, !dbg !37 + %226 = or disjoint i32 %225, %103, !dbg !37 + %227 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %226, !dbg !37 + store float %214, ptr addrspace(3) %227, align 4, !dbg !37 + %228 = getelementptr inbounds nuw i8, ptr addrspace(3) %227, i32 128, !dbg !37 + store float %216, ptr addrspace(3) %228, align 4, !dbg !37 + %229 = xor i32 %226, 264, !dbg !37 + %230 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %229, !dbg !37 + store float %215, ptr addrspace(3) %230, align 4, !dbg !37 + %231 = getelementptr inbounds nuw i8, ptr addrspace(3) %230, i32 128, !dbg !37 + store float %217, ptr addrspace(3) %231, align 4, !dbg !37 + %232 = xor i32 %226, 4, !dbg !37 + %233 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %232, !dbg !37 + store float %218, ptr addrspace(3) %233, align 4, !dbg !37 + %234 = getelementptr inbounds nuw i8, ptr addrspace(3) %233, i32 128, !dbg !37 + store float %220, ptr addrspace(3) %234, align 4, !dbg !37 + %235 = xor i32 %226, 268, !dbg !37 + %236 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %235, !dbg !37 + store float %219, ptr addrspace(3) %236, align 4, !dbg !37 + %237 = getelementptr inbounds nuw i8, ptr addrspace(3) %236, i32 128, !dbg !37 + store float %221, ptr addrspace(3) %237, align 4, !dbg !37 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37 + %238 = and i32 %59, 368, !dbg !37 + %239 = and i32 %29, 12, !dbg !37 + %240 = shl nuw nsw i32 %67, 2, !dbg !37 + %241 = or disjoint i32 %238, %239, !dbg !37 + %242 = xor i32 %241, %70, !dbg !37 + %243 = or disjoint i32 %242, %240, !dbg !37 + %244 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %243, !dbg !37 + %245 = load float, ptr addrspace(3) %244, align 4, !dbg !37 + %246 = xor i32 %243, 544, !dbg !37 + %247 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %246, !dbg !37 + %248 = load float, ptr addrspace(3) %247, align 4, !dbg !37 + %249 = xor i32 %243, 1088, !dbg !37 + %250 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %249, !dbg !37 + %251 = load float, ptr addrspace(3) %250, align 4, !dbg !37 + %252 = xor i32 %243, 1632, !dbg !37 + %253 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %252, !dbg !37 + %254 = load float, ptr addrspace(3) %253, align 4, !dbg !37 + %255 = xor i32 %243, 2052, !dbg !37 + %256 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %255, !dbg !37 + %257 = load float, ptr addrspace(3) %256, align 4, !dbg !37 + %258 = xor i32 %243, 2596, !dbg !37 + %259 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %258, !dbg !37 + %260 = load float, ptr addrspace(3) %259, align 4, !dbg !37 + %261 = xor i32 %243, 3140, !dbg !37 + %262 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %261, !dbg !37 + %263 = load float, ptr addrspace(3) %262, align 4, !dbg !37 + %264 = xor i32 %243, 3684, !dbg !37 + %265 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %264, !dbg !37 + %266 = load float, ptr addrspace(3) %265, align 4, !dbg !37 + %267 = sext i32 %31 to i64, !dbg !38 + %268 = getelementptr bfloat, ptr addrspace(1) %2, i64 %267, !dbg !38 + %269 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !39 + %270 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %268, i64 %269, i1 %47) #5, !dbg !39 + %271 = add i32 %44, -3145728, !dbg !40 + %272 = sext i32 %271 to i64, !dbg !41 + %273 = getelementptr bfloat, ptr addrspace(1) %3, i64 %272, !dbg !41 + %274 = add i32 %17, -8192, !dbg !42 + %275 = icmp ult i32 %274, 65536, !dbg !42 + %276 = and i1 %33, %275, !dbg !42 + %277 = and i1 %34, %275, !dbg !42 + %278 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !43 + %279 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %273, i64 %278, i1 %276) #5, !dbg !43 + %280 = extractvalue { i32, i32, i32, i32 } %279, 0, !dbg !43 + %281 = extractvalue { i32, i32, i32, i32 } %279, 1, !dbg !43 + %282 = extractvalue { i32, i32, i32, i32 } %279, 2, !dbg !43 + %283 = extractvalue { i32, i32, i32, i32 } %279, 3, !dbg !43 + %extelt.offset12 = lshr i32 %280, 16, !dbg !43 + %284 = trunc nuw i32 %extelt.offset12 to i16, !dbg !43 + %extelt.offset14 = lshr i32 %281, 16, !dbg !43 + %285 = trunc nuw i32 %extelt.offset14 to i16, !dbg !43 + %extelt.offset16 = lshr i32 %282, 16, !dbg !43 + %286 = trunc nuw i32 %extelt.offset16 to i16, !dbg !43 + %extelt.offset18 = lshr i32 %283, 16, !dbg !43 + %287 = trunc nuw i32 %extelt.offset18 to i16, !dbg !43 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !44 + %288 = trunc i32 %280 to i16, !dbg !44 + %289 = insertelement <1 x i16> poison, i16 %288, i64 0, !dbg !44 + store <1 x i16> %289, ptr addrspace(3) %75, align 2, !dbg !44 + %290 = insertelement <1 x i16> poison, i16 %284, i64 0, !dbg !44 + store <1 x i16> %290, ptr addrspace(3) %79, align 2, !dbg !44 + %291 = trunc i32 %281 to i16, !dbg !44 + %292 = insertelement <1 x i16> poison, i16 %291, i64 0, !dbg !44 + store <1 x i16> %292, ptr addrspace(3) %82, align 2, !dbg !44 + %293 = insertelement <1 x i16> poison, i16 %285, i64 0, !dbg !44 + store <1 x i16> %293, ptr addrspace(3) %86, align 2, !dbg !44 + %294 = trunc i32 %282 to i16, !dbg !44 + %295 = insertelement <1 x i16> poison, i16 %294, i64 0, !dbg !44 + store <1 x i16> %295, ptr addrspace(3) %89, align 2, !dbg !44 + %296 = insertelement <1 x i16> poison, i16 %286, i64 0, !dbg !44 + store <1 x i16> %296, ptr addrspace(3) %93, align 2, !dbg !44 + %297 = trunc i32 %283 to i16, !dbg !44 + %298 = insertelement <1 x i16> poison, i16 %297, i64 0, !dbg !44 + store <1 x i16> %298, ptr addrspace(3) %96, align 2, !dbg !44 + %299 = insertelement <1 x i16> poison, i16 %287, i64 0, !dbg !44 + store <1 x i16> %299, ptr addrspace(3) %100, align 2, !dbg !44 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !44 + %300 = load <2 x bfloat>, ptr addrspace(3) %109, align 4, !dbg !44 + %301 = load <2 x bfloat>, ptr addrspace(3) %114, align 4, !dbg !44 + %302 = load <2 x bfloat>, ptr addrspace(3) %119, align 4, !dbg !44 + %303 = load <2 x bfloat>, ptr addrspace(3) %124, align 4, !dbg !44 + %304 = shl nsw i32 %36, 5, !dbg !45 + %305 = add nsw i32 %.decomposed72, -8192, !dbg !45 + %306 = add i32 %305, %304, !dbg !46 + %307 = sext i32 %306 to i64, !dbg !47 + %308 = getelementptr float, ptr addrspace(1) %4, i64 %307, !dbg !47 + %309 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !48 + %310 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %308, i64 %309, i1 %277) #5, !dbg !48 + %311 = extractvalue { i32, i32, i32, i32 } %310, 0, !dbg !48 + %312 = extractvalue { i32, i32, i32, i32 } %310, 1, !dbg !48 + %313 = extractvalue { i32, i32, i32, i32 } %310, 2, !dbg !48 + %314 = extractvalue { i32, i32, i32, i32 } %310, 3, !dbg !48 + %315 = bitcast i32 %311 to float, !dbg !48 + %316 = bitcast i32 %312 to float, !dbg !48 + %317 = bitcast i32 %313 to float, !dbg !48 + %318 = bitcast i32 %314 to float, !dbg !48 + %319 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !48 + %320 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %308, i64 %319, i1 %277) #5, !dbg !48 + %321 = extractvalue { i32, i32, i32, i32 } %320, 0, !dbg !48 + %322 = extractvalue { i32, i32, i32, i32 } %320, 1, !dbg !48 + %323 = extractvalue { i32, i32, i32, i32 } %320, 2, !dbg !48 + %324 = extractvalue { i32, i32, i32, i32 } %320, 3, !dbg !48 + %325 = bitcast i32 %321 to float, !dbg !48 + %326 = bitcast i32 %322 to float, !dbg !48 + %327 = bitcast i32 %323 to float, !dbg !48 + %328 = bitcast i32 %324 to float, !dbg !48 + %329 = tail call float @llvm.nvvm.div.full(float %315, float 1.280000e+02), !dbg !49 + %330 = tail call float @llvm.nvvm.div.full(float %316, float 1.280000e+02), !dbg !49 + %331 = tail call float @llvm.nvvm.div.full(float %317, float 1.280000e+02), !dbg !49 + %332 = tail call float @llvm.nvvm.div.full(float %318, float 1.280000e+02), !dbg !49 + %333 = tail call float @llvm.nvvm.div.full(float %325, float 1.280000e+02), !dbg !49 + %334 = tail call float @llvm.nvvm.div.full(float %326, float 1.280000e+02), !dbg !49 + %335 = tail call float @llvm.nvvm.div.full(float %327, float 1.280000e+02), !dbg !49 + %336 = tail call float @llvm.nvvm.div.full(float %328, float 1.280000e+02), !dbg !49 + %337 = fadd float %329, 0x3EB0C6F7A0000000, !dbg !50 + %338 = fadd float %330, 0x3EB0C6F7A0000000, !dbg !50 + %339 = fadd float %331, 0x3EB0C6F7A0000000, !dbg !50 + %340 = fadd float %332, 0x3EB0C6F7A0000000, !dbg !50 + %341 = fadd float %333, 0x3EB0C6F7A0000000, !dbg !50 + %342 = fadd float %334, 0x3EB0C6F7A0000000, !dbg !50 + %343 = fadd float %335, 0x3EB0C6F7A0000000, !dbg !50 + %344 = fadd float %336, 0x3EB0C6F7A0000000, !dbg !50 + %345 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51 + %.not.i48 = icmp eq i32 %345, 0, !dbg !51 + br i1 %.not.i48, label %348, label %346, !dbg !51 + +346: ; preds = %__nv_rsqrtf.exit47 + %347 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %337), !dbg !51 + br label %__nv_rsqrtf.exit50, !dbg !51 + +348: ; preds = %__nv_rsqrtf.exit47 + %349 = tail call float @llvm.nvvm.rsqrt.approx.f(float %337), !dbg !51 + br label %__nv_rsqrtf.exit50, !dbg !51 + +__nv_rsqrtf.exit50: ; preds = %346, %348 + %.0.i49 = phi float [ %347, %346 ], [ %349, %348 ], !dbg !51 + %350 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51 + %.not.i51 = icmp eq i32 %350, 0, !dbg !51 + br i1 %.not.i51, label %353, label %351, !dbg !51 + +351: ; preds = %__nv_rsqrtf.exit50 + %352 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %338), !dbg !51 + br label %__nv_rsqrtf.exit53, !dbg !51 + +353: ; preds = %__nv_rsqrtf.exit50 + %354 = tail call float @llvm.nvvm.rsqrt.approx.f(float %338), !dbg !51 + br label %__nv_rsqrtf.exit53, !dbg !51 + +__nv_rsqrtf.exit53: ; preds = %351, %353 + %.0.i52 = phi float [ %352, %351 ], [ %354, %353 ], !dbg !51 + %355 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51 + %.not.i54 = icmp eq i32 %355, 0, !dbg !51 + br i1 %.not.i54, label %358, label %356, !dbg !51 + +356: ; preds = %__nv_rsqrtf.exit53 + %357 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %339), !dbg !51 + br label %__nv_rsqrtf.exit56, !dbg !51 + +358: ; preds = %__nv_rsqrtf.exit53 + %359 = tail call float @llvm.nvvm.rsqrt.approx.f(float %339), !dbg !51 + br label %__nv_rsqrtf.exit56, !dbg !51 + +__nv_rsqrtf.exit56: ; preds = %356, %358 + %.0.i55 = phi float [ %357, %356 ], [ %359, %358 ], !dbg !51 + %360 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51 + %.not.i57 = icmp eq i32 %360, 0, !dbg !51 + br i1 %.not.i57, label %363, label %361, !dbg !51 + +361: ; preds = %__nv_rsqrtf.exit56 + %362 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %340), !dbg !51 + br label %__nv_rsqrtf.exit59, !dbg !51 + +363: ; preds = %__nv_rsqrtf.exit56 + %364 = tail call float @llvm.nvvm.rsqrt.approx.f(float %340), !dbg !51 + br label %__nv_rsqrtf.exit59, !dbg !51 + +__nv_rsqrtf.exit59: ; preds = %361, %363 + %.0.i58 = phi float [ %362, %361 ], [ %364, %363 ], !dbg !51 + %365 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51 + %.not.i60 = icmp eq i32 %365, 0, !dbg !51 + br i1 %.not.i60, label %368, label %366, !dbg !51 + +366: ; preds = %__nv_rsqrtf.exit59 + %367 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %341), !dbg !51 + br label %__nv_rsqrtf.exit62, !dbg !51 + +368: ; preds = %__nv_rsqrtf.exit59 + %369 = tail call float @llvm.nvvm.rsqrt.approx.f(float %341), !dbg !51 + br label %__nv_rsqrtf.exit62, !dbg !51 + +__nv_rsqrtf.exit62: ; preds = %366, %368 + %.0.i61 = phi float [ %367, %366 ], [ %369, %368 ], !dbg !51 + %370 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51 + %.not.i63 = icmp eq i32 %370, 0, !dbg !51 + br i1 %.not.i63, label %373, label %371, !dbg !51 + +371: ; preds = %__nv_rsqrtf.exit62 + %372 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %342), !dbg !51 + br label %__nv_rsqrtf.exit65, !dbg !51 + +373: ; preds = %__nv_rsqrtf.exit62 + %374 = tail call float @llvm.nvvm.rsqrt.approx.f(float %342), !dbg !51 + br label %__nv_rsqrtf.exit65, !dbg !51 + +__nv_rsqrtf.exit65: ; preds = %371, %373 + %.0.i64 = phi float [ %372, %371 ], [ %374, %373 ], !dbg !51 + %375 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51 + %.not.i66 = icmp eq i32 %375, 0, !dbg !51 + br i1 %.not.i66, label %378, label %376, !dbg !51 + +376: ; preds = %__nv_rsqrtf.exit65 + %377 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %343), !dbg !51 + br label %__nv_rsqrtf.exit68, !dbg !51 + +378: ; preds = %__nv_rsqrtf.exit65 + %379 = tail call float @llvm.nvvm.rsqrt.approx.f(float %343), !dbg !51 + br label %__nv_rsqrtf.exit68, !dbg !51 + +__nv_rsqrtf.exit68: ; preds = %376, %378 + %.0.i67 = phi float [ %377, %376 ], [ %379, %378 ], !dbg !51 + %380 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51 + %.not.i69 = icmp eq i32 %380, 0, !dbg !51 + br i1 %.not.i69, label %383, label %381, !dbg !51 + +381: ; preds = %__nv_rsqrtf.exit68 + %382 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %344), !dbg !51 + br label %__nv_rsqrtf.exit71, !dbg !51 + +383: ; preds = %__nv_rsqrtf.exit68 + %384 = tail call float @llvm.nvvm.rsqrt.approx.f(float %344), !dbg !51 + br label %__nv_rsqrtf.exit71, !dbg !51 + +__nv_rsqrtf.exit71: ; preds = %381, %383 + %.0.i70 = phi float [ %382, %381 ], [ %384, %383 ], !dbg !51 + %385 = extractelement <2 x bfloat> %303, i64 1, !dbg !44 + %386 = fpext bfloat %385 to float, !dbg !44 + %387 = extractelement <2 x bfloat> %302, i64 1, !dbg !44 + %388 = fpext bfloat %387 to float, !dbg !44 + %389 = extractelement <2 x bfloat> %303, i64 0, !dbg !44 + %390 = fpext bfloat %389 to float, !dbg !44 + %391 = extractelement <2 x bfloat> %302, i64 0, !dbg !44 + %392 = fpext bfloat %391 to float, !dbg !44 + %393 = extractelement <2 x bfloat> %301, i64 1, !dbg !44 + %394 = fpext bfloat %393 to float, !dbg !44 + %395 = extractelement <2 x bfloat> %300, i64 1, !dbg !44 + %396 = fpext bfloat %395 to float, !dbg !44 + %397 = extractelement <2 x bfloat> %301, i64 0, !dbg !44 + %398 = fpext bfloat %397 to float, !dbg !44 + %399 = extractelement <2 x bfloat> %300, i64 0, !dbg !44 + %400 = fpext bfloat %399 to float, !dbg !44 + %401 = extractvalue { i32, i32, i32, i32 } %270, 3, !dbg !39 + %402 = bitcast i32 %401 to <2 x bfloat>, !dbg !39 + %403 = extractvalue { i32, i32, i32, i32 } %270, 2, !dbg !39 + %404 = bitcast i32 %403 to <2 x bfloat>, !dbg !39 + %405 = extractvalue { i32, i32, i32, i32 } %270, 1, !dbg !39 + %406 = bitcast i32 %405 to <2 x bfloat>, !dbg !39 + %407 = extractvalue { i32, i32, i32, i32 } %270, 0, !dbg !39 + %408 = bitcast i32 %407 to <2 x bfloat>, !dbg !39 + %409 = icmp slt i32 %23, 73728, !dbg !52 + %410 = fmul float %.0.i49, %400, !dbg !53 + %411 = fmul float %.0.i52, %398, !dbg !53 + %412 = fmul float %.0.i55, %396, !dbg !53 + %413 = fmul float %.0.i58, %394, !dbg !53 + %414 = fmul float %.0.i61, %392, !dbg !53 + %415 = fmul float %.0.i64, %390, !dbg !53 + %416 = fmul float %.0.i67, %388, !dbg !53 + %417 = fmul float %.0.i70, %386, !dbg !53 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53 + store float %410, ptr addrspace(3) %227, align 4, !dbg !53 + store float %412, ptr addrspace(3) %228, align 4, !dbg !53 + store float %411, ptr addrspace(3) %230, align 4, !dbg !53 + store float %413, ptr addrspace(3) %231, align 4, !dbg !53 + store float %414, ptr addrspace(3) %233, align 4, !dbg !53 + store float %416, ptr addrspace(3) %234, align 4, !dbg !53 + store float %415, ptr addrspace(3) %236, align 4, !dbg !53 + store float %417, ptr addrspace(3) %237, align 4, !dbg !53 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53 + %418 = load float, ptr addrspace(3) %244, align 4, !dbg !53 + %419 = load float, ptr addrspace(3) %247, align 4, !dbg !53 + %420 = load float, ptr addrspace(3) %250, align 4, !dbg !53 + %421 = load float, ptr addrspace(3) %253, align 4, !dbg !53 + %422 = load float, ptr addrspace(3) %256, align 4, !dbg !53 + %423 = load float, ptr addrspace(3) %259, align 4, !dbg !53 + %424 = load float, ptr addrspace(3) %262, align 4, !dbg !53 + %425 = load float, ptr addrspace(3) %265, align 4, !dbg !53 + %426 = getelementptr bfloat, ptr addrspace(1) %5, i64 %267, !dbg !54 + %427 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !55 + %428 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %426, i64 %427, i1 %276) #5, !dbg !55 + %429 = extractvalue { i32, i32, i32, i32 } %428, 0, !dbg !55 + %430 = bitcast i32 %429 to <2 x bfloat>, !dbg !55 + %431 = extractvalue { i32, i32, i32, i32 } %428, 1, !dbg !55 + %432 = bitcast i32 %431 to <2 x bfloat>, !dbg !55 + %433 = extractvalue { i32, i32, i32, i32 } %428, 2, !dbg !55 + %434 = bitcast i32 %433 to <2 x bfloat>, !dbg !55 + %435 = extractvalue { i32, i32, i32, i32 } %428, 3, !dbg !55 + %436 = bitcast i32 %435 to <2 x bfloat>, !dbg !55 + %437 = shl i32 %23, 7, !dbg !56 + %438 = add i32 %437, %31, !dbg !57 + %439 = sext i32 %438 to i64, !dbg !58 + %440 = getelementptr bfloat, ptr addrspace(1) %6, i64 %439, !dbg !58 + %441 = and i1 %33, %409, !dbg !59 + %442 = fpext <2 x bfloat> %408 to <2 x float>, !dbg !60 + %443 = insertelement <2 x float> poison, float %245, i64 0, !dbg !61 + %444 = insertelement <2 x float> %443, float %248, i64 1, !dbg !61 + %445 = fmul <2 x float> %444, %442, !dbg !61 + %446 = fpext <2 x bfloat> %430 to <2 x float>, !dbg !62 + %447 = insertelement <2 x float> poison, float %418, i64 0, !dbg !63 + %448 = insertelement <2 x float> %447, float %419, i64 1, !dbg !63 + %449 = fmul <2 x float> %448, %446, !dbg !63 + %450 = insertelement <2 x i1> poison, i1 %39, i64 0, !dbg !64 + %451 = shufflevector <2 x i1> %450, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !64 + %452 = select <2 x i1> %451, <2 x float> %445, <2 x float> %449, !dbg !64 + %453 = fptrunc <2 x float> %452 to <2 x bfloat>, !dbg !65 + %454 = fpext <2 x bfloat> %406 to <2 x float>, !dbg !60 + %455 = insertelement <2 x float> poison, float %251, i64 0, !dbg !61 + %456 = insertelement <2 x float> %455, float %254, i64 1, !dbg !61 + %457 = fmul <2 x float> %456, %454, !dbg !61 + %458 = fpext <2 x bfloat> %432 to <2 x float>, !dbg !62 + %459 = insertelement <2 x float> poison, float %420, i64 0, !dbg !63 + %460 = insertelement <2 x float> %459, float %421, i64 1, !dbg !63 + %461 = fmul <2 x float> %460, %458, !dbg !63 + %462 = select <2 x i1> %451, <2 x float> %457, <2 x float> %461, !dbg !64 + %463 = fptrunc <2 x float> %462 to <2 x bfloat>, !dbg !65 + %464 = fpext <2 x bfloat> %404 to <2 x float>, !dbg !60 + %465 = insertelement <2 x float> poison, float %257, i64 0, !dbg !61 + %466 = insertelement <2 x float> %465, float %260, i64 1, !dbg !61 + %467 = fmul <2 x float> %466, %464, !dbg !61 + %468 = fpext <2 x bfloat> %434 to <2 x float>, !dbg !62 + %469 = insertelement <2 x float> poison, float %422, i64 0, !dbg !63 + %470 = insertelement <2 x float> %469, float %423, i64 1, !dbg !63 + %471 = fmul <2 x float> %470, %468, !dbg !63 + %472 = select <2 x i1> %451, <2 x float> %467, <2 x float> %471, !dbg !64 + %473 = fptrunc <2 x float> %472 to <2 x bfloat>, !dbg !65 + %474 = fpext <2 x bfloat> %402 to <2 x float>, !dbg !60 + %475 = insertelement <2 x float> poison, float %263, i64 0, !dbg !61 + %476 = insertelement <2 x float> %475, float %266, i64 1, !dbg !61 + %477 = fmul <2 x float> %476, %474, !dbg !61 + %478 = fpext <2 x bfloat> %436 to <2 x float>, !dbg !62 + %479 = insertelement <2 x float> poison, float %424, i64 0, !dbg !63 + %480 = insertelement <2 x float> %479, float %425, i64 1, !dbg !63 + %481 = fmul <2 x float> %480, %478, !dbg !63 + %482 = select <2 x i1> %451, <2 x float> %477, <2 x float> %481, !dbg !64 + %483 = fptrunc <2 x float> %482 to <2 x bfloat>, !dbg !65 + %484 = bitcast <2 x bfloat> %453 to i32, !dbg !65 + %485 = bitcast <2 x bfloat> %463 to i32, !dbg !65 + %486 = bitcast <2 x bfloat> %473 to i32, !dbg !65 + %487 = bitcast <2 x bfloat> %483 to i32, !dbg !65 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %484, i32 %485, i32 %486, i32 %487, ptr addrspace(1) %440, i1 %441) #5, !dbg !65 + ret void, !dbg !66 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #3 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_poi_fused__fused_rms_norm_cat_view_2", linkageName: "triton_poi_fused__fused_rms_norm_cat_view_2", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 21, column: 29, scope: !5) +!9 = !DILocation(line: 21, column: 48, scope: !5) +!10 = !DILocation(line: 21, column: 69, scope: !5) +!11 = !DILocation(line: 21, column: 53, scope: !5) +!12 = !DILocation(line: 21, column: 34, scope: !5) +!13 = !DILocation(line: 21, column: 75, scope: !5) +!14 = !DILocation(line: 22, column: 44, scope: !5) +!15 = !DILocation(line: 22, column: 23, scope: !5) +!16 = !DILocation(line: 24, column: 28, scope: !5) +!17 = !DILocation(line: 24, column: 33, scope: !5) +!18 = !DILocation(line: 25, column: 44, scope: !5) +!19 = !DILocation(line: 25, column: 23, scope: !5) +!20 = !DILocation(line: 26, column: 21, scope: !5) +!21 = !DILocation(line: 27, column: 19, scope: !5) +!22 = !DILocation(line: 29, column: 19, scope: !5) +!23 = !DILocation(line: 35, column: 18, scope: !5) +!24 = !DILocation(line: 36, column: 39, scope: !5) +!25 = !DILocation(line: 36, column: 35, scope: !5) +!26 = !DILocation(line: 36, column: 51, scope: !5) +!27 = !DILocation(line: 36, column: 44, scope: !5) +!28 = !DILocation(line: 36, column: 30, scope: !5) +!29 = !DILocation(line: 36, column: 64, scope: !5) +!30 = !DILocation(line: 36, column: 57, scope: !5) +!31 = !DILocation(line: 36, column: 123, scope: !5) +!32 = !DILocation(line: 38, column: 30, scope: !5) +!33 = !DILocation(line: 38, column: 80, scope: !5) +!34 = !DILocation(line: 40, column: 19, scope: !5) +!35 = !DILocation(line: 42, column: 19, scope: !5) +!36 = !DILocation(line: 43, column: 28, scope: !5) +!37 = !DILocation(line: 44, column: 19, scope: !5) +!38 = !DILocation(line: 45, column: 31, scope: !5) +!39 = !DILocation(line: 45, column: 71, scope: !5) +!40 = !DILocation(line: 54, column: 45, scope: !5) +!41 = !DILocation(line: 54, column: 31, scope: !5) +!42 = !DILocation(line: 54, column: 83, scope: !5) +!43 = !DILocation(line: 54, column: 67, scope: !5) +!44 = !DILocation(line: 54, column: 134, scope: !5) +!45 = !DILocation(line: 56, column: 56, scope: !5) +!46 = !DILocation(line: 56, column: 52, scope: !5) +!47 = !DILocation(line: 56, column: 31, scope: !5) +!48 = !DILocation(line: 56, column: 90, scope: !5) +!49 = !DILocation(line: 58, column: 21, scope: !5) +!50 = !DILocation(line: 60, column: 20, scope: !5) +!51 = !DILocation(line: 61, column: 28, scope: !5) +!52 = !DILocation(line: 23, column: 21, scope: !5) +!53 = !DILocation(line: 62, column: 20, scope: !5) +!54 = !DILocation(line: 63, column: 31, scope: !5) +!55 = !DILocation(line: 63, column: 71, scope: !5) +!56 = !DILocation(line: 70, column: 34, scope: !5) +!57 = !DILocation(line: 70, column: 30, scope: !5) +!58 = !DILocation(line: 70, column: 25, scope: !5) +!59 = !DILocation(line: 70, column: 54, scope: !5) +!60 = !DILocation(line: 45, column: 137, scope: !5) +!61 = !DILocation(line: 47, column: 20, scope: !5) +!62 = !DILocation(line: 63, column: 138, scope: !5) +!63 = !DILocation(line: 65, column: 20, scope: !5) +!64 = !DILocation(line: 0, scope: !5) +!65 = !DILocation(line: 70, column: 46, scope: !5) +!66 = !DILocation(line: 70, column: 4, scope: !5) diff --git a/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.ptx b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..30e6e10414788c65e78dc51c11522a91f063f5a1 --- /dev/null +++ b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.ptx @@ -0,0 +1,794 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused__fused_rms_norm_cat_view_2 // -- Begin function triton_poi_fused__fused_rms_norm_cat_view_2 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_poi_fused__fused_rms_norm_cat_view_2 +.visible .entry triton_poi_fused__fused_rms_norm_cat_view_2( + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_4, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_5, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_6, + .param .u32 triton_poi_fused__fused_rms_norm_cat_view_2_param_7, + .param .u32 triton_poi_fused__fused_rms_norm_cat_view_2_param_8, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_9, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_10 +) +.reqntid 128 +{ + .reg .pred %p<12>; + .reg .b16 %rs<33>; + .reg .b32 %r<295>; + .reg .b64 %rd<24>; + .loc 1 18 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:18:0 + +// %bb.0: // %__nv_rsqrtf.exit + ld.param.b64 %rd16, [triton_poi_fused__fused_rms_norm_cat_view_2_param_0]; + ld.param.b64 %rd17, [triton_poi_fused__fused_rms_norm_cat_view_2_param_1]; +$L__tmp0: + .loc 1 21 29 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:29 + mov.u32 %r38, %ctaid.y; + ld.param.b64 %rd18, [triton_poi_fused__fused_rms_norm_cat_view_2_param_2]; + .loc 1 21 48 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:48 + mov.u32 %r39, %ctaid.z; + ld.param.b64 %rd19, [triton_poi_fused__fused_rms_norm_cat_view_2_param_3]; + .loc 1 21 69 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:69 + mov.u32 %r40, %nctaid.y; + ld.param.b64 %rd20, [triton_poi_fused__fused_rms_norm_cat_view_2_param_4]; + .loc 1 21 34 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:34 + mad.lo.s32 %r41, %r39, %r40, %r38; + ld.param.b64 %rd21, [triton_poi_fused__fused_rms_norm_cat_view_2_param_5]; + .loc 1 21 75 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:75 + shl.b32 %r42, %r41, 3; + ld.param.b64 %rd22, [triton_poi_fused__fused_rms_norm_cat_view_2_param_6]; + .loc 1 22 44 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:44 + mov.u32 %r43, %tid.x; + and.b32 %r44, %r43, 112; + bfe.u32 %r45, %r43, 4, 3; + and.b32 %r46, %r43, 1; + shl.b32 %r47, %r46, 2; + .loc 1 22 23 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:23 + or.b32 %r48, %r42, %r45; + or.b32 %r49, %r42, %r47; + .loc 1 24 28 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:24:28 + mov.u32 %r50, %ctaid.x; + .loc 1 24 33 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:24:33 + shl.b32 %r51, %r50, 7; + .loc 1 25 44 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:25:44 + shl.b32 %r52, %r43, 3; + and.b32 %r53, %r52, 120; + shr.u32 %r54, %r43, 1; + bfe.u32 %r55, %r43, 1, 6; + .loc 1 25 23 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:25:23 + or.b32 %r56, %r53, %r51; + or.b32 %r57, %r55, %r51; + .loc 1 26 21 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:26:21 + setp.lt.s32 %p6, %r56, 128; + setp.lt.s32 %p7, %r57, 128; + .loc 1 27 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:27:19 + bfe.s32 %r58, %r41, 28, 1; + shr.u32 %r59, %r58, 27; + add.s32 %r60, %r48, %r59; + shr.u32 %r61, %r60, 5; + .loc 1 29 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:29:19 + and.b32 %r62, %r60, 33554400; + sub.s32 %r63, %r48, %r62; + .loc 1 35 18 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:35:18 + setp.lt.s32 %p8, %r48, 8192; + setp.lt.s32 %p9, %r49, 8192; + .loc 1 36 39 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:39 + shl.b32 %r64, %r63, 7; + .loc 1 36 35 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:35 + add.s32 %r65, %r64, %r56; + .loc 1 36 44 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:44 + mad.lo.s32 %r66, %r61, 12288, %r65; + .loc 1 36 30 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:30 + mad.wide.s32 %rd1, %r66, 2, %rd16; + .loc 1 36 64 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:64 + and.pred %p1, %p6, %p8; + and.pred %p2, %p7, %p9; + .loc 1 36 57 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:57 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r5, 0; + // begin inline asm + mov.u32 %r1, %r5; + mov.u32 %r2, %r5; + mov.u32 %r3, %r5; + mov.u32 %r4, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2; + // end inline asm + shr.u32 %r67, %r1, 16; + shr.u32 %r68, %r2, 16; + shr.u32 %r69, %r3, 16; + shr.u32 %r70, %r4, 16; + .loc 1 36 123 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:123 + shl.b32 %r71, %r43, 4; + and.b32 %r72, %r71, 112; + bfe.s32 %r73, %r43, 3, 1; + and.b32 %r74, %r43, 8; + shr.u32 %r75, %r74, 1; + bfe.s32 %r76, %r43, 4, 1; + and.b32 %r77, %r76, 136; + and.b32 %r78, %r43, 32; + shr.u32 %r79, %r78, 4; + shr.u32 %r80, %r43, 3; + and.b32 %r81, %r80, 8; + or.b32 %r82, %r75, %r79; + or.b32 %r83, %r77, %r72; + xor.b32 %r84, %r83, %r81; + or.b32 %r85, %r82, %r84; + mov.b32 %r86, global_smem; + add.s32 %r87, %r86, %r85; + st.shared.b16 [%r87], %r1; + xor.b32 %r88, %r85, 32; + add.s32 %r89, %r86, %r88; + st.shared.b16 [%r89+256], %r67; + xor.b32 %r90, %r85, 64; + add.s32 %r91, %r86, %r90; + st.shared.b16 [%r91+512], %r2; + xor.b32 %r92, %r85, 96; + add.s32 %r93, %r86, %r92; + st.shared.b16 [%r93+768], %r68; + xor.b32 %r94, %r85, 4; + add.s32 %r95, %r86, %r94; + st.shared.b16 [%r95+1024], %r3; + xor.b32 %r96, %r85, 36; + add.s32 %r97, %r86, %r96; + st.shared.b16 [%r97+1280], %r69; + xor.b32 %r98, %r85, 68; + add.s32 %r99, %r86, %r98; + st.shared.b16 [%r99+1536], %r4; + xor.b32 %r100, %r85, 100; + add.s32 %r101, %r86, %r100; + st.shared.b16 [%r101+1792], %r70; + bar.sync 0; + and.b32 %r102, %r43, 6; + shl.b32 %r103, %r46, 3; + and.b32 %r104, %r73, 1028; + mul.lo.s32 %r105, %r102, 144; + xor.b32 %r106, %r105, %r44; + or.b32 %r107, %r104, %r106; + or.b32 %r108, %r107, %r103; + add.s32 %r109, %r86, %r108; + ld.shared.v2.b16 {%rs1, %rs2}, [%r109]; + xor.b32 %r110, %r108, 136; + add.s32 %r111, %r86, %r110; + ld.shared.v2.b16 {%rs3, %rs4}, [%r111]; + xor.b32 %r112, %r108, 4; + add.s32 %r113, %r86, %r112; + ld.shared.v2.b16 {%rs5, %rs6}, [%r113]; + xor.b32 %r114, %r108, 140; + add.s32 %r115, %r86, %r114; + ld.shared.v2.b16 {%rs7, %rs8}, [%r115]; + cvt.f32.bf16 %r116, %rs1; + cvt.f32.bf16 %r117, %rs3; + cvt.f32.bf16 %r118, %rs2; + cvt.f32.bf16 %r119, %rs4; + cvt.f32.bf16 %r120, %rs5; + cvt.f32.bf16 %r121, %rs7; + cvt.f32.bf16 %r122, %rs6; + cvt.f32.bf16 %r123, %rs8; + .loc 1 38 30 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:38:30 + mad.wide.s32 %rd3, %r49, 4, %rd17; + .loc 1 38 80 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:38:80 + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r6, %r5; + mov.u32 %r7, %r5; + mov.u32 %r8, %r5; + mov.u32 %r9, %r5; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd3 + 0 ], %rd4; + // end inline asm + // begin inline asm + mov.u64 %rd5, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd5, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r10, %r5; + mov.u32 %r11, %r5; + mov.u32 %r12, %r5; + mov.u32 %r13, %r5; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd3 + 0 ], %rd5; + // end inline asm + mov.b32 %r124, 0f43000000; + .loc 1 40 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:40:19 + div.full.f32 %r125, %r6, %r124; + div.full.f32 %r126, %r7, %r124; + div.full.f32 %r127, %r8, %r124; + div.full.f32 %r128, %r9, %r124; + div.full.f32 %r129, %r10, %r124; + div.full.f32 %r130, %r11, %r124; + div.full.f32 %r131, %r12, %r124; + div.full.f32 %r132, %r13, %r124; + .loc 1 42 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:42:19 + add.f32 %r133, %r125, 0f358637BD; + add.f32 %r134, %r126, 0f358637BD; + add.f32 %r135, %r127, 0f358637BD; + add.f32 %r136, %r128, 0f358637BD; + add.f32 %r137, %r129, 0f358637BD; + add.f32 %r138, %r130, 0f358637BD; + add.f32 %r139, %r131, 0f358637BD; + add.f32 %r140, %r132, 0f358637BD; + .loc 1 43 28 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:43:28 + rsqrt.approx.ftz.f32 %r141, %r133; + rsqrt.approx.ftz.f32 %r142, %r134; + rsqrt.approx.ftz.f32 %r143, %r135; + rsqrt.approx.ftz.f32 %r144, %r136; + rsqrt.approx.ftz.f32 %r145, %r137; + rsqrt.approx.ftz.f32 %r146, %r138; + rsqrt.approx.ftz.f32 %r147, %r139; + rsqrt.approx.ftz.f32 %r148, %r140; + .loc 1 44 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:44:19 + mul.f32 %r149, %r141, %r116; + mul.f32 %r150, %r142, %r117; + mul.f32 %r151, %r143, %r118; + mul.f32 %r152, %r144, %r119; + mul.f32 %r153, %r145, %r120; + mul.f32 %r154, %r146, %r121; + mul.f32 %r155, %r147, %r122; + mul.f32 %r156, %r148, %r123; + bar.sync 0; + and.b32 %r157, %r73, 2052; + mul.lo.s32 %r158, %r102, 272; + xor.b32 %r159, %r158, %r44; + or.b32 %r160, %r157, %r159; + or.b32 %r161, %r160, %r103; + add.s32 %r162, %r86, %r161; + st.shared.b32 [%r162], %r149; + st.shared.b32 [%r162+128], %r151; + xor.b32 %r163, %r161, 264; + add.s32 %r164, %r86, %r163; + st.shared.b32 [%r164], %r150; + st.shared.b32 [%r164+128], %r152; + xor.b32 %r165, %r161, 4; + add.s32 %r166, %r86, %r165; + st.shared.b32 [%r166], %r153; + st.shared.b32 [%r166+128], %r155; + xor.b32 %r167, %r161, 268; + add.s32 %r168, %r86, %r167; + st.shared.b32 [%r168], %r154; + st.shared.b32 [%r168+128], %r156; + bar.sync 0; + and.b32 %r169, %r71, 368; + and.b32 %r170, %r54, 12; + shl.b32 %r171, %r78, 2; + or.b32 %r172, %r169, %r170; + xor.b32 %r173, %r172, %r81; + or.b32 %r174, %r173, %r171; + add.s32 %r175, %r86, %r174; + ld.shared.b32 %r176, [%r175]; + xor.b32 %r177, %r174, 32; + add.s32 %r178, %r86, %r177; + ld.shared.b32 %r179, [%r178+512]; + xor.b32 %r180, %r174, 64; + add.s32 %r181, %r86, %r180; + ld.shared.b32 %r182, [%r181+1024]; + xor.b32 %r183, %r174, 96; + add.s32 %r184, %r86, %r183; + ld.shared.b32 %r185, [%r184+1536]; + xor.b32 %r186, %r174, 4; + add.s32 %r187, %r86, %r186; + ld.shared.b32 %r188, [%r187+2048]; + xor.b32 %r189, %r174, 36; + add.s32 %r190, %r86, %r189; + ld.shared.b32 %r191, [%r190+2560]; + xor.b32 %r192, %r174, 68; + add.s32 %r193, %r86, %r192; + ld.shared.b32 %r194, [%r193+3072]; + xor.b32 %r195, %r174, 100; + add.s32 %r196, %r86, %r195; + ld.shared.b32 %r197, [%r196+3584]; + .loc 1 45 31 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:31 + mul.wide.s32 %rd23, %r56, 2; + add.s64 %rd6, %rd18, %rd23; + .loc 1 45 71 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:71 + // begin inline asm + mov.u64 %rd7, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd7, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r14, %r5; + mov.u32 %r15, %r5; + mov.u32 %r16, %r5; + mov.u32 %r17, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd6 + 0 ], %rd7; + // end inline asm + .loc 1 54 45 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:45 + add.s32 %r198, %r66, -3145728; + .loc 1 54 31 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:31 + mad.wide.s32 %rd8, %r198, 2, %rd19; + .loc 1 54 83 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:83 + add.s32 %r199, %r42, -8192; + setp.lt.u32 %p10, %r199, 65536; + and.pred %p3, %p6, %p10; + and.pred %p4, %p7, %p10; + .loc 1 54 67 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:67 + // begin inline asm + mov.u64 %rd9, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd9, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r18, %r5; + mov.u32 %r19, %r5; + mov.u32 %r20, %r5; + mov.u32 %r21, %r5; + @%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd8 + 0 ], %rd9; + // end inline asm + shr.u32 %r200, %r18, 16; + shr.u32 %r201, %r19, 16; + shr.u32 %r202, %r20, 16; + shr.u32 %r203, %r21, 16; + .loc 1 54 134 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134 + bar.sync 0; + st.shared.b16 [%r87], %r18; + st.shared.b16 [%r89+256], %r200; + st.shared.b16 [%r91+512], %r19; + st.shared.b16 [%r93+768], %r201; + st.shared.b16 [%r95+1024], %r20; + st.shared.b16 [%r97+1280], %r202; + st.shared.b16 [%r99+1536], %r21; + st.shared.b16 [%r101+1792], %r203; + bar.sync 0; + ld.shared.v2.b16 {%rs9, %rs10}, [%r109]; + ld.shared.v2.b16 {%rs11, %rs12}, [%r111]; + ld.shared.v2.b16 {%rs13, %rs14}, [%r113]; + ld.shared.v2.b16 {%rs15, %rs16}, [%r115]; + .loc 1 56 52 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:52 + add.s32 %r204, %r49, -8192; + .loc 1 56 31 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:31 + mad.wide.s32 %rd10, %r204, 4, %rd20; + .loc 1 56 90 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:90 + // begin inline asm + mov.u64 %rd11, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd11, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r22, %r5; + mov.u32 %r23, %r5; + mov.u32 %r24, %r5; + mov.u32 %r25, %r5; + @%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r22, %r23, %r24, %r25 }, [ %rd10 + 0 ], %rd11; + // end inline asm + // begin inline asm + mov.u64 %rd12, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd12, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r26, %r5; + mov.u32 %r27, %r5; + mov.u32 %r28, %r5; + mov.u32 %r29, %r5; + @%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd10 + 0 ], %rd12; + // end inline asm + .loc 1 58 21 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:58:21 + div.full.f32 %r205, %r22, %r124; + div.full.f32 %r206, %r23, %r124; + div.full.f32 %r207, %r24, %r124; + div.full.f32 %r208, %r25, %r124; + div.full.f32 %r209, %r26, %r124; + div.full.f32 %r210, %r27, %r124; + div.full.f32 %r211, %r28, %r124; + div.full.f32 %r212, %r29, %r124; + .loc 1 60 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:60:20 + add.f32 %r213, %r205, 0f358637BD; + add.f32 %r214, %r206, 0f358637BD; + add.f32 %r215, %r207, 0f358637BD; + add.f32 %r216, %r208, 0f358637BD; + add.f32 %r217, %r209, 0f358637BD; + add.f32 %r218, %r210, 0f358637BD; + add.f32 %r219, %r211, 0f358637BD; + add.f32 %r220, %r212, 0f358637BD; + .loc 1 61 28 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:61:28 + rsqrt.approx.ftz.f32 %r221, %r213; + rsqrt.approx.ftz.f32 %r222, %r214; + rsqrt.approx.ftz.f32 %r223, %r215; + rsqrt.approx.ftz.f32 %r224, %r216; + rsqrt.approx.ftz.f32 %r225, %r217; + rsqrt.approx.ftz.f32 %r226, %r218; + rsqrt.approx.ftz.f32 %r227, %r219; + rsqrt.approx.ftz.f32 %r228, %r220; + .loc 1 54 134 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134 + cvt.f32.bf16 %r229, %rs16; + cvt.f32.bf16 %r230, %rs14; + cvt.f32.bf16 %r231, %rs15; + cvt.f32.bf16 %r232, %rs13; + cvt.f32.bf16 %r233, %rs12; + cvt.f32.bf16 %r234, %rs10; + cvt.f32.bf16 %r235, %rs11; + cvt.f32.bf16 %r236, %rs9; + .loc 1 23 21 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:23:21 + setp.lt.s32 %p11, %r48, 73728; + .loc 1 62 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:62:20 + mul.f32 %r237, %r221, %r236; + mul.f32 %r238, %r222, %r235; + mul.f32 %r239, %r223, %r234; + mul.f32 %r240, %r224, %r233; + mul.f32 %r241, %r225, %r232; + mul.f32 %r242, %r226, %r231; + mul.f32 %r243, %r227, %r230; + mul.f32 %r244, %r228, %r229; + bar.sync 0; + st.shared.b32 [%r162], %r237; + st.shared.b32 [%r162+128], %r239; + st.shared.b32 [%r164], %r238; + st.shared.b32 [%r164+128], %r240; + st.shared.b32 [%r166], %r241; + st.shared.b32 [%r166+128], %r243; + st.shared.b32 [%r168], %r242; + st.shared.b32 [%r168+128], %r244; + bar.sync 0; + ld.shared.b32 %r245, [%r175]; + ld.shared.b32 %r246, [%r178+512]; + ld.shared.b32 %r247, [%r181+1024]; + ld.shared.b32 %r248, [%r184+1536]; + ld.shared.b32 %r249, [%r187+2048]; + ld.shared.b32 %r250, [%r190+2560]; + ld.shared.b32 %r251, [%r193+3072]; + ld.shared.b32 %r252, [%r196+3584]; + .loc 1 63 31 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:31 + add.s64 %rd13, %rd21, %rd23; + .loc 1 63 71 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:71 + // begin inline asm + mov.u64 %rd14, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd14, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r30, %r5; + mov.u32 %r31, %r5; + mov.u32 %r32, %r5; + mov.u32 %r33, %r5; + @%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r30, %r31, %r32, %r33 }, [ %rd13 + 0 ], %rd14; + // end inline asm + .loc 1 70 34 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:34 + shl.b32 %r253, %r48, 7; + .loc 1 70 30 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:30 + add.s32 %r254, %r253, %r56; + .loc 1 70 25 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:25 + mad.wide.s32 %rd15, %r254, 2, %rd22; + .loc 1 70 54 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:54 + and.pred %p5, %p6, %p11; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs17, %rs18}, %r14; + cvt.f32.bf16 %r255, %rs17; + cvt.f32.bf16 %r256, %rs18; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r257, %r179, %r256; + mul.f32 %r258, %r176, %r255; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs19, %rs20}, %r30; + cvt.f32.bf16 %r259, %rs19; + cvt.f32.bf16 %r260, %rs20; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r261, %r246, %r260; + mul.f32 %r262, %r245, %r259; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r263, %r258, %r262, %p8; + selp.f32 %r264, %r257, %r261, %p8; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r34, %r264, %r263; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs21, %rs22}, %r15; + cvt.f32.bf16 %r265, %rs21; + cvt.f32.bf16 %r266, %rs22; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r267, %r185, %r266; + mul.f32 %r268, %r182, %r265; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs23, %rs24}, %r31; + cvt.f32.bf16 %r269, %rs23; + cvt.f32.bf16 %r270, %rs24; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r271, %r248, %r270; + mul.f32 %r272, %r247, %r269; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r273, %r268, %r272, %p8; + selp.f32 %r274, %r267, %r271, %p8; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r35, %r274, %r273; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs25, %rs26}, %r16; + cvt.f32.bf16 %r275, %rs25; + cvt.f32.bf16 %r276, %rs26; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r277, %r191, %r276; + mul.f32 %r278, %r188, %r275; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs27, %rs28}, %r32; + cvt.f32.bf16 %r279, %rs27; + cvt.f32.bf16 %r280, %rs28; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r281, %r250, %r280; + mul.f32 %r282, %r249, %r279; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r283, %r278, %r282, %p8; + selp.f32 %r284, %r277, %r281, %p8; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r36, %r284, %r283; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs29, %rs30}, %r17; + cvt.f32.bf16 %r285, %rs29; + cvt.f32.bf16 %r286, %rs30; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r287, %r197, %r286; + mul.f32 %r288, %r194, %r285; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs31, %rs32}, %r33; + cvt.f32.bf16 %r289, %rs31; + cvt.f32.bf16 %r290, %rs32; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r291, %r252, %r290; + mul.f32 %r292, %r251, %r289; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r293, %r288, %r292, %p8; + selp.f32 %r294, %r287, %r291, %p8; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r37, %r294, %r293; + // begin inline asm + @%p5 st.global.v4.b32 [ %rd15 + 0 ], { %r34, %r35, %r36, %r37 }; + // end inline asm + .loc 1 70 4 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 50 +.b8 104 +.b8 105 +.b8 106 +.b8 51 +.b8 104 +.b8 109 +.b8 108 +.b8 111 +.b8 117 +.b8 109 +.b8 120 +.b8 100 +.b8 109 +.b8 104 +.b8 117 +.b8 101 +.b8 122 +.b8 115 +.b8 121 +.b8 104 +.b8 107 +.b8 109 +.b8 110 +.b8 113 +.b8 103 +.b8 110 +.b8 102 +.b8 97 +.b8 53 +.b8 105 +.b8 118 +.b8 114 +.b8 101 +.b8 50 +.b8 55 +.b8 117 +.b8 111 +.b8 115 +.b8 121 +.b8 109 +.b8 97 +.b8 109 +.b8 51 +.b8 100 +.b8 114 +.b8 55 +.b8 97 +.b8 53 +.b8 120 +.b8 98 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 50 +.b8 104 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.source b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.source new file mode 100644 index 0000000000000000000000000000000000000000..fb33074b29fb0dac06ff56ab95cbae14db960018 --- /dev/null +++ b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.source @@ -0,0 +1,415 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0) +#loc99 = loc("in_ptr0"(#loc)) +#loc100 = loc("in_ptr1"(#loc)) +#loc101 = loc("in_ptr2"(#loc)) +#loc102 = loc("in_ptr3"(#loc)) +#loc103 = loc("in_ptr4"(#loc)) +#loc104 = loc("in_ptr5"(#loc)) +#loc105 = loc("out_ptr0"(#loc)) +#loc106 = loc("ynumel"(#loc)) +#loc107 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %ynumel_0 = arith.constant 73728 : i32 loc(#loc108) + %xnumel_1 = arith.constant 128 : i32 loc(#loc109) + %yoffset = tt.get_program_id y : i32 loc(#loc110) + %yoffset_2 = tt.get_program_id z : i32 loc(#loc111) + %yoffset_3 = tt.get_num_programs y : i32 loc(#loc112) + %yoffset_4 = arith.muli %yoffset_2, %yoffset_3 : i32 loc(#loc113) + %yoffset_5 = arith.addi %yoffset, %yoffset_4 : i32 loc(#loc114) + %yoffset_6 = arith.constant 8 : i32 loc(#loc115) + %yoffset_7 = arith.constant 8 : i32 loc(#loc115) + %yoffset_8 = arith.muli %yoffset_5, %yoffset_7 : i32 loc(#loc115) + %yindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc116) + %yindex_9 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc117) + %yindex_10 = tt.splat %yoffset_8 : i32 -> tensor<8x1xi32> loc(#loc118) + %yindex_11 = arith.addi %yindex_10, %yindex_9 : tensor<8x1xi32> loc(#loc118) + %ymask = arith.constant dense<73728> : tensor<8x1xi32> loc(#loc119) + %ymask_12 = arith.cmpi slt, %yindex_11, %ymask : tensor<8x1xi32> loc(#loc119) + %xoffset = tt.get_program_id x : i32 loc(#loc120) + %xoffset_13 = arith.constant 128 : i32 loc(#loc121) + %xoffset_14 = arith.constant 128 : i32 loc(#loc121) + %xoffset_15 = arith.muli %xoffset, %xoffset_14 : i32 loc(#loc121) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc122) + %xindex_16 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc123) + %xindex_17 = tt.splat %xoffset_15 : i32 -> tensor<1x128xi32> loc(#loc124) + %xindex_18 = arith.addi %xindex_17, %xindex_16 : tensor<1x128xi32> loc(#loc124) + %xmask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc125) + %xmask_19 = arith.cmpi slt, %xindex_18, %xmask : tensor<1x128xi32> loc(#loc125) + %y1 = arith.constant 32 : i32 loc(#loc126) + %y1_20 = arith.constant 32 : i32 loc(#loc126) + %y1_21 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc126) + %y1_22 = arith.divsi %yindex_11, %y1_21 : tensor<8x1xi32> loc(#loc126) + %y0 = arith.constant 32 : i32 loc(#loc127) + %y0_23 = arith.constant 32 : i32 loc(#loc127) + %y0_24 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc127) + %y0_25 = arith.remsi %yindex_11, %y0_24 : tensor<8x1xi32> loc(#loc127) + %tmp1 = arith.constant 0 : i64 loc(#loc128) + %tmp1_26 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc128) + %tmp2 = arith.extsi %y1_22 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc129) + %tmp2_27 = arith.constant dense<0> : tensor<8x1xi64> loc(#loc129) + %tmp2_28 = arith.cmpi sge, %tmp2, %tmp2_27 : tensor<8x1xi64> loc(#loc129) + %tmp3 = arith.constant 256 : i64 loc(#loc130) + %tmp3_29 = arith.constant dense<256> : tensor<1x1xi64> loc(#loc130) + %tmp4 = arith.extsi %y1_22 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc131) + %tmp4_30 = arith.constant dense<256> : tensor<8x1xi64> loc(#loc131) + %tmp4_31 = arith.cmpi slt, %tmp4, %tmp4_30 : tensor<8x1xi64> loc(#loc131) + %tmp5 = arith.constant 128 : i32 loc(#loc132) + %tmp5_32 = arith.constant 128 : i32 loc(#loc132) + %tmp5_33 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc132) + %tmp5_34 = arith.muli %tmp5_33, %y0_25 : tensor<8x1xi32> loc(#loc132) + %tmp5_35 = tt.broadcast %xindex_18 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc133) + %tmp5_36 = tt.broadcast %tmp5_34 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc133) + %tmp5_37 = arith.addi %tmp5_35, %tmp5_36 : tensor<8x128xi32> loc(#loc133) + %tmp5_38 = arith.constant 12288 : i32 loc(#loc134) + %tmp5_39 = arith.constant 12288 : i32 loc(#loc134) + %tmp5_40 = arith.constant dense<12288> : tensor<8x1xi32> loc(#loc134) + %tmp5_41 = arith.muli %tmp5_40, %y1_22 : tensor<8x1xi32> loc(#loc134) + %tmp5_42 = tt.broadcast %tmp5_41 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc135) + %tmp5_43 = arith.addi %tmp5_37, %tmp5_42 : tensor<8x128xi32> loc(#loc135) + %tmp5_44 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc136) + %tmp5_45 = tt.addptr %tmp5_44, %tmp5_43 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc136) + %tmp5_46 = tt.broadcast %tmp4_31 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc137) + %tmp5_47 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc137) + %tmp5_48 = arith.andi %tmp5_46, %tmp5_47 : tensor<8x128xi1> loc(#loc137) + %tmp5_49 = tt.broadcast %ymask_12 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc138) + %tmp5_50 = arith.andi %tmp5_48, %tmp5_49 : tensor<8x128xi1> loc(#loc138) + %tmp5_51 = arith.constant 0.000000e+00 : f32 loc(#loc139) + %tmp5_52 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc139) + %tmp5_53 = arith.truncf %tmp5_52 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc139) + %tmp5_54 = tt.load %tmp5_45, %tmp5_50, %tmp5_53 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc139) + %tmp5_55 = arith.extf %tmp5_54 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc140) + %tmp7 = arith.constant 32 : i32 loc(#loc141) + %tmp7_56 = arith.constant 32 : i32 loc(#loc141) + %tmp7_57 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc141) + %tmp7_58 = arith.muli %tmp7_57, %y1_22 : tensor<8x1xi32> loc(#loc141) + %tmp7_59 = arith.addi %y0_25, %tmp7_58 : tensor<8x1xi32> loc(#loc142) + %tmp7_60 = tt.broadcast %tmp7_59 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc143) + %tmp7_61 = tt.splat %in_ptr1 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc144) + %tmp7_62 = tt.addptr %tmp7_61, %tmp7_60 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc144) + %tmp7_63 = tt.broadcast %tmp4_31 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc145) + %tmp7_64 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc145) + %tmp7_65 = arith.andi %tmp7_63, %tmp7_64 : tensor<8x128xi1> loc(#loc145) + %tmp7_66 = tt.broadcast %ymask_12 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc146) + %tmp7_67 = arith.andi %tmp7_65, %tmp7_66 : tensor<8x128xi1> loc(#loc146) + %tmp7_68 = arith.constant 0.000000e+00 : f32 loc(#loc147) + %tmp7_69 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc147) + %tmp7_70 = tt.load %tmp7_62, %tmp7_67, %tmp7_69 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc147) + %tmp8 = arith.constant 1.280000e+02 : f32 loc(#loc148) + %tmp9 = arith.constant dense<1.280000e+02> : tensor<8x128xf32> loc(#loc149) + %tmp9_71 = arith.divf %tmp7_70, %tmp9 : tensor<8x128xf32> loc(#loc149) + %tmp10 = arith.constant 9.99999997E-7 : f32 loc(#loc150) + %tmp11 = arith.constant dense<9.99999997E-7> : tensor<8x128xf32> loc(#loc151) + %tmp11_72 = arith.addf %tmp9_71, %tmp11 : tensor<8x128xf32> loc(#loc151) + %tmp12 = tt.extern_elementwise %tmp11_72 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x128xf32>) -> tensor<8x128xf32> loc(#loc152) + %tmp13 = arith.mulf %tmp5_55, %tmp12 : tensor<8x128xf32> loc(#loc153) + %tmp14 = tt.broadcast %xindex_18 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc154) + %tmp14_73 = tt.splat %in_ptr2 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc155) + %tmp14_74 = tt.addptr %tmp14_73, %tmp14 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc155) + %tmp14_75 = tt.broadcast %tmp4_31 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc156) + %tmp14_76 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc156) + %tmp14_77 = arith.andi %tmp14_75, %tmp14_76 : tensor<8x128xi1> loc(#loc156) + %tmp14_78 = tt.broadcast %ymask_12 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc157) + %tmp14_79 = arith.andi %tmp14_77, %tmp14_78 : tensor<8x128xi1> loc(#loc157) + %tmp14_80 = arith.constant 0.000000e+00 : f32 loc(#loc158) + %tmp14_81 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc158) + %tmp14_82 = arith.truncf %tmp14_81 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc158) + %tmp14_83 = tt.load %tmp14_74, %tmp14_79, %tmp14_82 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc158) + %tmp14_84 = arith.extf %tmp14_83 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc159) + %tmp16 = arith.mulf %tmp13, %tmp14_84 : tensor<8x128xf32> loc(#loc160) + %tmp18 = arith.constant 0.000000e+00 : f32 loc(#loc161) + %tmp18_85 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc161) + %tmp19 = tt.broadcast %tmp4_31 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc162) + %tmp19_86 = arith.select %tmp19, %tmp16, %tmp18_85 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc162) + %tmp20 = arith.extsi %y1_22 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc163) + %tmp20_87 = arith.constant dense<256> : tensor<8x1xi64> loc(#loc163) + %tmp20_88 = arith.cmpi sge, %tmp20, %tmp20_87 : tensor<8x1xi64> loc(#loc163) + %tmp21 = arith.constant 2304 : i64 loc(#loc164) + %tmp21_89 = arith.constant dense<2304> : tensor<1x1xi64> loc(#loc164) + %tmp22 = arith.extsi %y1_22 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc165) + %tmp22_90 = arith.constant dense<2304> : tensor<8x1xi64> loc(#loc165) + %tmp22_91 = arith.cmpi slt, %tmp22, %tmp22_90 : tensor<8x1xi64> loc(#loc165) + %tmp23 = arith.constant 128 : i32 loc(#loc166) + %tmp23_92 = arith.constant 128 : i32 loc(#loc166) + %tmp23_93 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc166) + %tmp23_94 = arith.muli %tmp23_93, %y0_25 : tensor<8x1xi32> loc(#loc166) + %tmp23_95 = tt.broadcast %xindex_18 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc167) + %tmp23_96 = tt.broadcast %tmp23_94 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc167) + %tmp23_97 = arith.addi %tmp23_95, %tmp23_96 : tensor<8x128xi32> loc(#loc167) + %tmp23_98 = arith.constant -256 : i32 loc(#loc168) + %tmp23_99 = arith.constant -256 : i32 loc(#loc168) + %tmp23_100 = arith.constant dense<-256> : tensor<8x1xi32> loc(#loc168) + %tmp23_101 = arith.addi %tmp23_100, %y1_22 : tensor<8x1xi32> loc(#loc168) + %tmp23_102 = arith.constant 12288 : i32 loc(#loc169) + %tmp23_103 = arith.constant 12288 : i32 loc(#loc169) + %tmp23_104 = arith.constant dense<12288> : tensor<8x1xi32> loc(#loc169) + %tmp23_105 = arith.muli %tmp23_104, %tmp23_101 : tensor<8x1xi32> loc(#loc169) + %tmp23_106 = tt.broadcast %tmp23_105 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc170) + %tmp23_107 = arith.addi %tmp23_97, %tmp23_106 : tensor<8x128xi32> loc(#loc170) + %tmp23_108 = tt.splat %in_ptr3 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc171) + %tmp23_109 = tt.addptr %tmp23_108, %tmp23_107 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc171) + %tmp23_110 = tt.broadcast %tmp20_88 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc172) + %tmp23_111 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc172) + %tmp23_112 = arith.andi %tmp23_110, %tmp23_111 : tensor<8x128xi1> loc(#loc172) + %tmp23_113 = tt.broadcast %ymask_12 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc173) + %tmp23_114 = arith.andi %tmp23_112, %tmp23_113 : tensor<8x128xi1> loc(#loc173) + %tmp23_115 = arith.constant 0.000000e+00 : f32 loc(#loc174) + %tmp23_116 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc174) + %tmp23_117 = arith.truncf %tmp23_116 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc174) + %tmp23_118 = tt.load %tmp23_109, %tmp23_114, %tmp23_117 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc174) + %tmp23_119 = arith.extf %tmp23_118 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc175) + %tmp25 = arith.constant -256 : i32 loc(#loc176) + %tmp25_120 = arith.constant -256 : i32 loc(#loc176) + %tmp25_121 = arith.constant dense<-256> : tensor<8x1xi32> loc(#loc176) + %tmp25_122 = arith.addi %tmp25_121, %y1_22 : tensor<8x1xi32> loc(#loc176) + %tmp25_123 = arith.constant 32 : i32 loc(#loc177) + %tmp25_124 = arith.constant 32 : i32 loc(#loc177) + %tmp25_125 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc177) + %tmp25_126 = arith.muli %tmp25_125, %tmp25_122 : tensor<8x1xi32> loc(#loc177) + %tmp25_127 = arith.addi %y0_25, %tmp25_126 : tensor<8x1xi32> loc(#loc178) + %tmp25_128 = tt.broadcast %tmp25_127 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc179) + %tmp25_129 = tt.splat %in_ptr4 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc180) + %tmp25_130 = tt.addptr %tmp25_129, %tmp25_128 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc180) + %tmp25_131 = tt.broadcast %tmp20_88 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc181) + %tmp25_132 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc181) + %tmp25_133 = arith.andi %tmp25_131, %tmp25_132 : tensor<8x128xi1> loc(#loc181) + %tmp25_134 = tt.broadcast %ymask_12 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc182) + %tmp25_135 = arith.andi %tmp25_133, %tmp25_134 : tensor<8x128xi1> loc(#loc182) + %tmp25_136 = arith.constant 0.000000e+00 : f32 loc(#loc183) + %tmp25_137 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc183) + %tmp25_138 = tt.load %tmp25_130, %tmp25_135, %tmp25_137 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc183) + %tmp26 = arith.constant 1.280000e+02 : f32 loc(#loc184) + %tmp27 = arith.constant dense<1.280000e+02> : tensor<8x128xf32> loc(#loc185) + %tmp27_139 = arith.divf %tmp25_138, %tmp27 : tensor<8x128xf32> loc(#loc185) + %tmp28 = arith.constant 9.99999997E-7 : f32 loc(#loc186) + %tmp29 = arith.constant dense<9.99999997E-7> : tensor<8x128xf32> loc(#loc187) + %tmp29_140 = arith.addf %tmp27_139, %tmp29 : tensor<8x128xf32> loc(#loc187) + %tmp30 = tt.extern_elementwise %tmp29_140 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x128xf32>) -> tensor<8x128xf32> loc(#loc188) + %tmp31 = arith.mulf %tmp23_119, %tmp30 : tensor<8x128xf32> loc(#loc189) + %tmp32 = tt.broadcast %xindex_18 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc190) + %tmp32_141 = tt.splat %in_ptr5 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc191) + %tmp32_142 = tt.addptr %tmp32_141, %tmp32 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc191) + %tmp32_143 = tt.broadcast %tmp20_88 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc192) + %tmp32_144 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc192) + %tmp32_145 = arith.andi %tmp32_143, %tmp32_144 : tensor<8x128xi1> loc(#loc192) + %tmp32_146 = tt.broadcast %ymask_12 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc193) + %tmp32_147 = arith.andi %tmp32_145, %tmp32_146 : tensor<8x128xi1> loc(#loc193) + %tmp32_148 = arith.constant 0.000000e+00 : f32 loc(#loc194) + %tmp32_149 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc194) + %tmp32_150 = arith.truncf %tmp32_149 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc194) + %tmp32_151 = tt.load %tmp32_142, %tmp32_147, %tmp32_150 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc194) + %tmp32_152 = arith.extf %tmp32_151 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc195) + %tmp34 = arith.mulf %tmp31, %tmp32_152 : tensor<8x128xf32> loc(#loc196) + %tmp36 = arith.constant 0.000000e+00 : f32 loc(#loc197) + %tmp36_153 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc197) + %tmp37 = tt.broadcast %tmp20_88 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc198) + %tmp37_154 = arith.select %tmp37, %tmp34, %tmp36_153 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc198) + %tmp38 = tt.broadcast %tmp4_31 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc199) + %tmp38_155 = arith.select %tmp38, %tmp19_86, %tmp37_154 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc199) + %c128_i32 = arith.constant 128 : i32 loc(#loc93) + %c128_i32_156 = arith.constant 128 : i32 loc(#loc93) + %cst = arith.constant dense<128> : tensor<8x1xi32> loc(#loc93) + %0 = arith.muli %cst, %yindex_11 : tensor<8x1xi32> loc(#loc93) + %1 = tt.broadcast %xindex_18 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc94) + %2 = tt.broadcast %0 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc94) + %3 = arith.addi %1, %2 : tensor<8x128xi32> loc(#loc94) + %4 = tt.splat %out_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc95) + %5 = tt.addptr %4, %3 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc95) + %6 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc96) + %7 = tt.broadcast %ymask_12 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc96) + %8 = arith.andi %6, %7 : tensor<8x128xi1> loc(#loc96) + %9 = arith.truncf %tmp38_155 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc97) + tt.store %5, %9, %8 : tensor<8x128x!tt.ptr> loc(#loc97) + tt.return loc(#loc98) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":20:13) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:36) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:36) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":32:30) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":33:19) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":34:32) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:60) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:87) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:95) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":39:11) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":41:12) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:51) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:78) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:86) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":49:38) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":52:34) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":53:19) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:40) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:36) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:65) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:70) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:98) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:106) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":57:12) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":59:12) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:51) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:79) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:87) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":67:38) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4) +#loc108 = loc("ynumel"(#loc1)) +#loc109 = loc("xnumel"(#loc2)) +#loc110 = loc("yoffset"(#loc3)) +#loc111 = loc("yoffset"(#loc4)) +#loc112 = loc("yoffset"(#loc5)) +#loc113 = loc("yoffset"(#loc6)) +#loc114 = loc("yoffset"(#loc7)) +#loc115 = loc("yoffset"(#loc8)) +#loc116 = loc("yindex"(#loc9)) +#loc117 = loc("yindex"(#loc10)) +#loc118 = loc("yindex"(#loc11)) +#loc119 = loc("ymask"(#loc12)) +#loc120 = loc("xoffset"(#loc13)) +#loc121 = loc("xoffset"(#loc14)) +#loc122 = loc("xindex"(#loc15)) +#loc123 = loc("xindex"(#loc16)) +#loc124 = loc("xindex"(#loc17)) +#loc125 = loc("xmask"(#loc18)) +#loc126 = loc("y1"(#loc19)) +#loc127 = loc("y0"(#loc20)) +#loc128 = loc("tmp1"(#loc21)) +#loc129 = loc("tmp2"(#loc22)) +#loc130 = loc("tmp3"(#loc23)) +#loc131 = loc("tmp4"(#loc24)) +#loc132 = loc("tmp5"(#loc25)) +#loc133 = loc("tmp5"(#loc26)) +#loc134 = loc("tmp5"(#loc27)) +#loc135 = loc("tmp5"(#loc28)) +#loc136 = loc("tmp5"(#loc29)) +#loc137 = loc("tmp5"(#loc30)) +#loc138 = loc("tmp5"(#loc31)) +#loc139 = loc("tmp5"(#loc32)) +#loc140 = loc("tmp5"(#loc33)) +#loc141 = loc("tmp7"(#loc34)) +#loc142 = loc("tmp7"(#loc35)) +#loc143 = loc("tmp7"(#loc36)) +#loc144 = loc("tmp7"(#loc37)) +#loc145 = loc("tmp7"(#loc38)) +#loc146 = loc("tmp7"(#loc39)) +#loc147 = loc("tmp7"(#loc40)) +#loc148 = loc("tmp8"(#loc41)) +#loc149 = loc("tmp9"(#loc42)) +#loc150 = loc("tmp10"(#loc43)) +#loc151 = loc("tmp11"(#loc44)) +#loc152 = loc("tmp12"(#loc45)) +#loc153 = loc("tmp13"(#loc46)) +#loc154 = loc("tmp14"(#loc47)) +#loc155 = loc("tmp14"(#loc48)) +#loc156 = loc("tmp14"(#loc49)) +#loc157 = loc("tmp14"(#loc50)) +#loc158 = loc("tmp14"(#loc51)) +#loc159 = loc("tmp14"(#loc52)) +#loc160 = loc("tmp16"(#loc53)) +#loc161 = loc("tmp18"(#loc54)) +#loc162 = loc("tmp19"(#loc55)) +#loc163 = loc("tmp20"(#loc56)) +#loc164 = loc("tmp21"(#loc57)) +#loc165 = loc("tmp22"(#loc58)) +#loc166 = loc("tmp23"(#loc59)) +#loc167 = loc("tmp23"(#loc60)) +#loc168 = loc("tmp23"(#loc61)) +#loc169 = loc("tmp23"(#loc62)) +#loc170 = loc("tmp23"(#loc63)) +#loc171 = loc("tmp23"(#loc64)) +#loc172 = loc("tmp23"(#loc65)) +#loc173 = loc("tmp23"(#loc66)) +#loc174 = loc("tmp23"(#loc67)) +#loc175 = loc("tmp23"(#loc68)) +#loc176 = loc("tmp25"(#loc69)) +#loc177 = loc("tmp25"(#loc70)) +#loc178 = loc("tmp25"(#loc71)) +#loc179 = loc("tmp25"(#loc72)) +#loc180 = loc("tmp25"(#loc73)) +#loc181 = loc("tmp25"(#loc74)) +#loc182 = loc("tmp25"(#loc75)) +#loc183 = loc("tmp25"(#loc76)) +#loc184 = loc("tmp26"(#loc77)) +#loc185 = loc("tmp27"(#loc78)) +#loc186 = loc("tmp28"(#loc79)) +#loc187 = loc("tmp29"(#loc80)) +#loc188 = loc("tmp30"(#loc81)) +#loc189 = loc("tmp31"(#loc82)) +#loc190 = loc("tmp32"(#loc83)) +#loc191 = loc("tmp32"(#loc84)) +#loc192 = loc("tmp32"(#loc85)) +#loc193 = loc("tmp32"(#loc86)) +#loc194 = loc("tmp32"(#loc87)) +#loc195 = loc("tmp32"(#loc88)) +#loc196 = loc("tmp34"(#loc89)) +#loc197 = loc("tmp36"(#loc90)) +#loc198 = loc("tmp37"(#loc91)) +#loc199 = loc("tmp38"(#loc92)) diff --git a/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..f39978e3de144f8a83850e861421a5c781fa263a --- /dev/null +++ b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir @@ -0,0 +1,288 @@ +#blocked = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("in_ptr1"(#loc)) +#loc72 = loc("in_ptr2"(#loc)) +#loc73 = loc("in_ptr3"(#loc)) +#loc74 = loc("in_ptr4"(#loc)) +#loc75 = loc("in_ptr5"(#loc)) +#loc76 = loc("out_ptr0"(#loc)) +#loc77 = loc("ynumel"(#loc)) +#loc78 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<-256> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<-256> : tensor<8x1xi32, #blocked1> loc(#loc1) + %cst_1 = arith.constant dense<12288> : tensor<8x1xi32, #blocked1> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<8x1xi32, #blocked1> loc(#loc1) + %cst_3 = arith.constant dense<256> : tensor<8x1xi64, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<256> : tensor<8x1xi64, #blocked1> loc(#loc1) + %cst_5 = arith.constant dense<32> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<32> : tensor<8x1xi32, #blocked1> loc(#loc1) + %cst_7 = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_8 = arith.constant dense<128> : tensor<1x128xi32, #blocked1> loc(#loc1) + %cst_9 = arith.constant dense<73728> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_10 = arith.constant dense<73728> : tensor<8x1xi32, #blocked1> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<8x128xbf16, #blocked1> loc(#loc1) + %cst_12 = arith.constant dense<0.000000e+00> : tensor<8x128xf32, #blocked> loc(#loc1) + %cst_13 = arith.constant dense<9.99999997E-7> : tensor<8x128xf32, #blocked> loc(#loc1) + %cst_14 = arith.constant dense<1.280000e+02> : tensor<8x128xf32, #blocked> loc(#loc1) + %cst_15 = arith.constant dense<0.000000e+00> : tensor<8x128xf32, #blocked1> loc(#loc1) + %yoffset = tt.get_program_id y : i32 loc(#loc79) + %yoffset_16 = tt.get_program_id z : i32 loc(#loc80) + %yoffset_17 = tt.get_num_programs y : i32 loc(#loc81) + %yoffset_18 = arith.muli %yoffset_16, %yoffset_17 : i32 loc(#loc82) + %yoffset_19 = arith.addi %yoffset, %yoffset_18 : i32 loc(#loc83) + %yoffset_20 = arith.muli %yoffset_19, %c8_i32 : i32 loc(#loc84) + %yindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc85) + %yindex_21 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc85) + %yindex_22 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xi32, #blocked1> loc(#loc85) + %yindex_23 = tt.expand_dims %yindex_21 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc85) + %yindex_24 = tt.splat %yoffset_20 : i32 -> tensor<8x1xi32, #blocked1> loc(#loc86) + %yindex_25 = tt.splat %yoffset_20 : i32 -> tensor<8x1xi32, #blocked> loc(#loc86) + %yindex_26 = arith.addi %yindex_24, %yindex_22 : tensor<8x1xi32, #blocked1> loc(#loc86) + %yindex_27 = arith.addi %yindex_25, %yindex_23 : tensor<8x1xi32, #blocked> loc(#loc86) + %ymask = arith.cmpi slt, %yindex_26, %cst_10 : tensor<8x1xi32, #blocked1> loc(#loc87) + %ymask_28 = arith.cmpi slt, %yindex_27, %cst_9 : tensor<8x1xi32, #blocked> loc(#loc87) + %xoffset = tt.get_program_id x : i32 loc(#loc88) + %xoffset_29 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc89) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc90) + %xindex_30 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc90) + %xindex_31 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x128xi32, #blocked1> loc(#loc90) + %xindex_32 = tt.expand_dims %xindex_30 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc90) + %xindex_33 = tt.splat %xoffset_29 : i32 -> tensor<1x128xi32, #blocked1> loc(#loc91) + %xindex_34 = tt.splat %xoffset_29 : i32 -> tensor<1x128xi32, #blocked> loc(#loc91) + %xindex_35 = arith.addi %xindex_33, %xindex_31 : tensor<1x128xi32, #blocked1> loc(#loc91) + %xindex_36 = arith.addi %xindex_34, %xindex_32 : tensor<1x128xi32, #blocked> loc(#loc91) + %xmask = arith.cmpi slt, %xindex_35, %cst_8 : tensor<1x128xi32, #blocked1> loc(#loc92) + %xmask_37 = arith.cmpi slt, %xindex_36, %cst_7 : tensor<1x128xi32, #blocked> loc(#loc92) + %y1 = arith.divsi %yindex_26, %cst_6 : tensor<8x1xi32, #blocked1> loc(#loc93) + %y1_38 = arith.divsi %yindex_27, %cst_5 : tensor<8x1xi32, #blocked> loc(#loc93) + %y0 = arith.remsi %yindex_26, %cst_6 : tensor<8x1xi32, #blocked1> loc(#loc94) + %y0_39 = arith.remsi %yindex_27, %cst_5 : tensor<8x1xi32, #blocked> loc(#loc94) + %tmp4 = arith.extsi %y1 : tensor<8x1xi32, #blocked1> to tensor<8x1xi64, #blocked1> loc(#loc95) + %tmp4_40 = arith.extsi %y1_38 : tensor<8x1xi32, #blocked> to tensor<8x1xi64, #blocked> loc(#loc95) + %tmp4_41 = arith.cmpi slt, %tmp4, %cst_4 : tensor<8x1xi64, #blocked1> loc(#loc95) + %tmp4_42 = arith.cmpi slt, %tmp4_40, %cst_3 : tensor<8x1xi64, #blocked> loc(#loc95) + %tmp5 = arith.muli %y0, %cst_2 : tensor<8x1xi32, #blocked1> loc(#loc96) + %tmp5_43 = tt.broadcast %xindex_35 : tensor<1x128xi32, #blocked1> -> tensor<8x128xi32, #blocked1> loc(#loc97) + %tmp5_44 = tt.broadcast %tmp5 : tensor<8x1xi32, #blocked1> -> tensor<8x128xi32, #blocked1> loc(#loc97) + %tmp5_45 = arith.addi %tmp5_43, %tmp5_44 : tensor<8x128xi32, #blocked1> loc(#loc97) + %tmp5_46 = arith.muli %y1, %cst_1 : tensor<8x1xi32, #blocked1> loc(#loc98) + %tmp5_47 = tt.broadcast %tmp5_46 : tensor<8x1xi32, #blocked1> -> tensor<8x128xi32, #blocked1> loc(#loc99) + %tmp5_48 = arith.addi %tmp5_45, %tmp5_47 : tensor<8x128xi32, #blocked1> loc(#loc99) + %tmp5_49 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr, #blocked1> loc(#loc100) + %tmp5_50 = tt.addptr %tmp5_49, %tmp5_48 : tensor<8x128x!tt.ptr, #blocked1>, tensor<8x128xi32, #blocked1> loc(#loc100) + %tmp5_51 = tt.broadcast %tmp4_41 : tensor<8x1xi1, #blocked1> -> tensor<8x128xi1, #blocked1> loc(#loc101) + %tmp5_52 = tt.broadcast %tmp4_42 : tensor<8x1xi1, #blocked> -> tensor<8x128xi1, #blocked> loc(#loc101) + %tmp5_53 = tt.broadcast %xmask : tensor<1x128xi1, #blocked1> -> tensor<8x128xi1, #blocked1> loc(#loc101) + %tmp5_54 = tt.broadcast %xmask_37 : tensor<1x128xi1, #blocked> -> tensor<8x128xi1, #blocked> loc(#loc101) + %tmp5_55 = arith.andi %tmp5_51, %tmp5_53 : tensor<8x128xi1, #blocked1> loc(#loc101) + %tmp5_56 = arith.andi %tmp5_52, %tmp5_54 : tensor<8x128xi1, #blocked> loc(#loc101) + %tmp5_57 = tt.broadcast %ymask : tensor<8x1xi1, #blocked1> -> tensor<8x128xi1, #blocked1> loc(#loc102) + %tmp5_58 = tt.broadcast %ymask_28 : tensor<8x1xi1, #blocked> -> tensor<8x128xi1, #blocked> loc(#loc102) + %tmp5_59 = arith.andi %tmp5_55, %tmp5_57 : tensor<8x128xi1, #blocked1> loc(#loc102) + %tmp5_60 = arith.andi %tmp5_56, %tmp5_58 : tensor<8x128xi1, #blocked> loc(#loc102) + %tmp5_61 = tt.load %tmp5_50, %tmp5_59, %cst_11 evictionPolicy = evict_last : tensor<8x128x!tt.ptr, #blocked1> loc(#loc103) + %tmp5_62 = ttg.convert_layout %tmp5_61 : tensor<8x128xbf16, #blocked1> -> tensor<8x128xbf16, #blocked> loc(#loc104) + %tmp5_63 = arith.extf %tmp5_62 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc104) + %tmp7 = arith.muli %y1_38, %cst_5 : tensor<8x1xi32, #blocked> loc(#loc105) + %tmp7_64 = arith.addi %y0_39, %tmp7 : tensor<8x1xi32, #blocked> loc(#loc106) + %tmp7_65 = tt.splat %in_ptr1 : !tt.ptr -> tensor<8x1x!tt.ptr, #blocked> loc(#loc107) + %tmp7_66 = tt.addptr %tmp7_65, %tmp7_64 : tensor<8x1x!tt.ptr, #blocked>, tensor<8x1xi32, #blocked> loc(#loc107) + %tmp7_67 = tt.broadcast %tmp7_66 : tensor<8x1x!tt.ptr, #blocked> -> tensor<8x128x!tt.ptr, #blocked> loc(#loc107) + %tmp7_68 = tt.load %tmp7_67, %tmp5_60, %cst_12 evictionPolicy = evict_last : tensor<8x128x!tt.ptr, #blocked> loc(#loc108) + %tmp9 = arith.divf %tmp7_68, %cst_14 : tensor<8x128xf32, #blocked> loc(#loc109) + %tmp11 = arith.addf %tmp9, %cst_13 : tensor<8x128xf32, #blocked> loc(#loc110) + %tmp12 = tt.extern_elementwise %tmp11 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x128xf32, #blocked>) -> tensor<8x128xf32, #blocked> loc(#loc111) + %tmp13 = arith.mulf %tmp5_63, %tmp12 : tensor<8x128xf32, #blocked> loc(#loc112) + %tmp13_69 = ttg.convert_layout %tmp13 : tensor<8x128xf32, #blocked> -> tensor<8x128xf32, #blocked1> loc(#loc112) + %tmp14 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x128x!tt.ptr, #blocked1> loc(#loc113) + %tmp14_70 = tt.addptr %tmp14, %xindex_35 : tensor<1x128x!tt.ptr, #blocked1>, tensor<1x128xi32, #blocked1> loc(#loc113) + %tmp14_71 = tt.broadcast %tmp14_70 : tensor<1x128x!tt.ptr, #blocked1> -> tensor<8x128x!tt.ptr, #blocked1> loc(#loc113) + %tmp14_72 = tt.load %tmp14_71, %tmp5_59, %cst_11 evictionPolicy = evict_last : tensor<8x128x!tt.ptr, #blocked1> loc(#loc114) + %tmp14_73 = arith.extf %tmp14_72 : tensor<8x128xbf16, #blocked1> to tensor<8x128xf32, #blocked1> loc(#loc115) + %tmp16 = arith.mulf %tmp13_69, %tmp14_73 : tensor<8x128xf32, #blocked1> loc(#loc116) + %tmp20 = arith.cmpi sge, %tmp4, %cst_4 : tensor<8x1xi64, #blocked1> loc(#loc117) + %tmp20_74 = arith.cmpi sge, %tmp4_40, %cst_3 : tensor<8x1xi64, #blocked> loc(#loc117) + %tmp23 = arith.addi %y1, %cst_0 : tensor<8x1xi32, #blocked1> loc(#loc118) + %tmp23_75 = arith.addi %y1_38, %cst : tensor<8x1xi32, #blocked> loc(#loc118) + %tmp23_76 = arith.muli %tmp23, %cst_1 : tensor<8x1xi32, #blocked1> loc(#loc119) + %tmp23_77 = tt.broadcast %tmp23_76 : tensor<8x1xi32, #blocked1> -> tensor<8x128xi32, #blocked1> loc(#loc120) + %tmp23_78 = arith.addi %tmp5_45, %tmp23_77 : tensor<8x128xi32, #blocked1> loc(#loc120) + %tmp23_79 = tt.splat %in_ptr3 : !tt.ptr -> tensor<8x128x!tt.ptr, #blocked1> loc(#loc121) + %tmp23_80 = tt.addptr %tmp23_79, %tmp23_78 : tensor<8x128x!tt.ptr, #blocked1>, tensor<8x128xi32, #blocked1> loc(#loc121) + %tmp23_81 = tt.broadcast %tmp20 : tensor<8x1xi1, #blocked1> -> tensor<8x128xi1, #blocked1> loc(#loc122) + %tmp23_82 = tt.broadcast %tmp20_74 : tensor<8x1xi1, #blocked> -> tensor<8x128xi1, #blocked> loc(#loc122) + %tmp23_83 = arith.andi %tmp23_81, %tmp5_53 : tensor<8x128xi1, #blocked1> loc(#loc122) + %tmp23_84 = arith.andi %tmp23_82, %tmp5_54 : tensor<8x128xi1, #blocked> loc(#loc122) + %tmp23_85 = arith.andi %tmp23_83, %tmp5_57 : tensor<8x128xi1, #blocked1> loc(#loc123) + %tmp23_86 = arith.andi %tmp23_84, %tmp5_58 : tensor<8x128xi1, #blocked> loc(#loc123) + %tmp23_87 = tt.load %tmp23_80, %tmp23_85, %cst_11 evictionPolicy = evict_last : tensor<8x128x!tt.ptr, #blocked1> loc(#loc124) + %tmp23_88 = ttg.convert_layout %tmp23_87 : tensor<8x128xbf16, #blocked1> -> tensor<8x128xbf16, #blocked> loc(#loc125) + %tmp23_89 = arith.extf %tmp23_88 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc125) + %tmp25 = arith.muli %tmp23_75, %cst_5 : tensor<8x1xi32, #blocked> loc(#loc126) + %tmp25_90 = arith.addi %y0_39, %tmp25 : tensor<8x1xi32, #blocked> loc(#loc127) + %tmp25_91 = tt.splat %in_ptr4 : !tt.ptr -> tensor<8x1x!tt.ptr, #blocked> loc(#loc128) + %tmp25_92 = tt.addptr %tmp25_91, %tmp25_90 : tensor<8x1x!tt.ptr, #blocked>, tensor<8x1xi32, #blocked> loc(#loc128) + %tmp25_93 = tt.broadcast %tmp25_92 : tensor<8x1x!tt.ptr, #blocked> -> tensor<8x128x!tt.ptr, #blocked> loc(#loc128) + %tmp25_94 = tt.load %tmp25_93, %tmp23_86, %cst_12 evictionPolicy = evict_last : tensor<8x128x!tt.ptr, #blocked> loc(#loc129) + %tmp27 = arith.divf %tmp25_94, %cst_14 : tensor<8x128xf32, #blocked> loc(#loc130) + %tmp29 = arith.addf %tmp27, %cst_13 : tensor<8x128xf32, #blocked> loc(#loc131) + %tmp30 = tt.extern_elementwise %tmp29 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x128xf32, #blocked>) -> tensor<8x128xf32, #blocked> loc(#loc132) + %tmp31 = arith.mulf %tmp23_89, %tmp30 : tensor<8x128xf32, #blocked> loc(#loc133) + %tmp31_95 = ttg.convert_layout %tmp31 : tensor<8x128xf32, #blocked> -> tensor<8x128xf32, #blocked1> loc(#loc133) + %tmp32 = tt.splat %in_ptr5 : !tt.ptr -> tensor<1x128x!tt.ptr, #blocked1> loc(#loc134) + %tmp32_96 = tt.addptr %tmp32, %xindex_35 : tensor<1x128x!tt.ptr, #blocked1>, tensor<1x128xi32, #blocked1> loc(#loc134) + %tmp32_97 = tt.broadcast %tmp32_96 : tensor<1x128x!tt.ptr, #blocked1> -> tensor<8x128x!tt.ptr, #blocked1> loc(#loc134) + %tmp32_98 = tt.load %tmp32_97, %tmp23_85, %cst_11 evictionPolicy = evict_last : tensor<8x128x!tt.ptr, #blocked1> loc(#loc135) + %tmp32_99 = arith.extf %tmp32_98 : tensor<8x128xbf16, #blocked1> to tensor<8x128xf32, #blocked1> loc(#loc136) + %tmp34 = arith.mulf %tmp31_95, %tmp32_99 : tensor<8x128xf32, #blocked1> loc(#loc137) + %tmp37 = arith.select %tmp23_81, %tmp34, %cst_15 : tensor<8x128xi1, #blocked1>, tensor<8x128xf32, #blocked1> loc(#loc138) + %tmp38 = arith.select %tmp5_51, %tmp16, %tmp37 : tensor<8x128xi1, #blocked1>, tensor<8x128xf32, #blocked1> loc(#loc141) + %0 = arith.muli %yindex_26, %cst_2 : tensor<8x1xi32, #blocked1> loc(#loc64) + %1 = tt.broadcast %0 : tensor<8x1xi32, #blocked1> -> tensor<8x128xi32, #blocked1> loc(#loc65) + %2 = arith.addi %tmp5_43, %1 : tensor<8x128xi32, #blocked1> loc(#loc65) + %3 = tt.splat %out_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr, #blocked1> loc(#loc66) + %4 = tt.addptr %3, %2 : tensor<8x128x!tt.ptr, #blocked1>, tensor<8x128xi32, #blocked1> loc(#loc66) + %5 = arith.andi %tmp5_53, %tmp5_57 : tensor<8x128xi1, #blocked1> loc(#loc67) + %6 = arith.truncf %tmp38 : tensor<8x128xf32, #blocked1> to tensor<8x128xbf16, #blocked1> loc(#loc68) + tt.store %4, %6, %5 : tensor<8x128x!tt.ptr, #blocked1> loc(#loc68) + tt.return loc(#loc69) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4) +#loc79 = loc("yoffset"(#loc2)) +#loc80 = loc("yoffset"(#loc3)) +#loc81 = loc("yoffset"(#loc4)) +#loc82 = loc("yoffset"(#loc5)) +#loc83 = loc("yoffset"(#loc6)) +#loc84 = loc("yoffset"(#loc7)) +#loc85 = loc("yindex"(#loc8)) +#loc86 = loc("yindex"(#loc9)) +#loc87 = loc("ymask"(#loc10)) +#loc88 = loc("xoffset"(#loc11)) +#loc89 = loc("xoffset"(#loc12)) +#loc90 = loc("xindex"(#loc13)) +#loc91 = loc("xindex"(#loc14)) +#loc92 = loc("xmask"(#loc15)) +#loc93 = loc("y1"(#loc16)) +#loc94 = loc("y0"(#loc17)) +#loc95 = loc("tmp4"(#loc18)) +#loc96 = loc("tmp5"(#loc19)) +#loc97 = loc("tmp5"(#loc20)) +#loc98 = loc("tmp5"(#loc21)) +#loc99 = loc("tmp5"(#loc22)) +#loc100 = loc("tmp5"(#loc23)) +#loc101 = loc("tmp5"(#loc24)) +#loc102 = loc("tmp5"(#loc25)) +#loc103 = loc("tmp5"(#loc26)) +#loc104 = loc("tmp5"(#loc27)) +#loc105 = loc("tmp7"(#loc28)) +#loc106 = loc("tmp7"(#loc29)) +#loc107 = loc("tmp7"(#loc30)) +#loc108 = loc("tmp7"(#loc31)) +#loc109 = loc("tmp9"(#loc32)) +#loc110 = loc("tmp11"(#loc33)) +#loc111 = loc("tmp12"(#loc34)) +#loc112 = loc("tmp13"(#loc35)) +#loc113 = loc("tmp14"(#loc36)) +#loc114 = loc("tmp14"(#loc37)) +#loc115 = loc("tmp14"(#loc38)) +#loc116 = loc("tmp16"(#loc39)) +#loc117 = loc("tmp20"(#loc40)) +#loc118 = loc("tmp23"(#loc41)) +#loc119 = loc("tmp23"(#loc42)) +#loc120 = loc("tmp23"(#loc43)) +#loc121 = loc("tmp23"(#loc44)) +#loc122 = loc("tmp23"(#loc45)) +#loc123 = loc("tmp23"(#loc46)) +#loc124 = loc("tmp23"(#loc47)) +#loc125 = loc("tmp23"(#loc48)) +#loc126 = loc("tmp25"(#loc49)) +#loc127 = loc("tmp25"(#loc50)) +#loc128 = loc("tmp25"(#loc51)) +#loc129 = loc("tmp25"(#loc52)) +#loc130 = loc("tmp27"(#loc53)) +#loc131 = loc("tmp29"(#loc54)) +#loc132 = loc("tmp30"(#loc55)) +#loc133 = loc("tmp31"(#loc56)) +#loc134 = loc("tmp32"(#loc57)) +#loc135 = loc("tmp32"(#loc58)) +#loc136 = loc("tmp32"(#loc59)) +#loc137 = loc("tmp34"(#loc60)) +#loc138 = loc("tmp37"(#loc61)) +#loc139 = loc("tmp38"(#loc62)) +#loc140 = loc("tmp19"(#loc63)) +#loc141 = loc(fused[#loc139, #loc140]) diff --git a/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.ttir b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..7e341e428627154d59d283ecfe6fae7ca5b98f82 --- /dev/null +++ b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.ttir @@ -0,0 +1,256 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0) +#loc72 = loc("in_ptr0"(#loc)) +#loc73 = loc("in_ptr1"(#loc)) +#loc74 = loc("in_ptr2"(#loc)) +#loc75 = loc("in_ptr3"(#loc)) +#loc76 = loc("in_ptr4"(#loc)) +#loc77 = loc("in_ptr5"(#loc)) +#loc78 = loc("out_ptr0"(#loc)) +#loc79 = loc("ynumel"(#loc)) +#loc80 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<8x128xbf16> loc(#loc1) + %cst_0 = arith.constant dense<-256> : tensor<8x1xi32> loc(#loc1) + %cst_1 = arith.constant dense<9.99999997E-7> : tensor<8x128xf32> loc(#loc1) + %cst_2 = arith.constant dense<1.280000e+02> : tensor<8x128xf32> loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc1) + %cst_4 = arith.constant dense<12288> : tensor<8x1xi32> loc(#loc1) + %cst_5 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc1) + %cst_6 = arith.constant dense<256> : tensor<8x1xi64> loc(#loc1) + %cst_7 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc1) + %xmask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc81) + %ymask = arith.constant dense<73728> : tensor<8x1xi32> loc(#loc82) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %yoffset = tt.get_program_id y : i32 loc(#loc83) + %yoffset_8 = tt.get_program_id z : i32 loc(#loc84) + %yoffset_9 = tt.get_num_programs y : i32 loc(#loc85) + %yoffset_10 = arith.muli %yoffset_8, %yoffset_9 : i32 loc(#loc86) + %yoffset_11 = arith.addi %yoffset, %yoffset_10 : i32 loc(#loc87) + %yoffset_12 = arith.muli %yoffset_11, %c8_i32 : i32 loc(#loc88) + %yindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc89) + %yindex_13 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc90) + %yindex_14 = tt.splat %yoffset_12 : i32 -> tensor<8x1xi32> loc(#loc91) + %yindex_15 = arith.addi %yindex_14, %yindex_13 : tensor<8x1xi32> loc(#loc91) + %ymask_16 = arith.cmpi slt, %yindex_15, %ymask : tensor<8x1xi32> loc(#loc82) + %xoffset = tt.get_program_id x : i32 loc(#loc92) + %xoffset_17 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc93) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc94) + %xindex_18 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc95) + %xindex_19 = tt.splat %xoffset_17 : i32 -> tensor<1x128xi32> loc(#loc96) + %xindex_20 = arith.addi %xindex_19, %xindex_18 : tensor<1x128xi32> loc(#loc96) + %xmask_21 = arith.cmpi slt, %xindex_20, %xmask : tensor<1x128xi32> loc(#loc81) + %y1 = arith.divsi %yindex_15, %cst_7 : tensor<8x1xi32> loc(#loc97) + %y0 = arith.remsi %yindex_15, %cst_7 : tensor<8x1xi32> loc(#loc98) + %tmp4 = arith.extsi %y1 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc99) + %tmp4_22 = arith.cmpi slt, %tmp4, %cst_6 : tensor<8x1xi64> loc(#loc99) + %tmp5 = arith.muli %y0, %cst_5 : tensor<8x1xi32> loc(#loc100) + %tmp5_23 = tt.broadcast %xindex_20 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc101) + %tmp5_24 = tt.broadcast %tmp5 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc101) + %tmp5_25 = arith.addi %tmp5_23, %tmp5_24 : tensor<8x128xi32> loc(#loc101) + %tmp5_26 = arith.muli %y1, %cst_4 : tensor<8x1xi32> loc(#loc102) + %tmp5_27 = tt.broadcast %tmp5_26 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc103) + %tmp5_28 = arith.addi %tmp5_25, %tmp5_27 : tensor<8x128xi32> loc(#loc103) + %tmp5_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc104) + %tmp5_30 = tt.addptr %tmp5_29, %tmp5_28 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc104) + %tmp5_31 = tt.broadcast %tmp4_22 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc105) + %tmp5_32 = tt.broadcast %xmask_21 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc105) + %tmp5_33 = arith.andi %tmp5_31, %tmp5_32 : tensor<8x128xi1> loc(#loc105) + %tmp5_34 = tt.broadcast %ymask_16 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc106) + %tmp5_35 = arith.andi %tmp5_33, %tmp5_34 : tensor<8x128xi1> loc(#loc106) + %tmp5_36 = tt.load %tmp5_30, %tmp5_35, %cst evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc107) + %tmp5_37 = arith.extf %tmp5_36 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc108) + %tmp7 = arith.muli %y1, %cst_7 : tensor<8x1xi32> loc(#loc109) + %tmp7_38 = arith.addi %y0, %tmp7 : tensor<8x1xi32> loc(#loc110) + %tmp7_39 = tt.splat %in_ptr1 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc111) + %tmp7_40 = tt.addptr %tmp7_39, %tmp7_38 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc111) + %tmp7_41 = tt.broadcast %tmp7_40 : tensor<8x1x!tt.ptr> -> tensor<8x128x!tt.ptr> loc(#loc111) + %tmp7_42 = tt.load %tmp7_41, %tmp5_35, %cst_3 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc112) + %tmp9 = arith.divf %tmp7_42, %cst_2 : tensor<8x128xf32> loc(#loc113) + %tmp11 = arith.addf %tmp9, %cst_1 : tensor<8x128xf32> loc(#loc114) + %tmp12 = tt.extern_elementwise %tmp11 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x128xf32>) -> tensor<8x128xf32> loc(#loc115) + %tmp13 = arith.mulf %tmp5_37, %tmp12 : tensor<8x128xf32> loc(#loc116) + %tmp14 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc117) + %tmp14_43 = tt.addptr %tmp14, %xindex_20 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc117) + %tmp14_44 = tt.broadcast %tmp14_43 : tensor<1x128x!tt.ptr> -> tensor<8x128x!tt.ptr> loc(#loc117) + %tmp14_45 = tt.load %tmp14_44, %tmp5_35, %cst evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc118) + %tmp14_46 = arith.extf %tmp14_45 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc119) + %tmp16 = arith.mulf %tmp13, %tmp14_46 : tensor<8x128xf32> loc(#loc120) + %tmp19 = arith.select %tmp5_31, %tmp16, %cst_3 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc121) + %tmp20 = arith.cmpi sge, %tmp4, %cst_6 : tensor<8x1xi64> loc(#loc122) + %tmp23 = arith.addi %y1, %cst_0 : tensor<8x1xi32> loc(#loc123) + %tmp23_47 = arith.muli %tmp23, %cst_4 : tensor<8x1xi32> loc(#loc124) + %tmp23_48 = tt.broadcast %tmp23_47 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc125) + %tmp23_49 = arith.addi %tmp5_25, %tmp23_48 : tensor<8x128xi32> loc(#loc125) + %tmp23_50 = tt.splat %in_ptr3 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc126) + %tmp23_51 = tt.addptr %tmp23_50, %tmp23_49 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc126) + %tmp23_52 = tt.broadcast %tmp20 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc127) + %tmp23_53 = arith.andi %tmp23_52, %tmp5_32 : tensor<8x128xi1> loc(#loc127) + %tmp23_54 = arith.andi %tmp23_53, %tmp5_34 : tensor<8x128xi1> loc(#loc128) + %tmp23_55 = tt.load %tmp23_51, %tmp23_54, %cst evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc129) + %tmp23_56 = arith.extf %tmp23_55 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc130) + %tmp25 = arith.muli %tmp23, %cst_7 : tensor<8x1xi32> loc(#loc131) + %tmp25_57 = arith.addi %y0, %tmp25 : tensor<8x1xi32> loc(#loc132) + %tmp25_58 = tt.splat %in_ptr4 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc133) + %tmp25_59 = tt.addptr %tmp25_58, %tmp25_57 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc133) + %tmp25_60 = tt.broadcast %tmp25_59 : tensor<8x1x!tt.ptr> -> tensor<8x128x!tt.ptr> loc(#loc133) + %tmp25_61 = tt.load %tmp25_60, %tmp23_54, %cst_3 evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc134) + %tmp27 = arith.divf %tmp25_61, %cst_2 : tensor<8x128xf32> loc(#loc135) + %tmp29 = arith.addf %tmp27, %cst_1 : tensor<8x128xf32> loc(#loc136) + %tmp30 = tt.extern_elementwise %tmp29 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x128xf32>) -> tensor<8x128xf32> loc(#loc137) + %tmp31 = arith.mulf %tmp23_56, %tmp30 : tensor<8x128xf32> loc(#loc138) + %tmp32 = tt.splat %in_ptr5 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc139) + %tmp32_62 = tt.addptr %tmp32, %xindex_20 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc139) + %tmp32_63 = tt.broadcast %tmp32_62 : tensor<1x128x!tt.ptr> -> tensor<8x128x!tt.ptr> loc(#loc139) + %tmp32_64 = tt.load %tmp32_63, %tmp23_54, %cst evictionPolicy = evict_last : tensor<8x128x!tt.ptr> loc(#loc140) + %tmp32_65 = arith.extf %tmp32_64 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc141) + %tmp34 = arith.mulf %tmp31, %tmp32_65 : tensor<8x128xf32> loc(#loc142) + %tmp37 = arith.select %tmp23_52, %tmp34, %cst_3 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc143) + %tmp38 = arith.select %tmp5_31, %tmp19, %tmp37 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc144) + %0 = arith.muli %yindex_15, %cst_5 : tensor<8x1xi32> loc(#loc66) + %1 = tt.broadcast %0 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc67) + %2 = arith.addi %tmp5_23, %1 : tensor<8x128xi32> loc(#loc67) + %3 = tt.splat %out_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc68) + %4 = tt.addptr %3, %2 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc68) + %5 = arith.andi %tmp5_32, %tmp5_34 : tensor<8x128xi1> loc(#loc69) + %6 = arith.truncf %tmp38 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc70) + tt.store %4, %6, %5 : tensor<8x128x!tt.ptr> loc(#loc70) + tt.return loc(#loc71) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:36) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:36) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4) +#loc81 = loc("xmask"(#loc2)) +#loc82 = loc("ymask"(#loc3)) +#loc83 = loc("yoffset"(#loc4)) +#loc84 = loc("yoffset"(#loc5)) +#loc85 = loc("yoffset"(#loc6)) +#loc86 = loc("yoffset"(#loc7)) +#loc87 = loc("yoffset"(#loc8)) +#loc88 = loc("yoffset"(#loc9)) +#loc89 = loc("yindex"(#loc10)) +#loc90 = loc("yindex"(#loc11)) +#loc91 = loc("yindex"(#loc12)) +#loc92 = loc("xoffset"(#loc13)) +#loc93 = loc("xoffset"(#loc14)) +#loc94 = loc("xindex"(#loc15)) +#loc95 = loc("xindex"(#loc16)) +#loc96 = loc("xindex"(#loc17)) +#loc97 = loc("y1"(#loc18)) +#loc98 = loc("y0"(#loc19)) +#loc99 = loc("tmp4"(#loc20)) +#loc100 = loc("tmp5"(#loc21)) +#loc101 = loc("tmp5"(#loc22)) +#loc102 = loc("tmp5"(#loc23)) +#loc103 = loc("tmp5"(#loc24)) +#loc104 = loc("tmp5"(#loc25)) +#loc105 = loc("tmp5"(#loc26)) +#loc106 = loc("tmp5"(#loc27)) +#loc107 = loc("tmp5"(#loc28)) +#loc108 = loc("tmp5"(#loc29)) +#loc109 = loc("tmp7"(#loc30)) +#loc110 = loc("tmp7"(#loc31)) +#loc111 = loc("tmp7"(#loc32)) +#loc112 = loc("tmp7"(#loc33)) +#loc113 = loc("tmp9"(#loc34)) +#loc114 = loc("tmp11"(#loc35)) +#loc115 = loc("tmp12"(#loc36)) +#loc116 = loc("tmp13"(#loc37)) +#loc117 = loc("tmp14"(#loc38)) +#loc118 = loc("tmp14"(#loc39)) +#loc119 = loc("tmp14"(#loc40)) +#loc120 = loc("tmp16"(#loc41)) +#loc121 = loc("tmp19"(#loc42)) +#loc122 = loc("tmp20"(#loc43)) +#loc123 = loc("tmp23"(#loc44)) +#loc124 = loc("tmp23"(#loc45)) +#loc125 = loc("tmp23"(#loc46)) +#loc126 = loc("tmp23"(#loc47)) +#loc127 = loc("tmp23"(#loc48)) +#loc128 = loc("tmp23"(#loc49)) +#loc129 = loc("tmp23"(#loc50)) +#loc130 = loc("tmp23"(#loc51)) +#loc131 = loc("tmp25"(#loc52)) +#loc132 = loc("tmp25"(#loc53)) +#loc133 = loc("tmp25"(#loc54)) +#loc134 = loc("tmp25"(#loc55)) +#loc135 = loc("tmp27"(#loc56)) +#loc136 = loc("tmp29"(#loc57)) +#loc137 = loc("tmp30"(#loc58)) +#loc138 = loc("tmp31"(#loc59)) +#loc139 = loc("tmp32"(#loc60)) +#loc140 = loc("tmp32"(#loc61)) +#loc141 = loc("tmp32"(#loc62)) +#loc142 = loc("tmp34"(#loc63)) +#loc143 = loc("tmp37"(#loc64)) +#loc144 = loc("tmp38"(#loc65)) diff --git a/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d0f9bf343483e2f266965205e64ea8913a226f37 --- /dev/null +++ b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json"}} \ No newline at end of file diff --git a/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..20b59d0575e6e00352c9a4de76a8d172ad6bc435 Binary files /dev/null and b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin differ diff --git a/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..022927831500d71d0905d1826f5e5e31120ce875 --- /dev/null +++ b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json @@ -0,0 +1 @@ +{"hash": "16de0d161145db7e150b8deccb0e64916988ec2fc9f2a88c668ea14096c730dd", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 16384, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0"} \ No newline at end of file diff --git a/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..734b3218e9de1a9cf17afd0592ff8759783ac44a --- /dev/null +++ b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir @@ -0,0 +1,1426 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %13 = shl i32 %12, 6, !dbg !9 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %15 = and i32 %14, 504, !dbg !10 + %16 = lshr exact i32 %15, 3, !dbg !10 + %17 = or disjoint i32 %16, %13, !dbg !11 + %18 = and i32 %14, 7, !dbg !12 + %19 = shl nuw nsw i32 %18, 3, !dbg !12 + %20 = sdiv i32 %17, 32, !dbg !13 + %21 = shl i32 %17, 7 + %22 = shl i32 %20, 15 + %23 = add i32 %22, %21 + %24 = add i32 %23, 4096 + %25 = zext nneg i32 %19 to i64, !dbg !14 + %26 = or disjoint i32 %24, %19, !dbg !15 + %27 = sext i32 %26 to i64, !dbg !16 + %28 = getelementptr bfloat, ptr addrspace(1) %2, i64 %27, !dbg !16 + %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17 + %30 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %28, i64 %29, i1 true) #6, !dbg !17 + %31 = extractvalue { i32, i32, i32, i32 } %30, 0, !dbg !17 + %32 = bitcast i32 %31 to <2 x bfloat>, !dbg !17 + %33 = extractvalue { i32, i32, i32, i32 } %30, 1, !dbg !17 + %34 = bitcast i32 %33 to <2 x bfloat>, !dbg !17 + %35 = extractvalue { i32, i32, i32, i32 } %30, 2, !dbg !17 + %36 = bitcast i32 %35 to <2 x bfloat>, !dbg !17 + %37 = extractvalue { i32, i32, i32, i32 } %30, 3, !dbg !17 + %38 = bitcast i32 %37 to <2 x bfloat>, !dbg !17 + %39 = extractelement <2 x bfloat> %32, i64 0, !dbg !17 + %40 = extractelement <2 x bfloat> %32, i64 1, !dbg !17 + %41 = extractelement <2 x bfloat> %34, i64 0, !dbg !17 + %42 = extractelement <2 x bfloat> %34, i64 1, !dbg !17 + %43 = extractelement <2 x bfloat> %36, i64 0, !dbg !17 + %44 = extractelement <2 x bfloat> %36, i64 1, !dbg !17 + %45 = extractelement <2 x bfloat> %38, i64 0, !dbg !17 + %46 = extractelement <2 x bfloat> %38, i64 1, !dbg !17 + %47 = fpext bfloat %39 to float, !dbg !18 + %48 = fpext bfloat %40 to float, !dbg !18 + %49 = fpext bfloat %41 to float, !dbg !18 + %50 = fpext bfloat %42 to float, !dbg !18 + %51 = fpext bfloat %43 to float, !dbg !18 + %52 = fpext bfloat %44 to float, !dbg !18 + %53 = fpext bfloat %45 to float, !dbg !18 + %54 = fpext bfloat %46 to float, !dbg !18 + %55 = or disjoint i32 %23, %19, !dbg !19 + %56 = sext i32 %55 to i64, !dbg !20 + %57 = getelementptr bfloat, ptr addrspace(1) %2, i64 %56, !dbg !20 + %58 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !21 + %59 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %57, i64 %58, i1 true) #6, !dbg !21 + %60 = extractvalue { i32, i32, i32, i32 } %59, 0, !dbg !21 + %61 = bitcast i32 %60 to <2 x bfloat>, !dbg !21 + %62 = extractvalue { i32, i32, i32, i32 } %59, 1, !dbg !21 + %63 = bitcast i32 %62 to <2 x bfloat>, !dbg !21 + %64 = extractvalue { i32, i32, i32, i32 } %59, 2, !dbg !21 + %65 = bitcast i32 %64 to <2 x bfloat>, !dbg !21 + %66 = extractvalue { i32, i32, i32, i32 } %59, 3, !dbg !21 + %67 = bitcast i32 %66 to <2 x bfloat>, !dbg !21 + %68 = extractelement <2 x bfloat> %61, i64 0, !dbg !21 + %69 = extractelement <2 x bfloat> %61, i64 1, !dbg !21 + %70 = extractelement <2 x bfloat> %63, i64 0, !dbg !21 + %71 = extractelement <2 x bfloat> %63, i64 1, !dbg !21 + %72 = extractelement <2 x bfloat> %65, i64 0, !dbg !21 + %73 = extractelement <2 x bfloat> %65, i64 1, !dbg !21 + %74 = extractelement <2 x bfloat> %67, i64 0, !dbg !21 + %75 = extractelement <2 x bfloat> %67, i64 1, !dbg !21 + %76 = fpext bfloat %68 to float, !dbg !22 + %77 = fpext bfloat %69 to float, !dbg !22 + %78 = fpext bfloat %70 to float, !dbg !22 + %79 = fpext bfloat %71 to float, !dbg !22 + %80 = fpext bfloat %72 to float, !dbg !22 + %81 = fpext bfloat %73 to float, !dbg !22 + %82 = fpext bfloat %74 to float, !dbg !22 + %83 = fpext bfloat %75 to float, !dbg !22 + %84 = fmul float %47, %47, !dbg !23 + %85 = fmul float %48, %48, !dbg !23 + %86 = fmul float %49, %49, !dbg !23 + %87 = fmul float %50, %50, !dbg !23 + %88 = fmul float %51, %51, !dbg !23 + %89 = fmul float %52, %52, !dbg !23 + %90 = fmul float %53, %53, !dbg !23 + %91 = fmul float %54, %54, !dbg !23 + %92 = fmul float %76, %76, !dbg !24 + %93 = fmul float %77, %77, !dbg !24 + %94 = fmul float %78, %78, !dbg !24 + %95 = fmul float %79, %79, !dbg !24 + %96 = fmul float %80, %80, !dbg !24 + %97 = fmul float %81, %81, !dbg !24 + %98 = fmul float %82, %82, !dbg !24 + %99 = fmul float %83, %83, !dbg !24 + %100 = or disjoint i32 %19, 64, !dbg !25 + %101 = or disjoint i32 %24, %100, !dbg !15 + %102 = sext i32 %101 to i64, !dbg !16 + %103 = getelementptr bfloat, ptr addrspace(1) %2, i64 %102, !dbg !16 + %104 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17 + %105 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %103, i64 %104, i1 true) #6, !dbg !17 + %106 = extractvalue { i32, i32, i32, i32 } %105, 0, !dbg !17 + %107 = bitcast i32 %106 to <2 x bfloat>, !dbg !17 + %108 = extractvalue { i32, i32, i32, i32 } %105, 1, !dbg !17 + %109 = bitcast i32 %108 to <2 x bfloat>, !dbg !17 + %110 = extractvalue { i32, i32, i32, i32 } %105, 2, !dbg !17 + %111 = bitcast i32 %110 to <2 x bfloat>, !dbg !17 + %112 = extractvalue { i32, i32, i32, i32 } %105, 3, !dbg !17 + %113 = bitcast i32 %112 to <2 x bfloat>, !dbg !17 + %114 = extractelement <2 x bfloat> %107, i64 0, !dbg !17 + %115 = extractelement <2 x bfloat> %107, i64 1, !dbg !17 + %116 = extractelement <2 x bfloat> %109, i64 0, !dbg !17 + %117 = extractelement <2 x bfloat> %109, i64 1, !dbg !17 + %118 = extractelement <2 x bfloat> %111, i64 0, !dbg !17 + %119 = extractelement <2 x bfloat> %111, i64 1, !dbg !17 + %120 = extractelement <2 x bfloat> %113, i64 0, !dbg !17 + %121 = extractelement <2 x bfloat> %113, i64 1, !dbg !17 + %122 = fpext bfloat %114 to float, !dbg !18 + %123 = fpext bfloat %115 to float, !dbg !18 + %124 = fpext bfloat %116 to float, !dbg !18 + %125 = fpext bfloat %117 to float, !dbg !18 + %126 = fpext bfloat %118 to float, !dbg !18 + %127 = fpext bfloat %119 to float, !dbg !18 + %128 = fpext bfloat %120 to float, !dbg !18 + %129 = fpext bfloat %121 to float, !dbg !18 + %130 = or disjoint i32 %23, %100, !dbg !19 + %131 = sext i32 %130 to i64, !dbg !20 + %132 = getelementptr bfloat, ptr addrspace(1) %2, i64 %131, !dbg !20 + %133 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !21 + %134 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %132, i64 %133, i1 true) #6, !dbg !21 + %135 = extractvalue { i32, i32, i32, i32 } %134, 0, !dbg !21 + %136 = bitcast i32 %135 to <2 x bfloat>, !dbg !21 + %137 = extractvalue { i32, i32, i32, i32 } %134, 1, !dbg !21 + %138 = bitcast i32 %137 to <2 x bfloat>, !dbg !21 + %139 = extractvalue { i32, i32, i32, i32 } %134, 2, !dbg !21 + %140 = bitcast i32 %139 to <2 x bfloat>, !dbg !21 + %141 = extractvalue { i32, i32, i32, i32 } %134, 3, !dbg !21 + %142 = bitcast i32 %141 to <2 x bfloat>, !dbg !21 + %143 = extractelement <2 x bfloat> %136, i64 0, !dbg !21 + %144 = extractelement <2 x bfloat> %136, i64 1, !dbg !21 + %145 = extractelement <2 x bfloat> %138, i64 0, !dbg !21 + %146 = extractelement <2 x bfloat> %138, i64 1, !dbg !21 + %147 = extractelement <2 x bfloat> %140, i64 0, !dbg !21 + %148 = extractelement <2 x bfloat> %140, i64 1, !dbg !21 + %149 = extractelement <2 x bfloat> %142, i64 0, !dbg !21 + %150 = extractelement <2 x bfloat> %142, i64 1, !dbg !21 + %151 = fpext bfloat %143 to float, !dbg !22 + %152 = fpext bfloat %144 to float, !dbg !22 + %153 = fpext bfloat %145 to float, !dbg !22 + %154 = fpext bfloat %146 to float, !dbg !22 + %155 = fpext bfloat %147 to float, !dbg !22 + %156 = fpext bfloat %148 to float, !dbg !22 + %157 = fpext bfloat %149 to float, !dbg !22 + %158 = fpext bfloat %150 to float, !dbg !22 + %159 = fmul float %122, %122, !dbg !23 + %160 = fmul float %123, %123, !dbg !23 + %161 = fmul float %124, %124, !dbg !23 + %162 = fmul float %125, %125, !dbg !23 + %163 = fmul float %126, %126, !dbg !23 + %164 = fmul float %127, %127, !dbg !23 + %165 = fmul float %128, %128, !dbg !23 + %166 = fmul float %129, %129, !dbg !23 + %167 = fadd float %84, %159, !dbg !26 + %168 = fadd float %85, %160, !dbg !26 + %169 = fadd float %86, %161, !dbg !26 + %170 = fadd float %87, %162, !dbg !26 + %171 = fadd float %88, %163, !dbg !26 + %172 = fadd float %89, %164, !dbg !26 + %173 = fadd float %90, %165, !dbg !26 + %174 = fadd float %91, %166, !dbg !26 + %175 = fmul float %151, %151, !dbg !24 + %176 = fmul float %152, %152, !dbg !24 + %177 = fmul float %153, %153, !dbg !24 + %178 = fmul float %154, %154, !dbg !24 + %179 = fmul float %155, %155, !dbg !24 + %180 = fmul float %156, %156, !dbg !24 + %181 = fmul float %157, %157, !dbg !24 + %182 = fmul float %158, %158, !dbg !24 + %183 = fadd float %92, %175, !dbg !27 + %184 = fadd float %93, %176, !dbg !27 + %185 = fadd float %94, %177, !dbg !27 + %186 = fadd float %95, %178, !dbg !27 + %187 = fadd float %96, %179, !dbg !27 + %188 = fadd float %97, %180, !dbg !27 + %189 = fadd float %98, %181, !dbg !27 + %190 = fadd float %99, %182, !dbg !27 + %191 = and i32 %14, 63, !dbg !10 + %192 = or disjoint i32 %13, %191, !dbg !11 + %193 = lshr i32 %14, 6, !dbg !12 + %194 = and i32 %193, 6, !dbg !12 + %195 = sdiv i32 %192, 32, !dbg !13 + %196 = fadd float %167, %168, !dbg !28 + %197 = fadd float %169, %196, !dbg !28 + %198 = fadd float %170, %197, !dbg !28 + %199 = fadd float %171, %198, !dbg !28 + %200 = fadd float %172, %199, !dbg !28 + %201 = fadd float %173, %200, !dbg !28 + %202 = fadd float %174, %201, !dbg !28 + %203 = bitcast float %202 to i32, !dbg !31 + %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 4, i32 31), !dbg !31 + %205 = bitcast i32 %204 to float, !dbg !31 + %206 = fadd float %202, %205, !dbg !28 + %207 = bitcast float %206 to i32, !dbg !31 + %208 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %207, i32 2, i32 31), !dbg !31 + %209 = bitcast i32 %208 to float, !dbg !31 + %210 = fadd float %206, %209, !dbg !28 + %211 = bitcast float %210 to i32, !dbg !31 + %212 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %211, i32 1, i32 31), !dbg !31 + %213 = bitcast i32 %212 to float, !dbg !31 + %214 = fadd float %210, %213, !dbg !28 + %215 = fadd float %183, %184, !dbg !34 + %216 = fadd float %185, %215, !dbg !34 + %217 = fadd float %186, %216, !dbg !34 + %218 = fadd float %187, %217, !dbg !34 + %219 = fadd float %188, %218, !dbg !34 + %220 = fadd float %189, %219, !dbg !34 + %221 = fadd float %190, %220, !dbg !34 + %222 = bitcast float %221 to i32, !dbg !35 + %223 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %222, i32 4, i32 31), !dbg !35 + %224 = bitcast i32 %223 to float, !dbg !35 + %225 = fadd float %221, %224, !dbg !34 + %226 = bitcast float %225 to i32, !dbg !35 + %227 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %226, i32 2, i32 31), !dbg !35 + %228 = bitcast i32 %227 to float, !dbg !35 + %229 = fadd float %225, %228, !dbg !34 + %230 = bitcast float %229 to i32, !dbg !35 + %231 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %230, i32 1, i32 31), !dbg !35 + %232 = bitcast i32 %231 to float, !dbg !35 + %233 = fadd float %229, %232, !dbg !34 + %234 = shl i32 %20, 7, !dbg !37 + %235 = tail call float @llvm.nvvm.div.full(float %233, float 1.280000e+02), !dbg !38 + %236 = fadd float %235, 0x3EB0C6F7A0000000, !dbg !39 + %237 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40 + %.not.i = icmp eq i32 %237, 0, !dbg !40 + br i1 %.not.i, label %240, label %238, !dbg !40 + +238: ; preds = %11 + %239 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %236), !dbg !40 + br label %__nv_rsqrtf.exit, !dbg !40 + +240: ; preds = %11 + %241 = tail call float @llvm.nvvm.rsqrt.approx.f(float %236), !dbg !40 + br label %__nv_rsqrtf.exit, !dbg !40 + +__nv_rsqrtf.exit: ; preds = %238, %240 + %.0.i = phi float [ %239, %238 ], [ %241, %240 ], !dbg !40 + %242 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40 + %243 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40 + %244 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40 + %245 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40 + %246 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40 + %247 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40 + %248 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40 + %.not.i34 = icmp eq i32 %248, 0, !dbg !40 + br i1 %.not.i34, label %251, label %249, !dbg !40 + +249: ; preds = %__nv_rsqrtf.exit + %250 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %236), !dbg !40 + br label %__nv_rsqrtf.exit36, !dbg !40 + +251: ; preds = %__nv_rsqrtf.exit + %252 = tail call float @llvm.nvvm.rsqrt.approx.f(float %236), !dbg !40 + br label %__nv_rsqrtf.exit36, !dbg !40 + +__nv_rsqrtf.exit36: ; preds = %249, %251 + %.0.i35 = phi float [ %250, %249 ], [ %252, %251 ], !dbg !40 + %253 = lshr exact i32 %15, 1, !dbg !41 + %254 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %253, !dbg !41 + store float %.0.i, ptr addrspace(3) %254, align 4, !dbg !41 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !41 + %255 = shl nuw nsw i32 %191, 2, !dbg !41 + %256 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %255, !dbg !41 + %257 = load float, ptr addrspace(3) %256, align 4, !dbg !41 + %258 = tail call float @llvm.nvvm.div.full(float %214, float 1.280000e+02), !dbg !42 + %259 = fadd float %258, 0x3EB0C6F7A0000000, !dbg !43 + %260 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i37 = icmp eq i32 %260, 0, !dbg !44 + br i1 %.not.i37, label %263, label %261, !dbg !44 + +261: ; preds = %__nv_rsqrtf.exit36 + %262 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %259), !dbg !44 + br label %__nv_rsqrtf.exit39, !dbg !44 + +263: ; preds = %__nv_rsqrtf.exit36 + %264 = tail call float @llvm.nvvm.rsqrt.approx.f(float %259), !dbg !44 + br label %__nv_rsqrtf.exit39, !dbg !44 + +__nv_rsqrtf.exit39: ; preds = %261, %263 + %.0.i38 = phi float [ %262, %261 ], [ %264, %263 ], !dbg !44 + %265 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %266 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %267 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %268 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %269 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %270 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %271 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i58 = icmp eq i32 %271, 0, !dbg !44 + br i1 %.not.i58, label %274, label %272, !dbg !44 + +272: ; preds = %__nv_rsqrtf.exit39 + %273 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %259), !dbg !44 + br label %__nv_rsqrtf.exit60, !dbg !44 + +274: ; preds = %__nv_rsqrtf.exit39 + %275 = tail call float @llvm.nvvm.rsqrt.approx.f(float %259), !dbg !44 + br label %__nv_rsqrtf.exit60, !dbg !44 + +__nv_rsqrtf.exit60: ; preds = %272, %274 + %.0.i59 = phi float [ %273, %272 ], [ %275, %274 ], !dbg !44 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45 + store float %.0.i38, ptr addrspace(3) %254, align 4, !dbg !45 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45 + %276 = load float, ptr addrspace(3) %256, align 4, !dbg !45 + %277 = shl i32 %17, 7, !dbg !46 + %278 = and i32 %193, 1 + %279 = or disjoint i32 %19, %234 + %280 = and i32 %14, 224 + %281 = shl nuw nsw i32 %280, 6 + %282 = shl nuw nsw i32 %14, 2 + %283 = and i32 %282, 124 + %284 = lshr exact i32 %280, 3 + %285 = lshr i32 %14, 1 + %286 = and i32 %285, 128 + %287 = or disjoint i32 %281, %283 + %288 = xor i32 %287, %284 + %289 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %286 + %290 = getelementptr inbounds nuw i8, ptr addrspace(3) %289, i32 %288 + %291 = getelementptr inbounds nuw i8, ptr addrspace(3) %290, i32 256 + %292 = getelementptr inbounds nuw i8, ptr addrspace(3) %290, i32 512 + %293 = getelementptr inbounds nuw i8, ptr addrspace(3) %290, i32 768 + %294 = getelementptr inbounds nuw i8, ptr addrspace(3) %290, i32 1024 + %295 = getelementptr inbounds nuw i8, ptr addrspace(3) %290, i32 1280 + %296 = getelementptr inbounds nuw i8, ptr addrspace(3) %290, i32 1536 + %297 = getelementptr inbounds nuw i8, ptr addrspace(3) %290, i32 1792 + %298 = and i32 %14, 28 + %299 = shl nuw nsw i32 %298, 9 + %300 = shl nuw nsw i32 %14, 5 + %301 = and i32 %300, 96 + %302 = and i32 %282, 1920 + %303 = or disjoint i32 %299, %301 + %304 = or disjoint i32 %303, %302 + %305 = or disjoint i32 %304, %298 + %306 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %305 + %307 = xor i32 %305, 4 + %308 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %307 + %309 = xor i32 %305, 8 + %310 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %309 + %311 = xor i32 %305, 12 + %312 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %311 + %313 = xor i32 %305, 16 + %314 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %313 + %315 = xor i32 %305, 20 + %316 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %315 + %317 = xor i32 %305, 24 + %318 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %317 + %319 = xor i32 %305, 28 + %320 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %319 + %321 = icmp eq i32 %278, 0 + %322 = shl i32 %192, 7 + %323 = shl i32 %195, 15 + %324 = add i32 %323, %322 + %325 = icmp ne i32 %278, 0 + %326 = add i32 %324, 4097 + %327 = add i32 %324, 4096 + %328 = shl nuw nsw i32 %298, 8 + %329 = shl nuw nsw i32 %14, 1 + %330 = and i32 %329, 768 + %331 = lshr i32 %14, 5 + %332 = and i32 %331, 2 + %333 = or disjoint i32 %330, %332 + %334 = or disjoint i32 %333, %328 + %335 = or disjoint i32 %334, %255 + %336 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %335 + %337 = xor i32 %335, 16 + %338 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %337 + %339 = xor i32 %335, 32 + %340 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %339 + %341 = xor i32 %335, 48 + %342 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %341 + %343 = xor i32 %335, 64 + %344 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %343 + %345 = xor i32 %335, 80 + %346 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %345 + %347 = xor i32 %335, 96 + %348 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %347 + %349 = xor i32 %335, 112 + %350 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %349 + %351 = shl nuw nsw i32 %280, 5 + %352 = shl nuw nsw i32 %18, 4 + %353 = or disjoint i32 %351, %352 + %354 = xor i32 %353, %253 + %355 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %354 + %356 = getelementptr inbounds nuw i8, ptr addrspace(3) %355, i32 256 + %357 = getelementptr inbounds nuw i8, ptr addrspace(3) %355, i32 512 + %358 = getelementptr inbounds nuw i8, ptr addrspace(3) %355, i32 768 + %359 = zext nneg i32 %194 to i64, !dbg !47 + %360 = sext i32 %234 to i64, !dbg !47 + %361 = sext i32 %277 to i64, !dbg !47 + %362 = or disjoint i32 %279, 4, !dbg !47 + %invariant.op = sext i32 %362 to i64, !dbg !47 + br label %363, !dbg !47 + +363: ; preds = %__nv_rsqrtf.exit60, %363 + %364 = phi i1 [ true, %__nv_rsqrtf.exit60 ], [ false, %363 ] + %indvars.iv = phi i64 [ 0, %__nv_rsqrtf.exit60 ], [ 64, %363 ] + %365 = or disjoint i64 %indvars.iv, %25, !dbg !48 + %366 = or disjoint i64 %indvars.iv, %359, !dbg !48 + %367 = or disjoint i64 %366, 48, !dbg !48 + %368 = or disjoint i64 %366, 8, !dbg !49 + %369 = or disjoint i64 %366, 16, !dbg !49 + %370 = or disjoint i64 %366, 24, !dbg !49 + %371 = or disjoint i64 %366, 32, !dbg !49 + %372 = or disjoint i64 %366, 40, !dbg !49 + %373 = or disjoint i64 %366, 56, !dbg !49 + %374 = trunc nuw nsw i64 %365 to i32, !dbg !50 + %375 = or disjoint i32 %23, %374, !dbg !50 + %376 = sext i32 %375 to i64, !dbg !51 + %377 = getelementptr bfloat, ptr addrspace(1) %2, i64 %376, !dbg !51 + %378 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !52 + %379 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %377, i64 %378, i1 true) #6, !dbg !52 + %380 = extractvalue { i32, i32, i32, i32 } %379, 0, !dbg !52 + %381 = bitcast i32 %380 to <2 x bfloat>, !dbg !52 + %382 = extractvalue { i32, i32, i32, i32 } %379, 1, !dbg !52 + %383 = bitcast i32 %382 to <2 x bfloat>, !dbg !52 + %384 = extractvalue { i32, i32, i32, i32 } %379, 2, !dbg !52 + %385 = bitcast i32 %384 to <2 x bfloat>, !dbg !52 + %386 = extractvalue { i32, i32, i32, i32 } %379, 3, !dbg !52 + %387 = bitcast i32 %386 to <2 x bfloat>, !dbg !52 + %388 = extractelement <2 x bfloat> %381, i64 0, !dbg !52 + %389 = extractelement <2 x bfloat> %381, i64 1, !dbg !52 + %390 = extractelement <2 x bfloat> %383, i64 0, !dbg !52 + %391 = extractelement <2 x bfloat> %383, i64 1, !dbg !52 + %392 = extractelement <2 x bfloat> %385, i64 0, !dbg !52 + %393 = extractelement <2 x bfloat> %385, i64 1, !dbg !52 + %394 = extractelement <2 x bfloat> %387, i64 0, !dbg !52 + %395 = extractelement <2 x bfloat> %387, i64 1, !dbg !52 + %396 = fpext bfloat %388 to float, !dbg !53 + %397 = fpext bfloat %389 to float, !dbg !53 + %398 = fpext bfloat %390 to float, !dbg !53 + %399 = fpext bfloat %391 to float, !dbg !53 + %400 = fpext bfloat %392 to float, !dbg !53 + %401 = fpext bfloat %393 to float, !dbg !53 + %402 = fpext bfloat %394 to float, !dbg !53 + %403 = fpext bfloat %395 to float, !dbg !53 + %404 = getelementptr bfloat, ptr addrspace(1) %3, i64 %365, !dbg !54 + %405 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !55 + %406 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %404, i64 %405, i1 true) #6, !dbg !55 + %407 = extractvalue { i32, i32, i32, i32 } %406, 0, !dbg !55 + %408 = bitcast i32 %407 to <2 x bfloat>, !dbg !55 + %409 = extractvalue { i32, i32, i32, i32 } %406, 1, !dbg !55 + %410 = bitcast i32 %409 to <2 x bfloat>, !dbg !55 + %411 = extractvalue { i32, i32, i32, i32 } %406, 2, !dbg !55 + %412 = bitcast i32 %411 to <2 x bfloat>, !dbg !55 + %413 = extractvalue { i32, i32, i32, i32 } %406, 3, !dbg !55 + %414 = bitcast i32 %413 to <2 x bfloat>, !dbg !55 + %415 = extractelement <2 x bfloat> %408, i64 0, !dbg !55 + %416 = extractelement <2 x bfloat> %408, i64 1, !dbg !55 + %417 = extractelement <2 x bfloat> %410, i64 0, !dbg !55 + %418 = extractelement <2 x bfloat> %410, i64 1, !dbg !55 + %419 = extractelement <2 x bfloat> %412, i64 0, !dbg !55 + %420 = extractelement <2 x bfloat> %412, i64 1, !dbg !55 + %421 = extractelement <2 x bfloat> %414, i64 0, !dbg !55 + %422 = extractelement <2 x bfloat> %414, i64 1, !dbg !55 + %423 = fpext bfloat %415 to float, !dbg !56 + %424 = fpext bfloat %416 to float, !dbg !56 + %425 = fpext bfloat %417 to float, !dbg !56 + %426 = fpext bfloat %418 to float, !dbg !56 + %427 = fpext bfloat %419 to float, !dbg !56 + %428 = fpext bfloat %420 to float, !dbg !56 + %429 = fpext bfloat %421 to float, !dbg !56 + %430 = fpext bfloat %422 to float, !dbg !56 + %431 = or disjoint i64 %365, %360, !dbg !57 + %.reass = or disjoint i64 %indvars.iv, %invariant.op + %432 = getelementptr float, ptr addrspace(1) %4, i64 %431, !dbg !58 + %433 = getelementptr float, ptr addrspace(1) %4, i64 %.reass, !dbg !58 + %434 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !59 + %435 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %432, i64 %434, i1 true) #6, !dbg !59 + %436 = extractvalue { i32, i32, i32, i32 } %435, 0, !dbg !59 + %437 = extractvalue { i32, i32, i32, i32 } %435, 1, !dbg !59 + %438 = extractvalue { i32, i32, i32, i32 } %435, 2, !dbg !59 + %439 = extractvalue { i32, i32, i32, i32 } %435, 3, !dbg !59 + %440 = bitcast i32 %436 to float, !dbg !59 + %441 = bitcast i32 %437 to float, !dbg !59 + %442 = bitcast i32 %438 to float, !dbg !59 + %443 = bitcast i32 %439 to float, !dbg !59 + %444 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !59 + %445 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %433, i64 %444, i1 true) #6, !dbg !59 + %446 = extractvalue { i32, i32, i32, i32 } %445, 0, !dbg !59 + %447 = extractvalue { i32, i32, i32, i32 } %445, 1, !dbg !59 + %448 = extractvalue { i32, i32, i32, i32 } %445, 2, !dbg !59 + %449 = extractvalue { i32, i32, i32, i32 } %445, 3, !dbg !59 + %450 = bitcast i32 %446 to float, !dbg !59 + %451 = bitcast i32 %447 to float, !dbg !59 + %452 = bitcast i32 %448 to float, !dbg !59 + %453 = bitcast i32 %449 to float, !dbg !59 + %454 = getelementptr float, ptr addrspace(1) %5, i64 %431, !dbg !60 + %455 = getelementptr float, ptr addrspace(1) %5, i64 %.reass, !dbg !60 + %456 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !61 + %457 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %454, i64 %456, i1 true) #6, !dbg !61 + %458 = extractvalue { i32, i32, i32, i32 } %457, 0, !dbg !61 + %459 = extractvalue { i32, i32, i32, i32 } %457, 1, !dbg !61 + %460 = extractvalue { i32, i32, i32, i32 } %457, 2, !dbg !61 + %461 = extractvalue { i32, i32, i32, i32 } %457, 3, !dbg !61 + %462 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !61 + %463 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %455, i64 %462, i1 true) #6, !dbg !61 + %464 = extractvalue { i32, i32, i32, i32 } %463, 0, !dbg !61 + %465 = extractvalue { i32, i32, i32, i32 } %463, 1, !dbg !61 + %466 = extractvalue { i32, i32, i32, i32 } %463, 2, !dbg !61 + %467 = extractvalue { i32, i32, i32, i32 } %463, 3, !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %468 = insertelement <1 x i32> poison, i32 %458, i64 0, !dbg !61 + store <1 x i32> %468, ptr addrspace(3) %290, align 4, !dbg !61 + %469 = insertelement <1 x i32> poison, i32 %459, i64 0, !dbg !61 + store <1 x i32> %469, ptr addrspace(3) %291, align 4, !dbg !61 + %470 = insertelement <1 x i32> poison, i32 %460, i64 0, !dbg !61 + store <1 x i32> %470, ptr addrspace(3) %292, align 4, !dbg !61 + %471 = insertelement <1 x i32> poison, i32 %461, i64 0, !dbg !61 + store <1 x i32> %471, ptr addrspace(3) %293, align 4, !dbg !61 + %472 = insertelement <1 x i32> poison, i32 %464, i64 0, !dbg !61 + store <1 x i32> %472, ptr addrspace(3) %294, align 4, !dbg !61 + %473 = insertelement <1 x i32> poison, i32 %465, i64 0, !dbg !61 + store <1 x i32> %473, ptr addrspace(3) %295, align 4, !dbg !61 + %474 = insertelement <1 x i32> poison, i32 %466, i64 0, !dbg !61 + store <1 x i32> %474, ptr addrspace(3) %296, align 4, !dbg !61 + %475 = insertelement <1 x i32> poison, i32 %467, i64 0, !dbg !61 + store <1 x i32> %475, ptr addrspace(3) %297, align 4, !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %476 = load float, ptr addrspace(3) %306, align 4, !dbg !61 + %477 = load float, ptr addrspace(3) %308, align 4, !dbg !61 + %478 = load float, ptr addrspace(3) %310, align 4, !dbg !61 + %479 = load float, ptr addrspace(3) %312, align 4, !dbg !61 + %480 = load float, ptr addrspace(3) %314, align 4, !dbg !61 + %481 = load float, ptr addrspace(3) %316, align 4, !dbg !61 + %482 = load float, ptr addrspace(3) %318, align 4, !dbg !61 + %483 = load float, ptr addrspace(3) %320, align 4, !dbg !61 + %484 = or disjoint i32 %24, %374, !dbg !62 + %485 = sext i32 %484 to i64, !dbg !63 + %486 = getelementptr bfloat, ptr addrspace(1) %2, i64 %485, !dbg !63 + %487 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !64 + %488 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %486, i64 %487, i1 true) #6, !dbg !64 + %489 = extractvalue { i32, i32, i32, i32 } %488, 0, !dbg !64 + %490 = bitcast i32 %489 to <2 x bfloat>, !dbg !64 + %491 = extractvalue { i32, i32, i32, i32 } %488, 1, !dbg !64 + %492 = bitcast i32 %491 to <2 x bfloat>, !dbg !64 + %493 = extractvalue { i32, i32, i32, i32 } %488, 2, !dbg !64 + %494 = bitcast i32 %493 to <2 x bfloat>, !dbg !64 + %495 = extractvalue { i32, i32, i32, i32 } %488, 3, !dbg !64 + %496 = bitcast i32 %495 to <2 x bfloat>, !dbg !64 + %497 = extractelement <2 x bfloat> %490, i64 0, !dbg !64 + %498 = extractelement <2 x bfloat> %490, i64 1, !dbg !64 + %499 = extractelement <2 x bfloat> %492, i64 0, !dbg !64 + %500 = extractelement <2 x bfloat> %492, i64 1, !dbg !64 + %501 = extractelement <2 x bfloat> %494, i64 0, !dbg !64 + %502 = extractelement <2 x bfloat> %494, i64 1, !dbg !64 + %503 = extractelement <2 x bfloat> %496, i64 0, !dbg !64 + %504 = extractelement <2 x bfloat> %496, i64 1, !dbg !64 + %505 = fpext bfloat %497 to float, !dbg !65 + %506 = fpext bfloat %498 to float, !dbg !65 + %507 = fpext bfloat %499 to float, !dbg !65 + %508 = fpext bfloat %500 to float, !dbg !65 + %509 = fpext bfloat %501 to float, !dbg !65 + %510 = fpext bfloat %502 to float, !dbg !65 + %511 = fpext bfloat %503 to float, !dbg !65 + %512 = fpext bfloat %504 to float, !dbg !65 + %513 = getelementptr bfloat, ptr addrspace(1) %6, i64 %365, !dbg !66 + %514 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !67 + %515 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %513, i64 %514, i1 true) #6, !dbg !67 + %516 = extractvalue { i32, i32, i32, i32 } %515, 0, !dbg !67 + %517 = bitcast i32 %516 to <2 x bfloat>, !dbg !67 + %518 = extractvalue { i32, i32, i32, i32 } %515, 1, !dbg !67 + %519 = bitcast i32 %518 to <2 x bfloat>, !dbg !67 + %520 = extractvalue { i32, i32, i32, i32 } %515, 2, !dbg !67 + %521 = bitcast i32 %520 to <2 x bfloat>, !dbg !67 + %522 = extractvalue { i32, i32, i32, i32 } %515, 3, !dbg !67 + %523 = bitcast i32 %522 to <2 x bfloat>, !dbg !67 + %524 = extractelement <2 x bfloat> %517, i64 0, !dbg !67 + %525 = extractelement <2 x bfloat> %517, i64 1, !dbg !67 + %526 = extractelement <2 x bfloat> %519, i64 0, !dbg !67 + %527 = extractelement <2 x bfloat> %519, i64 1, !dbg !67 + %528 = extractelement <2 x bfloat> %521, i64 0, !dbg !67 + %529 = extractelement <2 x bfloat> %521, i64 1, !dbg !67 + %530 = extractelement <2 x bfloat> %523, i64 0, !dbg !67 + %531 = extractelement <2 x bfloat> %523, i64 1, !dbg !67 + %532 = fpext bfloat %524 to float, !dbg !68 + %533 = fpext bfloat %525 to float, !dbg !68 + %534 = fpext bfloat %526 to float, !dbg !68 + %535 = fpext bfloat %527 to float, !dbg !68 + %536 = fpext bfloat %528 to float, !dbg !68 + %537 = fpext bfloat %529 to float, !dbg !68 + %538 = fpext bfloat %530 to float, !dbg !68 + %539 = fpext bfloat %531 to float, !dbg !68 + %540 = or disjoint i64 %366, 1, !dbg !69 + %541 = or disjoint i64 %366, 9, !dbg !69 + %542 = or disjoint i64 %366, 17, !dbg !69 + %543 = or disjoint i64 %366, 25, !dbg !69 + %544 = or disjoint i64 %366, 33, !dbg !69 + %545 = or disjoint i64 %366, 41, !dbg !69 + %546 = or disjoint i64 %366, 49, !dbg !69 + %547 = or disjoint i64 %366, 57, !dbg !69 + %548 = trunc nuw nsw i64 %540 to i32, !dbg !70 + %549 = or disjoint i32 %324, %548, !dbg !70 + %550 = trunc nuw nsw i64 %541 to i32, !dbg !70 + %551 = or disjoint i32 %324, %550, !dbg !70 + %552 = trunc nuw nsw i64 %542 to i32, !dbg !70 + %553 = or disjoint i32 %324, %552, !dbg !70 + %554 = trunc nuw nsw i64 %543 to i32, !dbg !70 + %555 = or disjoint i32 %324, %554, !dbg !70 + %556 = trunc nuw nsw i64 %544 to i32, !dbg !70 + %557 = or disjoint i32 %324, %556, !dbg !70 + %558 = trunc nuw nsw i64 %545 to i32, !dbg !70 + %559 = or disjoint i32 %324, %558, !dbg !70 + %560 = trunc nuw nsw i64 %546 to i32, !dbg !70 + %561 = or disjoint i32 %324, %560, !dbg !70 + %562 = trunc nuw nsw i64 %547 to i32, !dbg !70 + %563 = or disjoint i32 %324, %562, !dbg !70 + %564 = sext i32 %549 to i64, !dbg !71 + %565 = getelementptr bfloat, ptr addrspace(1) %2, i64 %564, !dbg !71 + %566 = sext i32 %551 to i64, !dbg !71 + %567 = getelementptr bfloat, ptr addrspace(1) %2, i64 %566, !dbg !71 + %568 = sext i32 %553 to i64, !dbg !71 + %569 = getelementptr bfloat, ptr addrspace(1) %2, i64 %568, !dbg !71 + %570 = sext i32 %555 to i64, !dbg !71 + %571 = getelementptr bfloat, ptr addrspace(1) %2, i64 %570, !dbg !71 + %572 = sext i32 %557 to i64, !dbg !71 + %573 = getelementptr bfloat, ptr addrspace(1) %2, i64 %572, !dbg !71 + %574 = sext i32 %559 to i64, !dbg !71 + %575 = getelementptr bfloat, ptr addrspace(1) %2, i64 %574, !dbg !71 + %576 = sext i32 %561 to i64, !dbg !71 + %577 = getelementptr bfloat, ptr addrspace(1) %2, i64 %576, !dbg !71 + %578 = sext i32 %563 to i64, !dbg !71 + %579 = getelementptr bfloat, ptr addrspace(1) %2, i64 %578, !dbg !71 + %580 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72 + %581 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %565, i64 %580, i1 %321) #6, !dbg !72 + %582 = bitcast i16 %581 to bfloat, !dbg !72 + %583 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72 + %584 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %567, i64 %583, i1 %321) #6, !dbg !72 + %585 = bitcast i16 %584 to bfloat, !dbg !72 + %586 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72 + %587 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %569, i64 %586, i1 %321) #6, !dbg !72 + %588 = bitcast i16 %587 to bfloat, !dbg !72 + %589 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72 + %590 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %571, i64 %589, i1 %321) #6, !dbg !72 + %591 = bitcast i16 %590 to bfloat, !dbg !72 + %592 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72 + %593 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %573, i64 %592, i1 %321) #6, !dbg !72 + %594 = bitcast i16 %593 to bfloat, !dbg !72 + %595 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72 + %596 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %575, i64 %595, i1 %321) #6, !dbg !72 + %597 = bitcast i16 %596 to bfloat, !dbg !72 + %598 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72 + %599 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %577, i64 %598, i1 %321) #6, !dbg !72 + %600 = bitcast i16 %599 to bfloat, !dbg !72 + %601 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72 + %602 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %579, i64 %601, i1 %321) #6, !dbg !72 + %603 = bitcast i16 %602 to bfloat, !dbg !72 + %604 = fpext bfloat %582 to float, !dbg !73 + %605 = fpext bfloat %585 to float, !dbg !73 + %606 = fpext bfloat %588 to float, !dbg !73 + %607 = fpext bfloat %591 to float, !dbg !73 + %608 = fpext bfloat %594 to float, !dbg !73 + %609 = fpext bfloat %597 to float, !dbg !73 + %610 = fpext bfloat %600 to float, !dbg !73 + %611 = fpext bfloat %603 to float, !dbg !73 + %612 = fmul float %257, %604, !dbg !41 + %613 = fmul float %257, %605, !dbg !41 + %614 = fmul float %257, %606, !dbg !41 + %615 = fmul float %257, %607, !dbg !41 + %616 = fmul float %257, %608, !dbg !41 + %617 = fmul float %257, %609, !dbg !41 + %618 = fmul float %257, %610, !dbg !41 + %619 = fmul float %257, %611, !dbg !41 + %620 = getelementptr bfloat, ptr addrspace(1) %3, i64 %540, !dbg !74 + %621 = getelementptr bfloat, ptr addrspace(1) %3, i64 %541, !dbg !74 + %622 = getelementptr bfloat, ptr addrspace(1) %3, i64 %542, !dbg !74 + %623 = getelementptr bfloat, ptr addrspace(1) %3, i64 %543, !dbg !74 + %624 = getelementptr bfloat, ptr addrspace(1) %3, i64 %544, !dbg !74 + %625 = getelementptr bfloat, ptr addrspace(1) %3, i64 %545, !dbg !74 + %626 = getelementptr bfloat, ptr addrspace(1) %3, i64 %546, !dbg !74 + %627 = getelementptr bfloat, ptr addrspace(1) %3, i64 %547, !dbg !74 + %628 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75 + %629 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %620, i64 %628, i1 %321) #6, !dbg !75 + %630 = bitcast i16 %629 to bfloat, !dbg !75 + %631 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75 + %632 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %621, i64 %631, i1 %321) #6, !dbg !75 + %633 = bitcast i16 %632 to bfloat, !dbg !75 + %634 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75 + %635 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %622, i64 %634, i1 %321) #6, !dbg !75 + %636 = bitcast i16 %635 to bfloat, !dbg !75 + %637 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75 + %638 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %623, i64 %637, i1 %321) #6, !dbg !75 + %639 = bitcast i16 %638 to bfloat, !dbg !75 + %640 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75 + %641 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %624, i64 %640, i1 %321) #6, !dbg !75 + %642 = bitcast i16 %641 to bfloat, !dbg !75 + %643 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75 + %644 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %625, i64 %643, i1 %321) #6, !dbg !75 + %645 = bitcast i16 %644 to bfloat, !dbg !75 + %646 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75 + %647 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %626, i64 %646, i1 %321) #6, !dbg !75 + %648 = bitcast i16 %647 to bfloat, !dbg !75 + %649 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75 + %650 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %627, i64 %649, i1 %321) #6, !dbg !75 + %651 = bitcast i16 %650 to bfloat, !dbg !75 + %652 = fpext bfloat %630 to float, !dbg !76 + %653 = fpext bfloat %633 to float, !dbg !76 + %654 = fpext bfloat %636 to float, !dbg !76 + %655 = fpext bfloat %639 to float, !dbg !76 + %656 = fpext bfloat %642 to float, !dbg !76 + %657 = fpext bfloat %645 to float, !dbg !76 + %658 = fpext bfloat %648 to float, !dbg !76 + %659 = fpext bfloat %651 to float, !dbg !76 + %660 = fmul float %612, %652, !dbg !77 + %661 = fmul float %613, %653, !dbg !77 + %662 = fmul float %614, %654, !dbg !77 + %663 = fmul float %615, %655, !dbg !77 + %664 = fmul float %616, %656, !dbg !77 + %665 = fmul float %617, %657, !dbg !77 + %666 = fmul float %618, %658, !dbg !77 + %667 = fmul float %619, %659, !dbg !77 + %668 = fsub float 0.000000e+00, %660, !dbg !78 + %669 = fsub float 0.000000e+00, %661, !dbg !78 + %670 = fsub float 0.000000e+00, %662, !dbg !78 + %671 = fsub float 0.000000e+00, %663, !dbg !78 + %672 = fsub float 0.000000e+00, %664, !dbg !78 + %673 = fsub float 0.000000e+00, %665, !dbg !78 + %674 = fsub float 0.000000e+00, %666, !dbg !78 + %675 = fsub float 0.000000e+00, %667, !dbg !78 + %676 = trunc nuw nsw i64 %366 to i32, !dbg !79 + %677 = or disjoint i32 %324, %676, !dbg !79 + %678 = trunc nuw nsw i64 %368 to i32, !dbg !79 + %679 = or disjoint i32 %324, %678, !dbg !79 + %680 = trunc nuw nsw i64 %369 to i32, !dbg !79 + %681 = or disjoint i32 %324, %680, !dbg !79 + %682 = trunc nuw nsw i64 %370 to i32, !dbg !79 + %683 = or disjoint i32 %324, %682, !dbg !79 + %684 = trunc nuw nsw i64 %371 to i32, !dbg !79 + %685 = or disjoint i32 %324, %684, !dbg !79 + %686 = trunc nuw nsw i64 %372 to i32, !dbg !79 + %687 = or disjoint i32 %324, %686, !dbg !79 + %688 = trunc nuw nsw i64 %367 to i32, !dbg !79 + %689 = or disjoint i32 %324, %688, !dbg !79 + %690 = trunc nuw nsw i64 %373 to i32, !dbg !79 + %691 = or disjoint i32 %324, %690, !dbg !79 + %692 = sext i32 %677 to i64, !dbg !80 + %693 = getelementptr bfloat, ptr addrspace(1) %2, i64 %692, !dbg !80 + %694 = sext i32 %679 to i64, !dbg !80 + %695 = getelementptr bfloat, ptr addrspace(1) %2, i64 %694, !dbg !80 + %696 = sext i32 %681 to i64, !dbg !80 + %697 = getelementptr bfloat, ptr addrspace(1) %2, i64 %696, !dbg !80 + %698 = sext i32 %683 to i64, !dbg !80 + %699 = getelementptr bfloat, ptr addrspace(1) %2, i64 %698, !dbg !80 + %700 = sext i32 %685 to i64, !dbg !80 + %701 = getelementptr bfloat, ptr addrspace(1) %2, i64 %700, !dbg !80 + %702 = sext i32 %687 to i64, !dbg !80 + %703 = getelementptr bfloat, ptr addrspace(1) %2, i64 %702, !dbg !80 + %704 = sext i32 %689 to i64, !dbg !80 + %705 = getelementptr bfloat, ptr addrspace(1) %2, i64 %704, !dbg !80 + %706 = sext i32 %691 to i64, !dbg !80 + %707 = getelementptr bfloat, ptr addrspace(1) %2, i64 %706, !dbg !80 + %708 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81 + %709 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %693, i64 %708, i1 %325) #6, !dbg !81 + %710 = bitcast i16 %709 to bfloat, !dbg !81 + %711 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81 + %712 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %695, i64 %711, i1 %325) #6, !dbg !81 + %713 = bitcast i16 %712 to bfloat, !dbg !81 + %714 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81 + %715 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %697, i64 %714, i1 %325) #6, !dbg !81 + %716 = bitcast i16 %715 to bfloat, !dbg !81 + %717 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81 + %718 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %699, i64 %717, i1 %325) #6, !dbg !81 + %719 = bitcast i16 %718 to bfloat, !dbg !81 + %720 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81 + %721 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %701, i64 %720, i1 %325) #6, !dbg !81 + %722 = bitcast i16 %721 to bfloat, !dbg !81 + %723 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81 + %724 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %703, i64 %723, i1 %325) #6, !dbg !81 + %725 = bitcast i16 %724 to bfloat, !dbg !81 + %726 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81 + %727 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %705, i64 %726, i1 %325) #6, !dbg !81 + %728 = bitcast i16 %727 to bfloat, !dbg !81 + %729 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81 + %730 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %707, i64 %729, i1 %325) #6, !dbg !81 + %731 = bitcast i16 %730 to bfloat, !dbg !81 + %732 = fpext bfloat %710 to float, !dbg !82 + %733 = fpext bfloat %713 to float, !dbg !82 + %734 = fpext bfloat %716 to float, !dbg !82 + %735 = fpext bfloat %719 to float, !dbg !82 + %736 = fpext bfloat %722 to float, !dbg !82 + %737 = fpext bfloat %725 to float, !dbg !82 + %738 = fpext bfloat %728 to float, !dbg !82 + %739 = fpext bfloat %731 to float, !dbg !82 + %740 = fmul float %257, %732, !dbg !83 + %741 = fmul float %257, %733, !dbg !83 + %742 = fmul float %257, %734, !dbg !83 + %743 = fmul float %257, %735, !dbg !83 + %744 = fmul float %257, %736, !dbg !83 + %745 = fmul float %257, %737, !dbg !83 + %746 = fmul float %257, %738, !dbg !83 + %747 = fmul float %257, %739, !dbg !83 + %748 = getelementptr bfloat, ptr addrspace(1) %3, i64 %366, !dbg !84 + %749 = getelementptr bfloat, ptr addrspace(1) %3, i64 %368, !dbg !84 + %750 = getelementptr bfloat, ptr addrspace(1) %3, i64 %369, !dbg !84 + %751 = getelementptr bfloat, ptr addrspace(1) %3, i64 %370, !dbg !84 + %752 = getelementptr bfloat, ptr addrspace(1) %3, i64 %371, !dbg !84 + %753 = getelementptr bfloat, ptr addrspace(1) %3, i64 %372, !dbg !84 + %754 = getelementptr bfloat, ptr addrspace(1) %3, i64 %367, !dbg !84 + %755 = getelementptr bfloat, ptr addrspace(1) %3, i64 %373, !dbg !84 + %756 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85 + %757 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %748, i64 %756, i1 %325) #6, !dbg !85 + %758 = bitcast i16 %757 to bfloat, !dbg !85 + %759 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85 + %760 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %749, i64 %759, i1 %325) #6, !dbg !85 + %761 = bitcast i16 %760 to bfloat, !dbg !85 + %762 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85 + %763 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %750, i64 %762, i1 %325) #6, !dbg !85 + %764 = bitcast i16 %763 to bfloat, !dbg !85 + %765 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85 + %766 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %751, i64 %765, i1 %325) #6, !dbg !85 + %767 = bitcast i16 %766 to bfloat, !dbg !85 + %768 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85 + %769 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %752, i64 %768, i1 %325) #6, !dbg !85 + %770 = bitcast i16 %769 to bfloat, !dbg !85 + %771 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85 + %772 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %753, i64 %771, i1 %325) #6, !dbg !85 + %773 = bitcast i16 %772 to bfloat, !dbg !85 + %774 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85 + %775 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %754, i64 %774, i1 %325) #6, !dbg !85 + %776 = bitcast i16 %775 to bfloat, !dbg !85 + %777 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85 + %778 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %755, i64 %777, i1 %325) #6, !dbg !85 + %779 = bitcast i16 %778 to bfloat, !dbg !85 + %780 = fpext bfloat %758 to float, !dbg !86 + %781 = fpext bfloat %761 to float, !dbg !86 + %782 = fpext bfloat %764 to float, !dbg !86 + %783 = fpext bfloat %767 to float, !dbg !86 + %784 = fpext bfloat %770 to float, !dbg !86 + %785 = fpext bfloat %773 to float, !dbg !86 + %786 = fpext bfloat %776 to float, !dbg !86 + %787 = fpext bfloat %779 to float, !dbg !86 + %788 = fmul float %740, %780, !dbg !87 + %789 = fmul float %741, %781, !dbg !87 + %790 = fmul float %742, %782, !dbg !87 + %791 = fmul float %743, %783, !dbg !87 + %792 = fmul float %744, %784, !dbg !87 + %793 = fmul float %745, %785, !dbg !87 + %794 = fmul float %746, %786, !dbg !87 + %795 = fmul float %747, %787, !dbg !87 + %796 = select i1 %321, float %668, float %788, !dbg !88 + %797 = select i1 %321, float %669, float %789, !dbg !88 + %798 = select i1 %321, float %670, float %790, !dbg !88 + %799 = select i1 %321, float %671, float %791, !dbg !88 + %800 = select i1 %321, float %672, float %792, !dbg !88 + %801 = select i1 %321, float %673, float %793, !dbg !88 + %802 = select i1 %321, float %674, float %794, !dbg !88 + %803 = select i1 %321, float %675, float %795, !dbg !88 + %804 = fmul float %.0.i35, %396, !dbg !89 + %805 = fmul float %.0.i35, %397, !dbg !89 + %806 = fmul float %.0.i35, %398, !dbg !89 + %807 = fmul float %.0.i35, %399, !dbg !89 + %808 = fmul float %.0.i35, %400, !dbg !89 + %809 = fmul float %.0.i35, %401, !dbg !89 + %810 = fmul float %.0.i35, %402, !dbg !89 + %811 = fmul float %.0.i35, %403, !dbg !89 + %812 = fmul float %804, %423, !dbg !90 + %813 = fmul float %805, %424, !dbg !90 + %814 = fmul float %806, %425, !dbg !90 + %815 = fmul float %807, %426, !dbg !90 + %816 = fmul float %808, %427, !dbg !90 + %817 = fmul float %809, %428, !dbg !90 + %818 = fmul float %810, %429, !dbg !90 + %819 = fmul float %811, %430, !dbg !90 + %820 = fmul float %812, %440, !dbg !91 + %821 = fmul float %813, %441, !dbg !91 + %822 = fmul float %814, %442, !dbg !91 + %823 = fmul float %815, %443, !dbg !91 + %824 = fmul float %816, %450, !dbg !91 + %825 = fmul float %817, %451, !dbg !91 + %826 = fmul float %818, %452, !dbg !91 + %827 = fmul float %819, %453, !dbg !91 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !91 + store float %820, ptr addrspace(3) %290, align 4, !dbg !91 + store float %821, ptr addrspace(3) %291, align 4, !dbg !91 + store float %822, ptr addrspace(3) %292, align 4, !dbg !91 + store float %823, ptr addrspace(3) %293, align 4, !dbg !91 + store float %824, ptr addrspace(3) %294, align 4, !dbg !91 + store float %825, ptr addrspace(3) %295, align 4, !dbg !91 + store float %826, ptr addrspace(3) %296, align 4, !dbg !91 + store float %827, ptr addrspace(3) %297, align 4, !dbg !91 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !91 + %828 = load float, ptr addrspace(3) %306, align 4, !dbg !91 + %829 = load float, ptr addrspace(3) %308, align 4, !dbg !91 + %830 = load float, ptr addrspace(3) %310, align 4, !dbg !91 + %831 = load float, ptr addrspace(3) %312, align 4, !dbg !91 + %832 = load float, ptr addrspace(3) %314, align 4, !dbg !91 + %833 = load float, ptr addrspace(3) %316, align 4, !dbg !91 + %834 = load float, ptr addrspace(3) %318, align 4, !dbg !91 + %835 = load float, ptr addrspace(3) %320, align 4, !dbg !91 + %836 = fmul float %476, %796, !dbg !92 + %837 = fmul float %477, %797, !dbg !92 + %838 = fmul float %478, %798, !dbg !92 + %839 = fmul float %479, %799, !dbg !92 + %840 = fmul float %480, %800, !dbg !92 + %841 = fmul float %481, %801, !dbg !92 + %842 = fmul float %482, %802, !dbg !92 + %843 = fmul float %483, %803, !dbg !92 + %844 = fadd float %836, %828, !dbg !93 + %845 = fadd float %837, %829, !dbg !93 + %846 = fadd float %838, %830, !dbg !93 + %847 = fadd float %839, %831, !dbg !93 + %848 = fadd float %840, %832, !dbg !93 + %849 = fadd float %841, %833, !dbg !93 + %850 = fadd float %842, %834, !dbg !93 + %851 = fadd float %843, %835, !dbg !93 + %852 = or disjoint i32 %326, %676, !dbg !94 + %853 = or disjoint i32 %326, %678, !dbg !94 + %854 = or disjoint i32 %326, %680, !dbg !94 + %855 = or disjoint i32 %326, %682, !dbg !94 + %856 = or disjoint i32 %326, %684, !dbg !94 + %857 = or disjoint i32 %326, %686, !dbg !94 + %858 = or disjoint i32 %326, %688, !dbg !94 + %859 = or disjoint i32 %326, %690, !dbg !94 + %860 = sext i32 %852 to i64, !dbg !95 + %861 = getelementptr bfloat, ptr addrspace(1) %2, i64 %860, !dbg !95 + %862 = sext i32 %853 to i64, !dbg !95 + %863 = getelementptr bfloat, ptr addrspace(1) %2, i64 %862, !dbg !95 + %864 = sext i32 %854 to i64, !dbg !95 + %865 = getelementptr bfloat, ptr addrspace(1) %2, i64 %864, !dbg !95 + %866 = sext i32 %855 to i64, !dbg !95 + %867 = getelementptr bfloat, ptr addrspace(1) %2, i64 %866, !dbg !95 + %868 = sext i32 %856 to i64, !dbg !95 + %869 = getelementptr bfloat, ptr addrspace(1) %2, i64 %868, !dbg !95 + %870 = sext i32 %857 to i64, !dbg !95 + %871 = getelementptr bfloat, ptr addrspace(1) %2, i64 %870, !dbg !95 + %872 = sext i32 %858 to i64, !dbg !95 + %873 = getelementptr bfloat, ptr addrspace(1) %2, i64 %872, !dbg !95 + %874 = sext i32 %859 to i64, !dbg !95 + %875 = getelementptr bfloat, ptr addrspace(1) %2, i64 %874, !dbg !95 + %876 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96 + %877 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %861, i64 %876, i1 %321) #6, !dbg !96 + %878 = bitcast i16 %877 to bfloat, !dbg !96 + %879 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96 + %880 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %863, i64 %879, i1 %321) #6, !dbg !96 + %881 = bitcast i16 %880 to bfloat, !dbg !96 + %882 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96 + %883 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %865, i64 %882, i1 %321) #6, !dbg !96 + %884 = bitcast i16 %883 to bfloat, !dbg !96 + %885 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96 + %886 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %867, i64 %885, i1 %321) #6, !dbg !96 + %887 = bitcast i16 %886 to bfloat, !dbg !96 + %888 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96 + %889 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %869, i64 %888, i1 %321) #6, !dbg !96 + %890 = bitcast i16 %889 to bfloat, !dbg !96 + %891 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96 + %892 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %871, i64 %891, i1 %321) #6, !dbg !96 + %893 = bitcast i16 %892 to bfloat, !dbg !96 + %894 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96 + %895 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %873, i64 %894, i1 %321) #6, !dbg !96 + %896 = bitcast i16 %895 to bfloat, !dbg !96 + %897 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96 + %898 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %875, i64 %897, i1 %321) #6, !dbg !96 + %899 = bitcast i16 %898 to bfloat, !dbg !96 + %900 = fpext bfloat %878 to float, !dbg !97 + %901 = fpext bfloat %881 to float, !dbg !97 + %902 = fpext bfloat %884 to float, !dbg !97 + %903 = fpext bfloat %887 to float, !dbg !97 + %904 = fpext bfloat %890 to float, !dbg !97 + %905 = fpext bfloat %893 to float, !dbg !97 + %906 = fpext bfloat %896 to float, !dbg !97 + %907 = fpext bfloat %899 to float, !dbg !97 + %908 = fmul float %276, %900, !dbg !45 + %909 = fmul float %276, %901, !dbg !45 + %910 = fmul float %276, %902, !dbg !45 + %911 = fmul float %276, %903, !dbg !45 + %912 = fmul float %276, %904, !dbg !45 + %913 = fmul float %276, %905, !dbg !45 + %914 = fmul float %276, %906, !dbg !45 + %915 = fmul float %276, %907, !dbg !45 + %916 = getelementptr bfloat, ptr addrspace(1) %6, i64 %540, !dbg !98 + %917 = getelementptr bfloat, ptr addrspace(1) %6, i64 %541, !dbg !98 + %918 = getelementptr bfloat, ptr addrspace(1) %6, i64 %542, !dbg !98 + %919 = getelementptr bfloat, ptr addrspace(1) %6, i64 %543, !dbg !98 + %920 = getelementptr bfloat, ptr addrspace(1) %6, i64 %544, !dbg !98 + %921 = getelementptr bfloat, ptr addrspace(1) %6, i64 %545, !dbg !98 + %922 = getelementptr bfloat, ptr addrspace(1) %6, i64 %546, !dbg !98 + %923 = getelementptr bfloat, ptr addrspace(1) %6, i64 %547, !dbg !98 + %924 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99 + %925 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %916, i64 %924, i1 %321) #6, !dbg !99 + %926 = bitcast i16 %925 to bfloat, !dbg !99 + %927 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99 + %928 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %917, i64 %927, i1 %321) #6, !dbg !99 + %929 = bitcast i16 %928 to bfloat, !dbg !99 + %930 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99 + %931 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %918, i64 %930, i1 %321) #6, !dbg !99 + %932 = bitcast i16 %931 to bfloat, !dbg !99 + %933 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99 + %934 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %919, i64 %933, i1 %321) #6, !dbg !99 + %935 = bitcast i16 %934 to bfloat, !dbg !99 + %936 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99 + %937 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %920, i64 %936, i1 %321) #6, !dbg !99 + %938 = bitcast i16 %937 to bfloat, !dbg !99 + %939 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99 + %940 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %921, i64 %939, i1 %321) #6, !dbg !99 + %941 = bitcast i16 %940 to bfloat, !dbg !99 + %942 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99 + %943 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %922, i64 %942, i1 %321) #6, !dbg !99 + %944 = bitcast i16 %943 to bfloat, !dbg !99 + %945 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99 + %946 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %923, i64 %945, i1 %321) #6, !dbg !99 + %947 = bitcast i16 %946 to bfloat, !dbg !99 + %948 = fpext bfloat %926 to float, !dbg !100 + %949 = fpext bfloat %929 to float, !dbg !100 + %950 = fpext bfloat %932 to float, !dbg !100 + %951 = fpext bfloat %935 to float, !dbg !100 + %952 = fpext bfloat %938 to float, !dbg !100 + %953 = fpext bfloat %941 to float, !dbg !100 + %954 = fpext bfloat %944 to float, !dbg !100 + %955 = fpext bfloat %947 to float, !dbg !100 + %956 = fmul float %908, %948, !dbg !101 + %957 = fmul float %909, %949, !dbg !101 + %958 = fmul float %910, %950, !dbg !101 + %959 = fmul float %911, %951, !dbg !101 + %960 = fmul float %912, %952, !dbg !101 + %961 = fmul float %913, %953, !dbg !101 + %962 = fmul float %914, %954, !dbg !101 + %963 = fmul float %915, %955, !dbg !101 + %964 = fsub float 0.000000e+00, %956, !dbg !102 + %965 = fsub float 0.000000e+00, %957, !dbg !102 + %966 = fsub float 0.000000e+00, %958, !dbg !102 + %967 = fsub float 0.000000e+00, %959, !dbg !102 + %968 = fsub float 0.000000e+00, %960, !dbg !102 + %969 = fsub float 0.000000e+00, %961, !dbg !102 + %970 = fsub float 0.000000e+00, %962, !dbg !102 + %971 = fsub float 0.000000e+00, %963, !dbg !102 + %972 = or disjoint i32 %327, %676, !dbg !103 + %973 = or disjoint i32 %327, %678, !dbg !103 + %974 = or disjoint i32 %327, %680, !dbg !103 + %975 = or disjoint i32 %327, %682, !dbg !103 + %976 = or disjoint i32 %327, %684, !dbg !103 + %977 = or disjoint i32 %327, %686, !dbg !103 + %978 = or disjoint i32 %327, %688, !dbg !103 + %979 = or disjoint i32 %327, %690, !dbg !103 + %980 = sext i32 %972 to i64, !dbg !104 + %981 = getelementptr bfloat, ptr addrspace(1) %2, i64 %980, !dbg !104 + %982 = sext i32 %973 to i64, !dbg !104 + %983 = getelementptr bfloat, ptr addrspace(1) %2, i64 %982, !dbg !104 + %984 = sext i32 %974 to i64, !dbg !104 + %985 = getelementptr bfloat, ptr addrspace(1) %2, i64 %984, !dbg !104 + %986 = sext i32 %975 to i64, !dbg !104 + %987 = getelementptr bfloat, ptr addrspace(1) %2, i64 %986, !dbg !104 + %988 = sext i32 %976 to i64, !dbg !104 + %989 = getelementptr bfloat, ptr addrspace(1) %2, i64 %988, !dbg !104 + %990 = sext i32 %977 to i64, !dbg !104 + %991 = getelementptr bfloat, ptr addrspace(1) %2, i64 %990, !dbg !104 + %992 = sext i32 %978 to i64, !dbg !104 + %993 = getelementptr bfloat, ptr addrspace(1) %2, i64 %992, !dbg !104 + %994 = sext i32 %979 to i64, !dbg !104 + %995 = getelementptr bfloat, ptr addrspace(1) %2, i64 %994, !dbg !104 + %996 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105 + %997 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %981, i64 %996, i1 %325) #6, !dbg !105 + %998 = bitcast i16 %997 to bfloat, !dbg !105 + %999 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105 + %1000 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %983, i64 %999, i1 %325) #6, !dbg !105 + %1001 = bitcast i16 %1000 to bfloat, !dbg !105 + %1002 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105 + %1003 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %985, i64 %1002, i1 %325) #6, !dbg !105 + %1004 = bitcast i16 %1003 to bfloat, !dbg !105 + %1005 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105 + %1006 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %987, i64 %1005, i1 %325) #6, !dbg !105 + %1007 = bitcast i16 %1006 to bfloat, !dbg !105 + %1008 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105 + %1009 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %989, i64 %1008, i1 %325) #6, !dbg !105 + %1010 = bitcast i16 %1009 to bfloat, !dbg !105 + %1011 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105 + %1012 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %991, i64 %1011, i1 %325) #6, !dbg !105 + %1013 = bitcast i16 %1012 to bfloat, !dbg !105 + %1014 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105 + %1015 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %993, i64 %1014, i1 %325) #6, !dbg !105 + %1016 = bitcast i16 %1015 to bfloat, !dbg !105 + %1017 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105 + %1018 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %995, i64 %1017, i1 %325) #6, !dbg !105 + %1019 = bitcast i16 %1018 to bfloat, !dbg !105 + %1020 = fpext bfloat %998 to float, !dbg !106 + %1021 = fpext bfloat %1001 to float, !dbg !106 + %1022 = fpext bfloat %1004 to float, !dbg !106 + %1023 = fpext bfloat %1007 to float, !dbg !106 + %1024 = fpext bfloat %1010 to float, !dbg !106 + %1025 = fpext bfloat %1013 to float, !dbg !106 + %1026 = fpext bfloat %1016 to float, !dbg !106 + %1027 = fpext bfloat %1019 to float, !dbg !106 + %1028 = fmul float %276, %1020, !dbg !107 + %1029 = fmul float %276, %1021, !dbg !107 + %1030 = fmul float %276, %1022, !dbg !107 + %1031 = fmul float %276, %1023, !dbg !107 + %1032 = fmul float %276, %1024, !dbg !107 + %1033 = fmul float %276, %1025, !dbg !107 + %1034 = fmul float %276, %1026, !dbg !107 + %1035 = fmul float %276, %1027, !dbg !107 + %1036 = getelementptr bfloat, ptr addrspace(1) %6, i64 %366, !dbg !108 + %1037 = getelementptr bfloat, ptr addrspace(1) %6, i64 %368, !dbg !108 + %1038 = getelementptr bfloat, ptr addrspace(1) %6, i64 %369, !dbg !108 + %1039 = getelementptr bfloat, ptr addrspace(1) %6, i64 %370, !dbg !108 + %1040 = getelementptr bfloat, ptr addrspace(1) %6, i64 %371, !dbg !108 + %1041 = getelementptr bfloat, ptr addrspace(1) %6, i64 %372, !dbg !108 + %1042 = getelementptr bfloat, ptr addrspace(1) %6, i64 %367, !dbg !108 + %1043 = getelementptr bfloat, ptr addrspace(1) %6, i64 %373, !dbg !108 + %1044 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109 + %1045 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %1036, i64 %1044, i1 %325) #6, !dbg !109 + %1046 = bitcast i16 %1045 to bfloat, !dbg !109 + %1047 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109 + %1048 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %1037, i64 %1047, i1 %325) #6, !dbg !109 + %1049 = bitcast i16 %1048 to bfloat, !dbg !109 + %1050 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109 + %1051 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %1038, i64 %1050, i1 %325) #6, !dbg !109 + %1052 = bitcast i16 %1051 to bfloat, !dbg !109 + %1053 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109 + %1054 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %1039, i64 %1053, i1 %325) #6, !dbg !109 + %1055 = bitcast i16 %1054 to bfloat, !dbg !109 + %1056 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109 + %1057 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %1040, i64 %1056, i1 %325) #6, !dbg !109 + %1058 = bitcast i16 %1057 to bfloat, !dbg !109 + %1059 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109 + %1060 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %1041, i64 %1059, i1 %325) #6, !dbg !109 + %1061 = bitcast i16 %1060 to bfloat, !dbg !109 + %1062 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109 + %1063 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %1042, i64 %1062, i1 %325) #6, !dbg !109 + %1064 = bitcast i16 %1063 to bfloat, !dbg !109 + %1065 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109 + %1066 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %1043, i64 %1065, i1 %325) #6, !dbg !109 + %1067 = bitcast i16 %1066 to bfloat, !dbg !109 + %1068 = fpext bfloat %1046 to float, !dbg !110 + %1069 = fpext bfloat %1049 to float, !dbg !110 + %1070 = fpext bfloat %1052 to float, !dbg !110 + %1071 = fpext bfloat %1055 to float, !dbg !110 + %1072 = fpext bfloat %1058 to float, !dbg !110 + %1073 = fpext bfloat %1061 to float, !dbg !110 + %1074 = fpext bfloat %1064 to float, !dbg !110 + %1075 = fpext bfloat %1067 to float, !dbg !110 + %1076 = fmul float %1028, %1068, !dbg !111 + %1077 = fmul float %1029, %1069, !dbg !111 + %1078 = fmul float %1030, %1070, !dbg !111 + %1079 = fmul float %1031, %1071, !dbg !111 + %1080 = fmul float %1032, %1072, !dbg !111 + %1081 = fmul float %1033, %1073, !dbg !111 + %1082 = fmul float %1034, %1074, !dbg !111 + %1083 = fmul float %1035, %1075, !dbg !111 + %1084 = select i1 %321, float %964, float %1076, !dbg !88 + %1085 = select i1 %321, float %965, float %1077, !dbg !88 + %1086 = select i1 %321, float %966, float %1078, !dbg !88 + %1087 = select i1 %321, float %967, float %1079, !dbg !88 + %1088 = select i1 %321, float %968, float %1080, !dbg !88 + %1089 = select i1 %321, float %969, float %1081, !dbg !88 + %1090 = select i1 %321, float %970, float %1082, !dbg !88 + %1091 = select i1 %321, float %971, float %1083, !dbg !88 + %1092 = fmul float %.0.i59, %505, !dbg !112 + %1093 = fmul float %.0.i59, %506, !dbg !112 + %1094 = fmul float %.0.i59, %507, !dbg !112 + %1095 = fmul float %.0.i59, %508, !dbg !112 + %1096 = fmul float %.0.i59, %509, !dbg !112 + %1097 = fmul float %.0.i59, %510, !dbg !112 + %1098 = fmul float %.0.i59, %511, !dbg !112 + %1099 = fmul float %.0.i59, %512, !dbg !112 + %1100 = fmul float %1092, %532, !dbg !113 + %1101 = fmul float %1093, %533, !dbg !113 + %1102 = fmul float %1094, %534, !dbg !113 + %1103 = fmul float %1095, %535, !dbg !113 + %1104 = fmul float %1096, %536, !dbg !113 + %1105 = fmul float %1097, %537, !dbg !113 + %1106 = fmul float %1098, %538, !dbg !113 + %1107 = fmul float %1099, %539, !dbg !113 + %1108 = fmul float %1100, %440, !dbg !114 + %1109 = fmul float %1101, %441, !dbg !114 + %1110 = fmul float %1102, %442, !dbg !114 + %1111 = fmul float %1103, %443, !dbg !114 + %1112 = fmul float %1104, %450, !dbg !114 + %1113 = fmul float %1105, %451, !dbg !114 + %1114 = fmul float %1106, %452, !dbg !114 + %1115 = fmul float %1107, %453, !dbg !114 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !114 + store float %1108, ptr addrspace(3) %290, align 4, !dbg !114 + store float %1109, ptr addrspace(3) %291, align 4, !dbg !114 + store float %1110, ptr addrspace(3) %292, align 4, !dbg !114 + store float %1111, ptr addrspace(3) %293, align 4, !dbg !114 + store float %1112, ptr addrspace(3) %294, align 4, !dbg !114 + store float %1113, ptr addrspace(3) %295, align 4, !dbg !114 + store float %1114, ptr addrspace(3) %296, align 4, !dbg !114 + store float %1115, ptr addrspace(3) %297, align 4, !dbg !114 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !114 + %1116 = load float, ptr addrspace(3) %306, align 4, !dbg !114 + %1117 = load float, ptr addrspace(3) %308, align 4, !dbg !114 + %1118 = load float, ptr addrspace(3) %310, align 4, !dbg !114 + %1119 = load float, ptr addrspace(3) %312, align 4, !dbg !114 + %1120 = load float, ptr addrspace(3) %314, align 4, !dbg !114 + %1121 = load float, ptr addrspace(3) %316, align 4, !dbg !114 + %1122 = load float, ptr addrspace(3) %318, align 4, !dbg !114 + %1123 = load float, ptr addrspace(3) %320, align 4, !dbg !114 + %1124 = fmul float %476, %1084, !dbg !115 + %1125 = fmul float %477, %1085, !dbg !115 + %1126 = fmul float %478, %1086, !dbg !115 + %1127 = fmul float %479, %1087, !dbg !115 + %1128 = fmul float %480, %1088, !dbg !115 + %1129 = fmul float %481, %1089, !dbg !115 + %1130 = fmul float %482, %1090, !dbg !115 + %1131 = fmul float %483, %1091, !dbg !115 + %1132 = fadd float %1124, %1116, !dbg !116 + %1133 = fadd float %1125, %1117, !dbg !116 + %1134 = fadd float %1126, %1118, !dbg !116 + %1135 = fadd float %1127, %1119, !dbg !116 + %1136 = fadd float %1128, %1120, !dbg !116 + %1137 = fadd float %1129, %1121, !dbg !116 + %1138 = fadd float %1130, %1122, !dbg !116 + %1139 = fadd float %1131, %1123, !dbg !116 + %1140 = or disjoint i64 %365, %361, !dbg !117 + %1141 = getelementptr bfloat, ptr addrspace(1) %0, i64 %1140, !dbg !118 + %1142 = fptrunc float %844 to bfloat, !dbg !119 + %1143 = fptrunc float %845 to bfloat, !dbg !119 + %1144 = fptrunc float %846 to bfloat, !dbg !119 + %1145 = fptrunc float %847 to bfloat, !dbg !119 + %1146 = fptrunc float %848 to bfloat, !dbg !119 + %1147 = fptrunc float %849 to bfloat, !dbg !119 + %1148 = fptrunc float %850 to bfloat, !dbg !119 + %1149 = fptrunc float %851 to bfloat, !dbg !119 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119 + store bfloat %1142, ptr addrspace(3) %336, align 2, !dbg !119 + store bfloat %1143, ptr addrspace(3) %338, align 2, !dbg !119 + store bfloat %1144, ptr addrspace(3) %340, align 2, !dbg !119 + store bfloat %1145, ptr addrspace(3) %342, align 2, !dbg !119 + store bfloat %1146, ptr addrspace(3) %344, align 2, !dbg !119 + store bfloat %1147, ptr addrspace(3) %346, align 2, !dbg !119 + store bfloat %1148, ptr addrspace(3) %348, align 2, !dbg !119 + store bfloat %1149, ptr addrspace(3) %350, align 2, !dbg !119 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119 + %1150 = load i32, ptr addrspace(3) %355, align 4, !dbg !119 + %1151 = load i32, ptr addrspace(3) %356, align 4, !dbg !119 + %1152 = load i32, ptr addrspace(3) %357, align 4, !dbg !119 + %1153 = load i32, ptr addrspace(3) %358, align 4, !dbg !119 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %1150, i32 %1151, i32 %1152, i32 %1153, ptr addrspace(1) %1141, i1 true) #6, !dbg !119 + %1154 = getelementptr bfloat, ptr addrspace(1) %1, i64 %1140, !dbg !120 + %1155 = fptrunc float %1132 to bfloat, !dbg !121 + %1156 = fptrunc float %1133 to bfloat, !dbg !121 + %1157 = fptrunc float %1134 to bfloat, !dbg !121 + %1158 = fptrunc float %1135 to bfloat, !dbg !121 + %1159 = fptrunc float %1136 to bfloat, !dbg !121 + %1160 = fptrunc float %1137 to bfloat, !dbg !121 + %1161 = fptrunc float %1138 to bfloat, !dbg !121 + %1162 = fptrunc float %1139 to bfloat, !dbg !121 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !121 + store bfloat %1155, ptr addrspace(3) %336, align 2, !dbg !121 + store bfloat %1156, ptr addrspace(3) %338, align 2, !dbg !121 + store bfloat %1157, ptr addrspace(3) %340, align 2, !dbg !121 + store bfloat %1158, ptr addrspace(3) %342, align 2, !dbg !121 + store bfloat %1159, ptr addrspace(3) %344, align 2, !dbg !121 + store bfloat %1160, ptr addrspace(3) %346, align 2, !dbg !121 + store bfloat %1161, ptr addrspace(3) %348, align 2, !dbg !121 + store bfloat %1162, ptr addrspace(3) %350, align 2, !dbg !121 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !121 + %1163 = load i32, ptr addrspace(3) %355, align 4, !dbg !121 + %1164 = load i32, ptr addrspace(3) %356, align 4, !dbg !121 + %1165 = load i32, ptr addrspace(3) %357, align 4, !dbg !121 + %1166 = load i32, ptr addrspace(3) %358, align 4, !dbg !121 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %1163, i32 %1164, i32 %1165, i32 %1166, ptr addrspace(1) %1154, i1 true) #6, !dbg !121 + br i1 %364, label %363, label %1167, !dbg !47 + +1167: ; preds = %363 + ret void, !dbg !122 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", linkageName: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 23, column: 33, scope: !5) +!10 = !DILocation(line: 24, column: 44, scope: !5) +!11 = !DILocation(line: 24, column: 23, scope: !5) +!12 = !DILocation(line: 26, column: 37, scope: !5) +!13 = !DILocation(line: 29, column: 19, scope: !5) +!14 = !DILocation(line: 33, column: 43, scope: !5) +!15 = !DILocation(line: 39, column: 57, scope: !5) +!16 = !DILocation(line: 39, column: 34, scope: !5) +!17 = !DILocation(line: 39, column: 68, scope: !5) +!18 = !DILocation(line: 39, column: 121, scope: !5) +!19 = !DILocation(line: 40, column: 50, scope: !5) +!20 = !DILocation(line: 40, column: 34, scope: !5) +!21 = !DILocation(line: 40, column: 61, scope: !5) +!22 = !DILocation(line: 40, column: 114, scope: !5) +!23 = !DILocation(line: 42, column: 22, scope: !5) +!24 = !DILocation(line: 47, column: 22, scope: !5) +!25 = !DILocation(line: 34, column: 31, scope: !5) +!26 = !DILocation(line: 44, column: 23, scope: !5) +!27 = !DILocation(line: 49, column: 25, scope: !5) +!28 = !DILocation(line: 263, column: 15, scope: !29, inlinedAt: !31) +!29 = distinct !DILexicalBlockFile(scope: !5, file: !30, discriminator: 0) +!30 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!31 = !DILocation(line: 293, column: 36, scope: !29, inlinedAt: !32) +!32 = !DILocation(line: 51, column: 25, scope: !33) +!33 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!34 = !DILocation(line: 263, column: 15, scope: !29, inlinedAt: !35) +!35 = !DILocation(line: 293, column: 36, scope: !29, inlinedAt: !36) +!36 = !DILocation(line: 52, column: 27, scope: !33) +!37 = !DILocation(line: 63, column: 46, scope: !5) +!38 = !DILocation(line: 75, column: 25, scope: !5) +!39 = !DILocation(line: 77, column: 24, scope: !5) +!40 = !DILocation(line: 78, column: 32, scope: !5) +!41 = !DILocation(line: 79, column: 24, scope: !5) +!42 = !DILocation(line: 123, column: 24, scope: !5) +!43 = !DILocation(line: 124, column: 24, scope: !5) +!44 = !DILocation(line: 125, column: 32, scope: !5) +!45 = !DILocation(line: 126, column: 24, scope: !5) +!46 = !DILocation(line: 161, column: 43, scope: !5) +!47 = !DILocation(line: 53, column: 43, scope: !5) +!48 = !DILocation(line: 54, column: 31, scope: !5) +!49 = !DILocation(line: 72, column: 41, scope: !5) +!50 = !DILocation(line: 61, column: 51, scope: !5) +!51 = !DILocation(line: 61, column: 35, scope: !5) +!52 = !DILocation(line: 61, column: 62, scope: !5) +!53 = !DILocation(line: 61, column: 115, scope: !5) +!54 = !DILocation(line: 62, column: 35, scope: !5) +!55 = !DILocation(line: 62, column: 42, scope: !5) +!56 = !DILocation(line: 62, column: 95, scope: !5) +!57 = !DILocation(line: 63, column: 42, scope: !5) +!58 = !DILocation(line: 63, column: 35, scope: !5) +!59 = !DILocation(line: 63, column: 51, scope: !5) +!60 = !DILocation(line: 64, column: 35, scope: !5) +!61 = !DILocation(line: 64, column: 51, scope: !5) +!62 = !DILocation(line: 65, column: 58, scope: !5) +!63 = !DILocation(line: 65, column: 35, scope: !5) +!64 = !DILocation(line: 65, column: 69, scope: !5) +!65 = !DILocation(line: 65, column: 123, scope: !5) +!66 = !DILocation(line: 66, column: 36, scope: !5) +!67 = !DILocation(line: 66, column: 43, scope: !5) +!68 = !DILocation(line: 66, column: 96, scope: !5) +!69 = !DILocation(line: 72, column: 39, scope: !5) +!70 = !DILocation(line: 72, column: 57, scope: !5) +!71 = !DILocation(line: 72, column: 35, scope: !5) +!72 = !DILocation(line: 72, column: 68, scope: !5) +!73 = !DILocation(line: 72, column: 129, scope: !5) +!74 = !DILocation(line: 80, column: 35, scope: !5) +!75 = !DILocation(line: 80, column: 85, scope: !5) +!76 = !DILocation(line: 80, column: 146, scope: !5) +!77 = !DILocation(line: 82, column: 24, scope: !5) +!78 = !DILocation(line: 84, column: 17, scope: !5) +!79 = !DILocation(line: 90, column: 53, scope: !5) +!80 = !DILocation(line: 90, column: 35, scope: !5) +!81 = !DILocation(line: 90, column: 64, scope: !5) +!82 = !DILocation(line: 90, column: 125, scope: !5) +!83 = !DILocation(line: 97, column: 24, scope: !5) +!84 = !DILocation(line: 98, column: 35, scope: !5) +!85 = !DILocation(line: 98, column: 81, scope: !5) +!86 = !DILocation(line: 98, column: 142, scope: !5) +!87 = !DILocation(line: 100, column: 24, scope: !5) +!88 = !DILocation(line: 0, scope: !5) +!89 = !DILocation(line: 111, column: 24, scope: !5) +!90 = !DILocation(line: 113, column: 24, scope: !5) +!91 = !DILocation(line: 116, column: 24, scope: !5) +!92 = !DILocation(line: 118, column: 24, scope: !5) +!93 = !DILocation(line: 119, column: 24, scope: !5) +!94 = !DILocation(line: 121, column: 60, scope: !5) +!95 = !DILocation(line: 121, column: 35, scope: !5) +!96 = !DILocation(line: 121, column: 71, scope: !5) +!97 = !DILocation(line: 121, column: 132, scope: !5) +!98 = !DILocation(line: 127, column: 35, scope: !5) +!99 = !DILocation(line: 127, column: 85, scope: !5) +!100 = !DILocation(line: 127, column: 146, scope: !5) +!101 = !DILocation(line: 129, column: 24, scope: !5) +!102 = !DILocation(line: 131, column: 17, scope: !5) +!103 = !DILocation(line: 134, column: 60, scope: !5) +!104 = !DILocation(line: 134, column: 35, scope: !5) +!105 = !DILocation(line: 134, column: 71, scope: !5) +!106 = !DILocation(line: 134, column: 132, scope: !5) +!107 = !DILocation(line: 139, column: 24, scope: !5) +!108 = !DILocation(line: 140, column: 35, scope: !5) +!109 = !DILocation(line: 140, column: 81, scope: !5) +!110 = !DILocation(line: 140, column: 142, scope: !5) +!111 = !DILocation(line: 142, column: 24, scope: !5) +!112 = !DILocation(line: 151, column: 25, scope: !5) +!113 = !DILocation(line: 153, column: 26, scope: !5) +!114 = !DILocation(line: 156, column: 26, scope: !5) +!115 = !DILocation(line: 158, column: 26, scope: !5) +!116 = !DILocation(line: 159, column: 26, scope: !5) +!117 = !DILocation(line: 161, column: 39, scope: !5) +!118 = !DILocation(line: 161, column: 32, scope: !5) +!119 = !DILocation(line: 161, column: 55, scope: !5) +!120 = !DILocation(line: 162, column: 32, scope: !5) +!121 = !DILocation(line: 162, column: 56, scope: !5) +!122 = !DILocation(line: 53, column: 4, scope: !5) diff --git a/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..6ba16afc7c5c89773bb586712c820d45b1a4eb19 --- /dev/null +++ b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx @@ -0,0 +1,2014 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 // -- Begin function triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 +.visible .entry triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0( + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6, + .param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_7, + .param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_8, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_9, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_10 +) +.reqntid 512 +{ + .reg .pred %p<6>; + .reg .b16 %rs<146>; + .reg .b32 %r<543>; + .reg .b64 %rd<201>; + .loc 1 18 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0 + +// %bb.0: // %__nv_rsqrtf.exit + ld.param.b64 %rd11, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6]; + ld.param.b64 %rd10, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5]; + ld.param.b64 %rd9, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4]; + ld.param.b64 %rd8, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3]; + ld.param.b64 %rd7, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2]; + ld.param.b64 %rd6, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1]; + ld.param.b64 %rd5, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0]; +$L__tmp0: + .loc 1 23 28 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:28 + mov.u32 %r47, %ctaid.x; + .loc 1 23 33 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:33 + shl.b32 %r48, %r47, 6; + .loc 1 24 44 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44 + mov.u32 %r49, %tid.x; + and.b32 %r50, %r49, 504; + bfe.u32 %r51, %r49, 3, 6; + .loc 1 24 23 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23 + or.b32 %r52, %r51, %r48; + .loc 1 26 37 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37 + and.b32 %r53, %r49, 7; + shl.b32 %r54, %r53, 3; + .loc 1 29 19 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19 + bfe.s32 %r55, %r47, 25, 1; + shr.u32 %r56, %r55, 27; + add.s32 %r57, %r52, %r56; + shr.s32 %r58, %r57, 5; + shl.b32 %r59, %r52, 7; + shl.b32 %r60, %r58, 15; + add.s32 %r1, %r60, %r59; + add.s32 %r2, %r1, 4096; + .loc 1 33 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:33:43 + cvt.u64.u32 %rd1, %r54; + .loc 1 39 57 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:57 + or.b32 %r61, %r2, %r54; + .loc 1 39 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34 + mad.wide.s32 %rd12, %r61, 2, %rd7; + .loc 1 39 68 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68 + // begin inline asm + mov.u64 %rd13, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd13, 1.0; + // end inline asm + mov.b32 %r34, 0; + mov.pred %p2, -1; + // begin inline asm + mov.u32 %r30, %r34; + mov.u32 %r31, %r34; + mov.u32 %r32, %r34; + mov.u32 %r33, %r34; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r30, %r31, %r32, %r33 }, [ %rd12 + 0 ], %rd13; + // end inline asm + mov.b32 {%rs1, %rs2}, %r30; + mov.b32 {%rs3, %rs4}, %r31; + mov.b32 {%rs5, %rs6}, %r32; + mov.b32 {%rs7, %rs8}, %r33; + .loc 1 39 121 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:121 + cvt.f32.bf16 %r62, %rs1; + cvt.f32.bf16 %r63, %rs2; + cvt.f32.bf16 %r64, %rs3; + cvt.f32.bf16 %r65, %rs4; + cvt.f32.bf16 %r66, %rs5; + cvt.f32.bf16 %r67, %rs6; + cvt.f32.bf16 %r68, %rs7; + cvt.f32.bf16 %r69, %rs8; + .loc 1 40 50 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:50 + or.b32 %r70, %r1, %r54; + .loc 1 40 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34 + mad.wide.s32 %rd14, %r70, 2, %rd7; + .loc 1 40 61 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61 + // begin inline asm + mov.u64 %rd15, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd15, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r35, %r34; + mov.u32 %r36, %r34; + mov.u32 %r37, %r34; + mov.u32 %r38, %r34; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r35, %r36, %r37, %r38 }, [ %rd14 + 0 ], %rd15; + // end inline asm + mov.b32 {%rs9, %rs10}, %r35; + mov.b32 {%rs11, %rs12}, %r36; + mov.b32 {%rs13, %rs14}, %r37; + mov.b32 {%rs15, %rs16}, %r38; + .loc 1 40 114 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114 + cvt.f32.bf16 %r71, %rs9; + cvt.f32.bf16 %r72, %rs10; + cvt.f32.bf16 %r73, %rs11; + cvt.f32.bf16 %r74, %rs12; + cvt.f32.bf16 %r75, %rs13; + cvt.f32.bf16 %r76, %rs14; + cvt.f32.bf16 %r77, %rs15; + cvt.f32.bf16 %r78, %rs16; + .loc 1 39 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34 + cvt.s64.s32 %rd20, %r2; + or.b64 %rd21, %rd20, %rd1; + shl.b64 %rd22, %rd21, 1; + add.s64 %rd23, %rd7, %rd22; + add.s64 %rd16, %rd23, 128; + .loc 1 39 68 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68 + // begin inline asm + mov.u64 %rd17, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r39, %r34; + mov.u32 %r40, %r34; + mov.u32 %r41, %r34; + mov.u32 %r42, %r34; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r39, %r40, %r41, %r42 }, [ %rd16 + 0 ], %rd17; + // end inline asm + mov.b32 {%rs17, %rs18}, %r39; + mov.b32 {%rs19, %rs20}, %r40; + mov.b32 {%rs21, %rs22}, %r41; + mov.b32 {%rs23, %rs24}, %r42; + .loc 1 39 121 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:121 + cvt.f32.bf16 %r79, %rs17; + cvt.f32.bf16 %r80, %rs18; + cvt.f32.bf16 %r81, %rs19; + cvt.f32.bf16 %r82, %rs20; + cvt.f32.bf16 %r83, %rs21; + cvt.f32.bf16 %r84, %rs22; + cvt.f32.bf16 %r85, %rs23; + cvt.f32.bf16 %r86, %rs24; + .loc 1 40 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34 + cvt.s64.s32 %rd24, %r1; + or.b64 %rd25, %rd24, %rd1; + shl.b64 %rd26, %rd25, 1; + add.s64 %rd27, %rd7, %rd26; + add.s64 %rd18, %rd27, 128; + .loc 1 40 61 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61 + // begin inline asm + mov.u64 %rd19, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd19, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r43, %r34; + mov.u32 %r44, %r34; + mov.u32 %r45, %r34; + mov.u32 %r46, %r34; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r43, %r44, %r45, %r46 }, [ %rd18 + 0 ], %rd19; + // end inline asm + mov.b32 {%rs25, %rs26}, %r43; + mov.b32 {%rs27, %rs28}, %r44; + mov.b32 {%rs29, %rs30}, %r45; + mov.b32 {%rs31, %rs32}, %r46; + .loc 1 40 114 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114 + cvt.f32.bf16 %r87, %rs25; + cvt.f32.bf16 %r88, %rs26; + cvt.f32.bf16 %r89, %rs27; + cvt.f32.bf16 %r90, %rs28; + cvt.f32.bf16 %r91, %rs29; + cvt.f32.bf16 %r92, %rs30; + cvt.f32.bf16 %r93, %rs31; + cvt.f32.bf16 %r94, %rs32; + .loc 1 42 22 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:42:22 + mul.f32 %r95, %r79, %r79; + mul.f32 %r96, %r80, %r80; + mul.f32 %r97, %r81, %r81; + mul.f32 %r98, %r82, %r82; + mul.f32 %r99, %r83, %r83; + mul.f32 %r100, %r84, %r84; + mul.f32 %r101, %r85, %r85; + mul.f32 %r102, %r86, %r86; + .loc 1 44 23 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:44:23 + fma.rn.f32 %r103, %r62, %r62, %r95; + fma.rn.f32 %r104, %r63, %r63, %r96; + fma.rn.f32 %r105, %r64, %r64, %r97; + fma.rn.f32 %r106, %r65, %r65, %r98; + fma.rn.f32 %r107, %r66, %r66, %r99; + fma.rn.f32 %r108, %r67, %r67, %r100; + fma.rn.f32 %r109, %r68, %r68, %r101; + fma.rn.f32 %r110, %r69, %r69, %r102; + .loc 1 47 22 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:47:22 + mul.f32 %r111, %r87, %r87; + mul.f32 %r112, %r88, %r88; + mul.f32 %r113, %r89, %r89; + mul.f32 %r114, %r90, %r90; + mul.f32 %r115, %r91, %r91; + mul.f32 %r116, %r92, %r92; + mul.f32 %r117, %r93, %r93; + mul.f32 %r118, %r94, %r94; + .loc 1 49 25 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:49:25 + fma.rn.f32 %r119, %r71, %r71, %r111; + fma.rn.f32 %r120, %r72, %r72, %r112; + fma.rn.f32 %r121, %r73, %r73, %r113; + fma.rn.f32 %r122, %r74, %r74, %r114; + fma.rn.f32 %r123, %r75, %r75, %r115; + fma.rn.f32 %r124, %r76, %r76, %r116; + fma.rn.f32 %r125, %r77, %r77, %r117; + fma.rn.f32 %r126, %r78, %r78, %r118; + .loc 1 24 44 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44 + and.b32 %r127, %r49, 63; + .loc 1 24 23 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23 + or.b32 %r128, %r48, %r127; + .loc 1 26 37 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37 + shr.u32 %r129, %r49, 6; + and.b32 %r130, %r129, 6; + .loc 1 29 19 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19 + add.s32 %r131, %r128, %r56; +$L__tmp1: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r132, %r103, %r104; + add.f32 %r133, %r105, %r132; + add.f32 %r134, %r106, %r133; + add.f32 %r135, %r107, %r134; + add.f32 %r136, %r108, %r135; + add.f32 %r137, %r109, %r136; + add.f32 %r138, %r110, %r137; +$L__tmp2: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r139, %r138, 4, 31, -1; +$L__tmp3: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r140, %r138, %r139; +$L__tmp4: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r141, %r140, 2, 31, -1; +$L__tmp5: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r142, %r140, %r141; +$L__tmp6: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r143, %r142, 1, 31, -1; +$L__tmp7: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r144, %r142, %r143; +$L__tmp8: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r145, %r119, %r120; + add.f32 %r146, %r121, %r145; + add.f32 %r147, %r122, %r146; + add.f32 %r148, %r123, %r147; + add.f32 %r149, %r124, %r148; + add.f32 %r150, %r125, %r149; + add.f32 %r151, %r126, %r150; +$L__tmp9: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r152, %r151, 4, 31, -1; +$L__tmp10: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r153, %r151, %r152; +$L__tmp11: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r154, %r153, 2, 31, -1; +$L__tmp12: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r155, %r153, %r154; +$L__tmp13: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r156, %r155, 1, 31, -1; +$L__tmp14: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r157, %r155, %r156; +$L__tmp15: + .loc 1 63 46 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:46 + shl.b32 %r158, %r58, 7; + mov.b32 %r159, 0f43000000; + .loc 1 75 25 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:75:25 + div.full.f32 %r160, %r157, %r159; + .loc 1 77 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:77:24 + add.f32 %r161, %r160, 0f358637BD; + .loc 1 78 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:78:32 + rsqrt.approx.ftz.f32 %r3, %r161; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + shr.u32 %r162, %r50, 1; + mov.b32 %r163, global_smem; + add.s32 %r164, %r163, %r162; + st.shared.b32 [%r164], %r3; + bar.sync 0; + shl.b32 %r165, %r127, 2; + add.s32 %r166, %r163, %r165; + ld.shared.b32 %r4, [%r166]; + .loc 1 123 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:123:24 + div.full.f32 %r167, %r144, %r159; + .loc 1 124 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:124:24 + add.f32 %r168, %r167, 0f358637BD; + .loc 1 125 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:125:32 + rsqrt.approx.ftz.f32 %r5, %r168; + .loc 1 126 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24 + bar.sync 0; + st.shared.b32 [%r164], %r5; + bar.sync 0; + ld.shared.b32 %r6, [%r166]; + bfe.u32 %r7, %r49, 6, 1; + or.b32 %r8, %r54, %r158; + and.b32 %r169, %r49, 224; + shl.b32 %r170, %r169, 6; + shl.b32 %r171, %r49, 2; + and.b32 %r172, %r171, 124; + shr.u32 %r173, %r169, 3; + shr.u32 %r174, %r49, 1; + and.b32 %r175, %r174, 128; + or.b32 %r176, %r170, %r172; + xor.b32 %r177, %r176, %r173; + add.s32 %r178, %r163, %r175; + add.s32 %r9, %r178, %r177; + and.b32 %r179, %r49, 28; + shl.b32 %r180, %r179, 9; + shl.b32 %r181, %r49, 5; + and.b32 %r182, %r181, 96; + and.b32 %r183, %r171, 1920; + or.b32 %r184, %r180, %r182; + or.b32 %r185, %r184, %r183; + or.b32 %r186, %r185, %r179; + add.s32 %r10, %r163, %r186; + xor.b32 %r187, %r186, 4; + add.s32 %r11, %r163, %r187; + xor.b32 %r188, %r186, 8; + add.s32 %r12, %r163, %r188; + xor.b32 %r189, %r186, 12; + add.s32 %r13, %r163, %r189; + xor.b32 %r190, %r186, 16; + add.s32 %r14, %r163, %r190; + xor.b32 %r191, %r186, 20; + add.s32 %r15, %r163, %r191; + xor.b32 %r192, %r186, 24; + add.s32 %r16, %r163, %r192; + xor.b32 %r193, %r186, 28; + add.s32 %r17, %r163, %r193; + shl.b32 %r194, %r128, 7; + shl.b32 %r195, %r131, 10; + and.b32 %r196, %r195, -32768; + add.s32 %r18, %r196, %r194; + add.s32 %r19, %r18, 4097; + add.s32 %r20, %r18, 4096; + shl.b32 %r197, %r179, 8; + shl.b32 %r198, %r49, 1; + and.b32 %r199, %r198, 768; + shr.u32 %r200, %r49, 5; + and.b32 %r201, %r200, 2; + or.b32 %r202, %r199, %r201; + or.b32 %r203, %r202, %r197; + or.b32 %r204, %r203, %r165; + add.s32 %r21, %r163, %r204; + xor.b32 %r205, %r204, 16; + add.s32 %r22, %r163, %r205; + xor.b32 %r206, %r204, 32; + add.s32 %r23, %r163, %r206; + xor.b32 %r207, %r204, 48; + add.s32 %r24, %r163, %r207; + xor.b32 %r208, %r204, 64; + add.s32 %r25, %r163, %r208; + xor.b32 %r209, %r204, 80; + add.s32 %r26, %r163, %r209; + xor.b32 %r210, %r204, 96; + add.s32 %r27, %r163, %r210; + xor.b32 %r211, %r204, 112; + add.s32 %r28, %r163, %r211; + shl.b32 %r212, %r169, 5; + shl.b32 %r213, %r53, 4; + or.b32 %r214, %r212, %r213; + xor.b32 %r215, %r214, %r162; + add.s32 %r29, %r163, %r215; + .loc 1 53 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43 + cvt.u64.u32 %rd2, %r130; + cvt.s64.s32 %rd3, %r158; + cvt.s64.s32 %rd4, %r59; + mov.b64 %rd200, 0; + mov.pred %p5, %p2; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 0 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0:43 + mov.pred %p1, %p5; + setp.ne.b32 %p4, %r7, 0; + setp.eq.b32 %p3, %r7, 0; + .loc 1 54 31 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:54:31 + or.b64 %rd174, %rd200, %rd1; + or.b64 %rd175, %rd200, %rd2; + .loc 1 61 51 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:51 + cvt.u32.u64 %r256, %rd174; + or.b32 %r257, %r1, %r256; + .loc 1 61 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:35 + mad.wide.s32 %rd29, %r257, 2, %rd7; + .loc 1 61 62 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:62 + // begin inline asm + mov.u64 %rd28, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd28, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r216, %r34; + mov.u32 %r217, %r34; + mov.u32 %r218, %r34; + mov.u32 %r219, %r34; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r216, %r217, %r218, %r219 }, [ %rd29 + 0 ], %rd28; + // end inline asm + mov.b32 {%rs98, %rs99}, %r216; + mov.b32 {%rs100, %rs101}, %r217; + mov.b32 {%rs102, %rs103}, %r218; + mov.b32 {%rs104, %rs105}, %r219; + .loc 1 61 115 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:115 + cvt.f32.bf16 %r258, %rs98; + cvt.f32.bf16 %r259, %rs99; + cvt.f32.bf16 %r260, %rs100; + cvt.f32.bf16 %r261, %rs101; + cvt.f32.bf16 %r262, %rs102; + cvt.f32.bf16 %r263, %rs103; + cvt.f32.bf16 %r264, %rs104; + cvt.f32.bf16 %r265, %rs105; + .loc 1 62 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:35 + shl.b64 %rd176, %rd174, 1; + add.s64 %rd31, %rd8, %rd176; + .loc 1 62 42 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:42 + // begin inline asm + mov.u64 %rd30, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd30, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r220, %r34; + mov.u32 %r221, %r34; + mov.u32 %r222, %r34; + mov.u32 %r223, %r34; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r220, %r221, %r222, %r223 }, [ %rd31 + 0 ], %rd30; + // end inline asm + mov.b32 {%rs106, %rs107}, %r220; + mov.b32 {%rs108, %rs109}, %r221; + mov.b32 {%rs110, %rs111}, %r222; + mov.b32 {%rs112, %rs113}, %r223; + .loc 1 62 95 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:95 + cvt.f32.bf16 %r266, %rs106; + cvt.f32.bf16 %r267, %rs107; + cvt.f32.bf16 %r268, %rs108; + cvt.f32.bf16 %r269, %rs109; + cvt.f32.bf16 %r270, %rs110; + cvt.f32.bf16 %r271, %rs111; + cvt.f32.bf16 %r272, %rs112; + cvt.f32.bf16 %r273, %rs113; + .loc 1 63 42 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:42 + or.b64 %rd177, %rd174, %rd3; + .loc 1 63 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:35 + shl.b64 %rd178, %rd177, 2; + add.s64 %rd33, %rd9, %rd178; + .loc 1 53 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43 + cvt.s64.s32 %rd179, %r8; + add.s64 %rd180, %rd200, %rd179; + .loc 1 63 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:35 + shl.b64 %rd181, %rd180, 2; + add.s64 %rd182, %rd9, %rd181; + add.s64 %rd35, %rd182, 16; + .loc 1 63 51 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:51 + // begin inline asm + mov.u64 %rd32, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd32, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r224, %r34; + mov.u32 %r225, %r34; + mov.u32 %r226, %r34; + mov.u32 %r227, %r34; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r224, %r225, %r226, %r227 }, [ %rd33 + 0 ], %rd32; + // end inline asm + // begin inline asm + mov.u64 %rd34, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd34, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r228, %r34; + mov.u32 %r229, %r34; + mov.u32 %r230, %r34; + mov.u32 %r231, %r34; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r228, %r229, %r230, %r231 }, [ %rd35 + 0 ], %rd34; + // end inline asm + .loc 1 64 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:35 + add.s64 %rd37, %rd10, %rd178; + add.s64 %rd183, %rd10, %rd181; + add.s64 %rd39, %rd183, 16; + .loc 1 64 51 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:51 + // begin inline asm + mov.u64 %rd36, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd36, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r232, %r34; + mov.u32 %r233, %r34; + mov.u32 %r234, %r34; + mov.u32 %r235, %r34; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r232, %r233, %r234, %r235 }, [ %rd37 + 0 ], %rd36; + // end inline asm + // begin inline asm + mov.u64 %rd38, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd38, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r236, %r34; + mov.u32 %r237, %r34; + mov.u32 %r238, %r34; + mov.u32 %r239, %r34; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r236, %r237, %r238, %r239 }, [ %rd39 + 0 ], %rd38; + // end inline asm + bar.sync 0; + st.shared.b32 [%r9], %r232; + st.shared.b32 [%r9+256], %r233; + st.shared.b32 [%r9+512], %r234; + st.shared.b32 [%r9+768], %r235; + st.shared.b32 [%r9+1024], %r236; + st.shared.b32 [%r9+1280], %r237; + st.shared.b32 [%r9+1536], %r238; + st.shared.b32 [%r9+1792], %r239; + bar.sync 0; + ld.shared.b32 %r274, [%r10]; + ld.shared.b32 %r275, [%r11]; + ld.shared.b32 %r276, [%r12]; + ld.shared.b32 %r277, [%r13]; + ld.shared.b32 %r278, [%r14]; + ld.shared.b32 %r279, [%r15]; + ld.shared.b32 %r280, [%r16]; + ld.shared.b32 %r281, [%r17]; + .loc 1 65 58 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:58 + or.b32 %r282, %r2, %r256; + .loc 1 65 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:35 + mad.wide.s32 %rd41, %r282, 2, %rd7; + .loc 1 65 69 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69 + // begin inline asm + mov.u64 %rd40, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd40, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r240, %r34; + mov.u32 %r241, %r34; + mov.u32 %r242, %r34; + mov.u32 %r243, %r34; + @%p2 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r240, %r241, %r242, %r243 }, [ %rd41 + 0 ], %rd40; + // end inline asm + mov.b32 {%rs114, %rs115}, %r240; + mov.b32 {%rs116, %rs117}, %r241; + mov.b32 {%rs118, %rs119}, %r242; + mov.b32 {%rs120, %rs121}, %r243; + .loc 1 65 123 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:123 + cvt.f32.bf16 %r283, %rs114; + cvt.f32.bf16 %r284, %rs115; + cvt.f32.bf16 %r285, %rs116; + cvt.f32.bf16 %r286, %rs117; + cvt.f32.bf16 %r287, %rs118; + cvt.f32.bf16 %r288, %rs119; + cvt.f32.bf16 %r289, %rs120; + cvt.f32.bf16 %r290, %rs121; + .loc 1 66 36 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:36 + add.s64 %rd43, %rd11, %rd176; + .loc 1 66 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:43 + // begin inline asm + mov.u64 %rd42, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd42, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r244, %r34; + mov.u32 %r245, %r34; + mov.u32 %r246, %r34; + mov.u32 %r247, %r34; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r244, %r245, %r246, %r247 }, [ %rd43 + 0 ], %rd42; + // end inline asm + mov.b32 {%rs122, %rs123}, %r244; + mov.b32 {%rs124, %rs125}, %r245; + mov.b32 {%rs126, %rs127}, %r246; + mov.b32 {%rs128, %rs129}, %r247; + .loc 1 66 96 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:96 + cvt.f32.bf16 %r291, %rs122; + cvt.f32.bf16 %r292, %rs123; + cvt.f32.bf16 %r293, %rs124; + cvt.f32.bf16 %r294, %rs125; + cvt.f32.bf16 %r295, %rs126; + cvt.f32.bf16 %r296, %rs127; + cvt.f32.bf16 %r297, %rs128; + cvt.f32.bf16 %r298, %rs129; + .loc 1 72 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35 + cvt.s64.s32 %rd184, %r18; + .loc 1 72 57 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:57 + cvt.u32.u64 %r299, %rd175; + .loc 1 72 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35 + cvt.s64.s32 %rd185, %rd175; + add.s64 %rd186, %rd184, %rd185; + shl.b64 %rd187, %rd186, 1; + add.s64 %rd188, %rd7, %rd187; + add.s64 %rd45, %rd188, 2; + add.s64 %rd47, %rd188, 18; + add.s64 %rd49, %rd188, 34; + add.s64 %rd51, %rd188, 50; + add.s64 %rd53, %rd188, 66; + add.s64 %rd55, %rd188, 82; + add.s64 %rd57, %rd188, 98; + add.s64 %rd59, %rd188, 114; + .loc 1 72 68 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:68 + // begin inline asm + mov.u64 %rd44, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd44, 1.0; + // end inline asm + mov.b16 %rs34, 0; + // begin inline asm + mov.u16 %rs33, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs33 }, [ %rd45 + 0 ], %rd44; + // end inline asm + // begin inline asm + mov.u64 %rd46, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd46, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs35, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs35 }, [ %rd47 + 0 ], %rd46; + // end inline asm + // begin inline asm + mov.u64 %rd48, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd48, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs36, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs36 }, [ %rd49 + 0 ], %rd48; + // end inline asm + // begin inline asm + mov.u64 %rd50, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd50, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs37, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs37 }, [ %rd51 + 0 ], %rd50; + // end inline asm + // begin inline asm + mov.u64 %rd52, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd52, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs38, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs38 }, [ %rd53 + 0 ], %rd52; + // end inline asm + // begin inline asm + mov.u64 %rd54, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd54, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs39, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs39 }, [ %rd55 + 0 ], %rd54; + // end inline asm + // begin inline asm + mov.u64 %rd56, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd56, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs40, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs40 }, [ %rd57 + 0 ], %rd56; + // end inline asm + // begin inline asm + mov.u64 %rd58, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd58, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs41, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs41 }, [ %rd59 + 0 ], %rd58; + // end inline asm + .loc 1 72 129 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129 + cvt.f32.bf16 %r300, %rs33; + cvt.f32.bf16 %r301, %rs35; + cvt.f32.bf16 %r302, %rs36; + cvt.f32.bf16 %r303, %rs37; + cvt.f32.bf16 %r304, %rs38; + cvt.f32.bf16 %r305, %rs39; + cvt.f32.bf16 %r306, %rs40; + cvt.f32.bf16 %r307, %rs41; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + mul.f32 %r308, %r4, %r300; + mul.f32 %r309, %r4, %r301; + mul.f32 %r310, %r4, %r302; + mul.f32 %r311, %r4, %r303; + mul.f32 %r312, %r4, %r304; + mul.f32 %r313, %r4, %r305; + mul.f32 %r314, %r4, %r306; + mul.f32 %r315, %r4, %r307; + .loc 1 80 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:35 + shl.b64 %rd189, %rd175, 1; + add.s64 %rd93, %rd8, %rd189; + add.s64 %rd61, %rd93, 2; + add.s64 %rd63, %rd93, 18; + add.s64 %rd65, %rd93, 34; + add.s64 %rd67, %rd93, 50; + add.s64 %rd69, %rd93, 66; + add.s64 %rd71, %rd93, 82; + add.s64 %rd73, %rd93, 98; + add.s64 %rd75, %rd93, 114; + .loc 1 80 85 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:85 + // begin inline asm + mov.u64 %rd60, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd60, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs42, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs42 }, [ %rd61 + 0 ], %rd60; + // end inline asm + // begin inline asm + mov.u64 %rd62, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd62, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs43, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs43 }, [ %rd63 + 0 ], %rd62; + // end inline asm + // begin inline asm + mov.u64 %rd64, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd64, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs44, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs44 }, [ %rd65 + 0 ], %rd64; + // end inline asm + // begin inline asm + mov.u64 %rd66, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd66, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs45, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs45 }, [ %rd67 + 0 ], %rd66; + // end inline asm + // begin inline asm + mov.u64 %rd68, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd68, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs46, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs46 }, [ %rd69 + 0 ], %rd68; + // end inline asm + // begin inline asm + mov.u64 %rd70, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd70, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs47, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs47 }, [ %rd71 + 0 ], %rd70; + // end inline asm + // begin inline asm + mov.u64 %rd72, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd72, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs48, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs48 }, [ %rd73 + 0 ], %rd72; + // end inline asm + // begin inline asm + mov.u64 %rd74, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd74, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs49, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs49 }, [ %rd75 + 0 ], %rd74; + // end inline asm + .loc 1 80 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146 + cvt.f32.bf16 %r316, %rs42; + cvt.f32.bf16 %r317, %rs43; + cvt.f32.bf16 %r318, %rs44; + cvt.f32.bf16 %r319, %rs45; + cvt.f32.bf16 %r320, %rs46; + cvt.f32.bf16 %r321, %rs47; + cvt.f32.bf16 %r322, %rs48; + cvt.f32.bf16 %r323, %rs49; + .loc 1 84 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17 + neg.f32 %r324, %r308; + fma.rn.f32 %r325, %r324, %r316, 0f00000000; + neg.f32 %r326, %r309; + fma.rn.f32 %r327, %r326, %r317, 0f00000000; + neg.f32 %r328, %r310; + fma.rn.f32 %r329, %r328, %r318, 0f00000000; + neg.f32 %r330, %r311; + fma.rn.f32 %r331, %r330, %r319, 0f00000000; + neg.f32 %r332, %r312; + fma.rn.f32 %r333, %r332, %r320, 0f00000000; + neg.f32 %r334, %r313; + fma.rn.f32 %r335, %r334, %r321, 0f00000000; + neg.f32 %r336, %r314; + fma.rn.f32 %r337, %r336, %r322, 0f00000000; + neg.f32 %r338, %r315; + fma.rn.f32 %r339, %r338, %r323, 0f00000000; + .loc 1 90 53 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:53 + or.b32 %r340, %r18, %r299; + .loc 1 90 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:35 + mad.wide.s32 %rd77, %r340, 2, %rd7; + add.s64 %rd79, %rd188, 16; + add.s64 %rd81, %rd188, 32; + add.s64 %rd83, %rd188, 48; + add.s64 %rd85, %rd188, 64; + add.s64 %rd87, %rd188, 80; + add.s64 %rd89, %rd188, 96; + add.s64 %rd91, %rd188, 112; + .loc 1 90 64 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:64 + // begin inline asm + mov.u64 %rd76, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd76, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs50, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs50 }, [ %rd77 + 0 ], %rd76; + // end inline asm + // begin inline asm + mov.u64 %rd78, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd78, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs51, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs51 }, [ %rd79 + 0 ], %rd78; + // end inline asm + // begin inline asm + mov.u64 %rd80, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd80, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs52, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs52 }, [ %rd81 + 0 ], %rd80; + // end inline asm + // begin inline asm + mov.u64 %rd82, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd82, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs53, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs53 }, [ %rd83 + 0 ], %rd82; + // end inline asm + // begin inline asm + mov.u64 %rd84, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd84, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs54, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs54 }, [ %rd85 + 0 ], %rd84; + // end inline asm + // begin inline asm + mov.u64 %rd86, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd86, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs55, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs55 }, [ %rd87 + 0 ], %rd86; + // end inline asm + // begin inline asm + mov.u64 %rd88, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd88, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs56, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs56 }, [ %rd89 + 0 ], %rd88; + // end inline asm + // begin inline asm + mov.u64 %rd90, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd90, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs57, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs57 }, [ %rd91 + 0 ], %rd90; + // end inline asm + .loc 1 90 125 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125 + cvt.f32.bf16 %r341, %rs50; + cvt.f32.bf16 %r342, %rs51; + cvt.f32.bf16 %r343, %rs52; + cvt.f32.bf16 %r344, %rs53; + cvt.f32.bf16 %r345, %rs54; + cvt.f32.bf16 %r346, %rs55; + cvt.f32.bf16 %r347, %rs56; + cvt.f32.bf16 %r348, %rs57; + .loc 1 97 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24 + mul.f32 %r349, %r4, %r341; + mul.f32 %r350, %r4, %r342; + mul.f32 %r351, %r4, %r343; + mul.f32 %r352, %r4, %r344; + mul.f32 %r353, %r4, %r345; + mul.f32 %r354, %r4, %r346; + mul.f32 %r355, %r4, %r347; + mul.f32 %r356, %r4, %r348; + .loc 1 98 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:35 + add.s64 %rd95, %rd93, 16; + add.s64 %rd97, %rd93, 32; + add.s64 %rd99, %rd93, 48; + add.s64 %rd101, %rd93, 64; + add.s64 %rd103, %rd93, 80; + add.s64 %rd105, %rd93, 96; + add.s64 %rd107, %rd93, 112; + .loc 1 98 81 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:81 + // begin inline asm + mov.u64 %rd92, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd92, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs58, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs58 }, [ %rd93 + 0 ], %rd92; + // end inline asm + // begin inline asm + mov.u64 %rd94, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd94, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs59, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs59 }, [ %rd95 + 0 ], %rd94; + // end inline asm + // begin inline asm + mov.u64 %rd96, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd96, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs60, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs60 }, [ %rd97 + 0 ], %rd96; + // end inline asm + // begin inline asm + mov.u64 %rd98, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd98, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs61, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs61 }, [ %rd99 + 0 ], %rd98; + // end inline asm + // begin inline asm + mov.u64 %rd100, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd100, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs62, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs62 }, [ %rd101 + 0 ], %rd100; + // end inline asm + // begin inline asm + mov.u64 %rd102, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd102, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs63, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs63 }, [ %rd103 + 0 ], %rd102; + // end inline asm + // begin inline asm + mov.u64 %rd104, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd104, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs64, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs64 }, [ %rd105 + 0 ], %rd104; + // end inline asm + // begin inline asm + mov.u64 %rd106, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd106, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs65, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs65 }, [ %rd107 + 0 ], %rd106; + // end inline asm + .loc 1 98 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142 + cvt.f32.bf16 %r357, %rs58; + cvt.f32.bf16 %r358, %rs59; + cvt.f32.bf16 %r359, %rs60; + cvt.f32.bf16 %r360, %rs61; + cvt.f32.bf16 %r361, %rs62; + cvt.f32.bf16 %r362, %rs63; + cvt.f32.bf16 %r363, %rs64; + cvt.f32.bf16 %r364, %rs65; + .loc 1 100 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24 + mul.f32 %r365, %r349, %r357; + mul.f32 %r366, %r350, %r358; + mul.f32 %r367, %r351, %r359; + mul.f32 %r368, %r352, %r360; + mul.f32 %r369, %r353, %r361; + mul.f32 %r370, %r354, %r362; + mul.f32 %r371, %r355, %r363; + mul.f32 %r372, %r356, %r364; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r373, %r325, %r365, %p3; + selp.f32 %r374, %r327, %r366, %p3; + selp.f32 %r375, %r329, %r367, %p3; + selp.f32 %r376, %r331, %r368, %p3; + selp.f32 %r377, %r333, %r369, %p3; + selp.f32 %r378, %r335, %r370, %p3; + selp.f32 %r379, %r337, %r371, %p3; + selp.f32 %r380, %r339, %r372, %p3; + .loc 1 111 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:111:24 + mul.f32 %r381, %r3, %r258; + mul.f32 %r382, %r3, %r259; + mul.f32 %r383, %r3, %r260; + mul.f32 %r384, %r3, %r261; + mul.f32 %r385, %r3, %r262; + mul.f32 %r386, %r3, %r263; + mul.f32 %r387, %r3, %r264; + mul.f32 %r388, %r3, %r265; + .loc 1 113 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:113:24 + mul.f32 %r389, %r381, %r266; + mul.f32 %r390, %r382, %r267; + mul.f32 %r391, %r383, %r268; + mul.f32 %r392, %r384, %r269; + mul.f32 %r393, %r385, %r270; + mul.f32 %r394, %r386, %r271; + mul.f32 %r395, %r387, %r272; + mul.f32 %r396, %r388, %r273; + .loc 1 116 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:116:24 + mul.f32 %r397, %r389, %r224; + mul.f32 %r398, %r390, %r225; + mul.f32 %r399, %r391, %r226; + mul.f32 %r400, %r392, %r227; + mul.f32 %r401, %r393, %r228; + mul.f32 %r402, %r394, %r229; + mul.f32 %r403, %r395, %r230; + mul.f32 %r404, %r396, %r231; + bar.sync 0; + st.shared.b32 [%r9], %r397; + st.shared.b32 [%r9+256], %r398; + st.shared.b32 [%r9+512], %r399; + st.shared.b32 [%r9+768], %r400; + st.shared.b32 [%r9+1024], %r401; + st.shared.b32 [%r9+1280], %r402; + st.shared.b32 [%r9+1536], %r403; + st.shared.b32 [%r9+1792], %r404; + bar.sync 0; + ld.shared.b32 %r405, [%r10]; + ld.shared.b32 %r406, [%r11]; + ld.shared.b32 %r407, [%r12]; + ld.shared.b32 %r408, [%r13]; + ld.shared.b32 %r409, [%r14]; + ld.shared.b32 %r410, [%r15]; + ld.shared.b32 %r411, [%r16]; + ld.shared.b32 %r412, [%r17]; + .loc 1 119 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24 + fma.rn.f32 %r413, %r274, %r373, %r405; + fma.rn.f32 %r414, %r275, %r374, %r406; + fma.rn.f32 %r415, %r276, %r375, %r407; + fma.rn.f32 %r416, %r277, %r376, %r408; + fma.rn.f32 %r417, %r278, %r377, %r409; + fma.rn.f32 %r418, %r279, %r378, %r410; + fma.rn.f32 %r419, %r280, %r379, %r411; + fma.rn.f32 %r420, %r281, %r380, %r412; + .loc 1 121 60 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:60 + or.b32 %r421, %r19, %r299; + .loc 1 121 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:35 + mad.wide.s32 %rd109, %r421, 2, %rd7; + cvt.s64.s32 %rd190, %r19; + add.s64 %rd191, %rd190, %rd185; + shl.b64 %rd192, %rd191, 1; + add.s64 %rd193, %rd7, %rd192; + add.s64 %rd111, %rd193, 16; + add.s64 %rd113, %rd193, 32; + add.s64 %rd115, %rd193, 48; + add.s64 %rd117, %rd193, 64; + add.s64 %rd119, %rd193, 80; + add.s64 %rd121, %rd193, 96; + add.s64 %rd123, %rd193, 112; + .loc 1 121 71 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:71 + // begin inline asm + mov.u64 %rd108, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd108, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs66, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs66 }, [ %rd109 + 0 ], %rd108; + // end inline asm + // begin inline asm + mov.u64 %rd110, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd110, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs67, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs67 }, [ %rd111 + 0 ], %rd110; + // end inline asm + // begin inline asm + mov.u64 %rd112, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd112, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs68, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs68 }, [ %rd113 + 0 ], %rd112; + // end inline asm + // begin inline asm + mov.u64 %rd114, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd114, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs69, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs69 }, [ %rd115 + 0 ], %rd114; + // end inline asm + // begin inline asm + mov.u64 %rd116, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd116, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs70, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs70 }, [ %rd117 + 0 ], %rd116; + // end inline asm + // begin inline asm + mov.u64 %rd118, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd118, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs71, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs71 }, [ %rd119 + 0 ], %rd118; + // end inline asm + // begin inline asm + mov.u64 %rd120, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd120, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs72, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs72 }, [ %rd121 + 0 ], %rd120; + // end inline asm + // begin inline asm + mov.u64 %rd122, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd122, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs73, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs73 }, [ %rd123 + 0 ], %rd122; + // end inline asm + .loc 1 121 132 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:132 + cvt.f32.bf16 %r422, %rs66; + cvt.f32.bf16 %r423, %rs67; + cvt.f32.bf16 %r424, %rs68; + cvt.f32.bf16 %r425, %rs69; + cvt.f32.bf16 %r426, %rs70; + cvt.f32.bf16 %r427, %rs71; + cvt.f32.bf16 %r428, %rs72; + cvt.f32.bf16 %r429, %rs73; + .loc 1 126 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24 + mul.f32 %r430, %r6, %r422; + mul.f32 %r431, %r6, %r423; + mul.f32 %r432, %r6, %r424; + mul.f32 %r433, %r6, %r425; + mul.f32 %r434, %r6, %r426; + mul.f32 %r435, %r6, %r427; + mul.f32 %r436, %r6, %r428; + mul.f32 %r437, %r6, %r429; + .loc 1 127 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:35 + add.s64 %rd157, %rd11, %rd189; + add.s64 %rd125, %rd157, 2; + add.s64 %rd127, %rd157, 18; + add.s64 %rd129, %rd157, 34; + add.s64 %rd131, %rd157, 50; + add.s64 %rd133, %rd157, 66; + add.s64 %rd135, %rd157, 82; + add.s64 %rd137, %rd157, 98; + add.s64 %rd139, %rd157, 114; + .loc 1 127 85 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:85 + // begin inline asm + mov.u64 %rd124, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd124, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs74, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs74 }, [ %rd125 + 0 ], %rd124; + // end inline asm + // begin inline asm + mov.u64 %rd126, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd126, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs75, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs75 }, [ %rd127 + 0 ], %rd126; + // end inline asm + // begin inline asm + mov.u64 %rd128, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd128, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs76, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs76 }, [ %rd129 + 0 ], %rd128; + // end inline asm + // begin inline asm + mov.u64 %rd130, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd130, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs77, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs77 }, [ %rd131 + 0 ], %rd130; + // end inline asm + // begin inline asm + mov.u64 %rd132, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd132, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs78, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs78 }, [ %rd133 + 0 ], %rd132; + // end inline asm + // begin inline asm + mov.u64 %rd134, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd134, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs79, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs79 }, [ %rd135 + 0 ], %rd134; + // end inline asm + // begin inline asm + mov.u64 %rd136, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd136, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs80, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs80 }, [ %rd137 + 0 ], %rd136; + // end inline asm + // begin inline asm + mov.u64 %rd138, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd138, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs81, %rs34; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs81 }, [ %rd139 + 0 ], %rd138; + // end inline asm + .loc 1 127 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:146 + cvt.f32.bf16 %r438, %rs74; + cvt.f32.bf16 %r439, %rs75; + cvt.f32.bf16 %r440, %rs76; + cvt.f32.bf16 %r441, %rs77; + cvt.f32.bf16 %r442, %rs78; + cvt.f32.bf16 %r443, %rs79; + cvt.f32.bf16 %r444, %rs80; + cvt.f32.bf16 %r445, %rs81; + .loc 1 131 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:131:17 + neg.f32 %r446, %r430; + fma.rn.f32 %r447, %r446, %r438, 0f00000000; + neg.f32 %r448, %r431; + fma.rn.f32 %r449, %r448, %r439, 0f00000000; + neg.f32 %r450, %r432; + fma.rn.f32 %r451, %r450, %r440, 0f00000000; + neg.f32 %r452, %r433; + fma.rn.f32 %r453, %r452, %r441, 0f00000000; + neg.f32 %r454, %r434; + fma.rn.f32 %r455, %r454, %r442, 0f00000000; + neg.f32 %r456, %r435; + fma.rn.f32 %r457, %r456, %r443, 0f00000000; + neg.f32 %r458, %r436; + fma.rn.f32 %r459, %r458, %r444, 0f00000000; + neg.f32 %r460, %r437; + fma.rn.f32 %r461, %r460, %r445, 0f00000000; + .loc 1 134 60 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:60 + or.b32 %r462, %r20, %r299; + .loc 1 134 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:35 + mad.wide.s32 %rd141, %r462, 2, %rd7; + cvt.s64.s32 %rd194, %r20; + add.s64 %rd195, %rd194, %rd185; + shl.b64 %rd196, %rd195, 1; + add.s64 %rd197, %rd7, %rd196; + add.s64 %rd143, %rd197, 16; + add.s64 %rd145, %rd197, 32; + add.s64 %rd147, %rd197, 48; + add.s64 %rd149, %rd197, 64; + add.s64 %rd151, %rd197, 80; + add.s64 %rd153, %rd197, 96; + add.s64 %rd155, %rd197, 112; + .loc 1 134 71 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:71 + // begin inline asm + mov.u64 %rd140, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd140, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs82, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs82 }, [ %rd141 + 0 ], %rd140; + // end inline asm + // begin inline asm + mov.u64 %rd142, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd142, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs83, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs83 }, [ %rd143 + 0 ], %rd142; + // end inline asm + // begin inline asm + mov.u64 %rd144, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd144, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs84, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs84 }, [ %rd145 + 0 ], %rd144; + // end inline asm + // begin inline asm + mov.u64 %rd146, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd146, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs85, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs85 }, [ %rd147 + 0 ], %rd146; + // end inline asm + // begin inline asm + mov.u64 %rd148, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd148, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs86, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs86 }, [ %rd149 + 0 ], %rd148; + // end inline asm + // begin inline asm + mov.u64 %rd150, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd150, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs87, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs87 }, [ %rd151 + 0 ], %rd150; + // end inline asm + // begin inline asm + mov.u64 %rd152, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd152, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs88, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs88 }, [ %rd153 + 0 ], %rd152; + // end inline asm + // begin inline asm + mov.u64 %rd154, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd154, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs89, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs89 }, [ %rd155 + 0 ], %rd154; + // end inline asm + .loc 1 134 132 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:132 + cvt.f32.bf16 %r463, %rs82; + cvt.f32.bf16 %r464, %rs83; + cvt.f32.bf16 %r465, %rs84; + cvt.f32.bf16 %r466, %rs85; + cvt.f32.bf16 %r467, %rs86; + cvt.f32.bf16 %r468, %rs87; + cvt.f32.bf16 %r469, %rs88; + cvt.f32.bf16 %r470, %rs89; + .loc 1 139 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:139:24 + mul.f32 %r471, %r6, %r463; + mul.f32 %r472, %r6, %r464; + mul.f32 %r473, %r6, %r465; + mul.f32 %r474, %r6, %r466; + mul.f32 %r475, %r6, %r467; + mul.f32 %r476, %r6, %r468; + mul.f32 %r477, %r6, %r469; + mul.f32 %r478, %r6, %r470; + .loc 1 140 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:35 + add.s64 %rd159, %rd157, 16; + add.s64 %rd161, %rd157, 32; + add.s64 %rd163, %rd157, 48; + add.s64 %rd165, %rd157, 64; + add.s64 %rd167, %rd157, 80; + add.s64 %rd169, %rd157, 96; + add.s64 %rd171, %rd157, 112; + .loc 1 140 81 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:81 + // begin inline asm + mov.u64 %rd156, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd156, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs90, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs90 }, [ %rd157 + 0 ], %rd156; + // end inline asm + // begin inline asm + mov.u64 %rd158, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd158, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs91, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs91 }, [ %rd159 + 0 ], %rd158; + // end inline asm + // begin inline asm + mov.u64 %rd160, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd160, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs92, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs92 }, [ %rd161 + 0 ], %rd160; + // end inline asm + // begin inline asm + mov.u64 %rd162, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd162, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs93, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs93 }, [ %rd163 + 0 ], %rd162; + // end inline asm + // begin inline asm + mov.u64 %rd164, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd164, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs94, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs94 }, [ %rd165 + 0 ], %rd164; + // end inline asm + // begin inline asm + mov.u64 %rd166, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd166, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs95, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs95 }, [ %rd167 + 0 ], %rd166; + // end inline asm + // begin inline asm + mov.u64 %rd168, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd168, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs96, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs96 }, [ %rd169 + 0 ], %rd168; + // end inline asm + // begin inline asm + mov.u64 %rd170, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd170, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs97, %rs34; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs97 }, [ %rd171 + 0 ], %rd170; + // end inline asm + .loc 1 140 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:142 + cvt.f32.bf16 %r479, %rs90; + cvt.f32.bf16 %r480, %rs91; + cvt.f32.bf16 %r481, %rs92; + cvt.f32.bf16 %r482, %rs93; + cvt.f32.bf16 %r483, %rs94; + cvt.f32.bf16 %r484, %rs95; + cvt.f32.bf16 %r485, %rs96; + cvt.f32.bf16 %r486, %rs97; + .loc 1 142 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:142:24 + mul.f32 %r487, %r471, %r479; + mul.f32 %r488, %r472, %r480; + mul.f32 %r489, %r473, %r481; + mul.f32 %r490, %r474, %r482; + mul.f32 %r491, %r475, %r483; + mul.f32 %r492, %r476, %r484; + mul.f32 %r493, %r477, %r485; + mul.f32 %r494, %r478, %r486; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r495, %r447, %r487, %p3; + selp.f32 %r496, %r449, %r488, %p3; + selp.f32 %r497, %r451, %r489, %p3; + selp.f32 %r498, %r453, %r490, %p3; + selp.f32 %r499, %r455, %r491, %p3; + selp.f32 %r500, %r457, %r492, %p3; + selp.f32 %r501, %r459, %r493, %p3; + selp.f32 %r502, %r461, %r494, %p3; + .loc 1 151 25 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:151:25 + mul.f32 %r503, %r5, %r283; + mul.f32 %r504, %r5, %r284; + mul.f32 %r505, %r5, %r285; + mul.f32 %r506, %r5, %r286; + mul.f32 %r507, %r5, %r287; + mul.f32 %r508, %r5, %r288; + mul.f32 %r509, %r5, %r289; + mul.f32 %r510, %r5, %r290; + .loc 1 153 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:153:26 + mul.f32 %r511, %r503, %r291; + mul.f32 %r512, %r504, %r292; + mul.f32 %r513, %r505, %r293; + mul.f32 %r514, %r506, %r294; + mul.f32 %r515, %r507, %r295; + mul.f32 %r516, %r508, %r296; + mul.f32 %r517, %r509, %r297; + mul.f32 %r518, %r510, %r298; + .loc 1 156 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26 + mul.f32 %r519, %r511, %r224; + mul.f32 %r520, %r512, %r225; + mul.f32 %r521, %r513, %r226; + mul.f32 %r522, %r514, %r227; + mul.f32 %r523, %r515, %r228; + mul.f32 %r524, %r516, %r229; + mul.f32 %r525, %r517, %r230; + mul.f32 %r526, %r518, %r231; + bar.sync 0; + st.shared.b32 [%r9], %r519; + st.shared.b32 [%r9+256], %r520; + st.shared.b32 [%r9+512], %r521; + st.shared.b32 [%r9+768], %r522; + st.shared.b32 [%r9+1024], %r523; + st.shared.b32 [%r9+1280], %r524; + st.shared.b32 [%r9+1536], %r525; + st.shared.b32 [%r9+1792], %r526; + bar.sync 0; + ld.shared.b32 %r527, [%r10]; + ld.shared.b32 %r528, [%r11]; + ld.shared.b32 %r529, [%r12]; + ld.shared.b32 %r530, [%r13]; + ld.shared.b32 %r531, [%r14]; + ld.shared.b32 %r532, [%r15]; + ld.shared.b32 %r533, [%r16]; + ld.shared.b32 %r534, [%r17]; + .loc 1 159 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:159:26 + fma.rn.f32 %r535, %r274, %r495, %r527; + fma.rn.f32 %r536, %r275, %r496, %r528; + fma.rn.f32 %r537, %r276, %r497, %r529; + fma.rn.f32 %r538, %r277, %r498, %r530; + fma.rn.f32 %r539, %r278, %r499, %r531; + fma.rn.f32 %r540, %r279, %r500, %r532; + fma.rn.f32 %r541, %r280, %r501, %r533; + fma.rn.f32 %r542, %r281, %r502, %r534; + .loc 1 161 39 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:39 + or.b64 %rd198, %rd174, %rd4; + .loc 1 161 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:32 + shl.b64 %rd199, %rd198, 1; + add.s64 %rd172, %rd5, %rd199; + .loc 1 161 55 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:55 + cvt.rn.bf16.f32 %rs130, %r413; + cvt.rn.bf16.f32 %rs131, %r414; + cvt.rn.bf16.f32 %rs132, %r415; + cvt.rn.bf16.f32 %rs133, %r416; + cvt.rn.bf16.f32 %rs134, %r417; + cvt.rn.bf16.f32 %rs135, %r418; + cvt.rn.bf16.f32 %rs136, %r419; + cvt.rn.bf16.f32 %rs137, %r420; + bar.sync 0; + st.shared.b16 [%r21], %rs130; + st.shared.b16 [%r22], %rs131; + st.shared.b16 [%r23], %rs132; + st.shared.b16 [%r24], %rs133; + st.shared.b16 [%r25], %rs134; + st.shared.b16 [%r26], %rs135; + st.shared.b16 [%r27], %rs136; + st.shared.b16 [%r28], %rs137; + bar.sync 0; + ld.shared.b32 %r248, [%r29]; + ld.shared.b32 %r249, [%r29+256]; + ld.shared.b32 %r250, [%r29+512]; + ld.shared.b32 %r251, [%r29+768]; + // begin inline asm + @%p2 st.global.v4.b32 [ %rd172 + 0 ], { %r248, %r249, %r250, %r251 }; + // end inline asm + .loc 1 162 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:32 + add.s64 %rd173, %rd6, %rd199; + .loc 1 162 56 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:56 + cvt.rn.bf16.f32 %rs138, %r535; + cvt.rn.bf16.f32 %rs139, %r536; + cvt.rn.bf16.f32 %rs140, %r537; + cvt.rn.bf16.f32 %rs141, %r538; + cvt.rn.bf16.f32 %rs142, %r539; + cvt.rn.bf16.f32 %rs143, %r540; + cvt.rn.bf16.f32 %rs144, %r541; + cvt.rn.bf16.f32 %rs145, %r542; + bar.sync 0; + st.shared.b16 [%r21], %rs138; + st.shared.b16 [%r22], %rs139; + st.shared.b16 [%r23], %rs140; + st.shared.b16 [%r24], %rs141; + st.shared.b16 [%r25], %rs142; + st.shared.b16 [%r26], %rs143; + st.shared.b16 [%r27], %rs144; + st.shared.b16 [%r28], %rs145; + bar.sync 0; + ld.shared.b32 %r252, [%r29]; + ld.shared.b32 %r253, [%r29+256]; + ld.shared.b32 %r254, [%r29+512]; + ld.shared.b32 %r255, [%r29+768]; + // begin inline asm + @%p2 st.global.v4.b32 [ %rd173 + 0 ], { %r252, %r253, %r254, %r255 }; + // end inline asm + mov.b64 %rd200, 64; + mov.pred %p5, 0; + .loc 1 53 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43 + @%p1 bra $L__BB0_1; +// %bb.2: + .loc 1 53 4 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:4 + ret; +$L__tmp16: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 456 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x1c1 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 98 +.b8 118 +.b8 113 +.b8 104 +.b8 106 +.b8 116 +.b8 121 +.b8 103 +.b8 55 +.b8 102 +.b8 118 +.b8 120 +.b8 122 +.b8 119 +.b8 116 +.b8 98 +.b8 116 +.b8 116 +.b8 52 +.b8 118 +.b8 114 +.b8 100 +.b8 107 +.b8 98 +.b8 110 +.b8 98 +.b8 54 +.b8 110 +.b8 51 +.b8 50 +.b8 102 +.b8 110 +.b8 114 +.b8 105 +.b8 106 +.b8 106 +.b8 112 +.b8 108 +.b8 51 +.b8 118 +.b8 118 +.b8 52 +.b8 99 +.b8 102 +.b8 113 +.b8 100 +.b8 52 +.b8 109 +.b8 122 +.b8 110 +.b8 114 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 98 +.b8 118 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x6d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 114 +.b8 109 +.b8 115 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 100 +.b8 100 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 110 +.b8 101 +.b8 103 +.b8 95 +.b8 115 +.b8 112 +.b8 108 +.b8 105 +.b8 116 +.b8 95 +.b8 115 +.b8 112 +.b8 108 +.b8 105 +.b8 116 +.b8 95 +.b8 119 +.b8 105 +.b8 116 +.b8 104 +.b8 95 +.b8 115 +.b8 105 +.b8 122 +.b8 101 +.b8 115 +.b8 95 +.b8 115 +.b8 116 +.b8 97 +.b8 99 +.b8 107 +.b8 95 +.b8 117 +.b8 110 +.b8 98 +.b8 105 +.b8 110 +.b8 100 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x151:0x7a DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x166:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 51 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x17e:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 4 // Abbrev [4] 0x198:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp8 // DW_AT_low_pc +.b64 $L__tmp15 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 52 // DW_AT_call_line +.b8 27 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x1b0:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp8 // DW_AT_low_pc +.b64 $L__tmp15 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source new file mode 100644 index 0000000000000000000000000000000000000000..96fc24d835acf2c6fce5ada7026bbe34b256d6ff --- /dev/null +++ b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source @@ -0,0 +1,972 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc213 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0) +#loc215 = loc(unknown) +#loc218 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0) +#loc222 = loc("in_out_ptr0"(#loc)) +#loc223 = loc("in_out_ptr1"(#loc)) +#loc224 = loc("in_ptr0"(#loc)) +#loc225 = loc("in_ptr1"(#loc)) +#loc226 = loc("in_ptr2"(#loc)) +#loc227 = loc("in_ptr3"(#loc)) +#loc228 = loc("in_ptr4"(#loc)) +#loc229 = loc("xnumel"(#loc)) +#loc230 = loc("r0_numel"(#loc)) +#loc432 = loc("input"(#loc213)) +#loc433 = loc("a"(#loc218)) +#loc434 = loc("b"(#loc218)) +module { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 73728 : i32 loc(#loc231) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc232) + %xoffset = tt.get_program_id x : i32 loc(#loc233) + %xoffset_2 = arith.constant 64 : i32 loc(#loc234) + %xoffset_3 = arith.constant 64 : i32 loc(#loc234) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc234) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc235) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc236) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc237) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc237) + %xmask = arith.constant true loc(#loc238) + %xmask_8 = arith.constant dense : tensor<64x64xi1> loc(#loc238) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc239) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc240) + %x0 = arith.constant 32 : i32 loc(#loc241) + %x0_10 = arith.constant 32 : i32 loc(#loc241) + %x0_11 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc241) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc241) + %x1 = arith.constant 32 : i32 loc(#loc242) + %x1_13 = arith.constant 32 : i32 loc(#loc242) + %x1_14 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc242) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc242) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc243) + %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc243) + %_tmp10 = arith.constant 0.000000e+00 : f32 loc(#loc244) + %_tmp10_17 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc244) + %c0_i32 = arith.constant 0 : i32 loc(#loc15) + %c64_i32 = arith.constant 64 : i32 loc(#loc15) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15) + %2 = arith.bitcast %c64_i32 : i32 to i32 loc(#loc15) + %3 = ub.poison : i32 loc(#loc15) + %_tmp10_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_16, %_tmp10_24 = %_tmp10_17) -> (tensor<64x64xf32>, tensor<64x64xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc246) + %r0_index_25 = arith.addi %r0_index, %r0_base_9 : tensor<1x64xi32> loc(#loc246) + %r0_mask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc247) + %r0_mask_26 = arith.cmpi slt, %r0_index_25, %r0_mask : tensor<1x64xi32> loc(#loc247) + %tmp0 = arith.constant 4096 : i32 loc(#loc248) + %tmp0_27 = arith.constant 4096 : i32 loc(#loc248) + %tmp0_28 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc248) + %tmp0_29 = arith.addi %tmp0_28, %r0_index_25 : tensor<1x64xi32> loc(#loc248) + %tmp0_30 = arith.constant 128 : i32 loc(#loc249) + %tmp0_31 = arith.constant 128 : i32 loc(#loc249) + %tmp0_32 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc249) + %tmp0_33 = arith.muli %tmp0_32, %x0_12 : tensor<64x1xi32> loc(#loc249) + %tmp0_34 = tt.broadcast %tmp0_29 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc250) + %tmp0_35 = tt.broadcast %tmp0_33 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc250) + %tmp0_36 = arith.addi %tmp0_34, %tmp0_35 : tensor<64x64xi32> loc(#loc250) + %tmp0_37 = arith.constant 36864 : i32 loc(#loc251) + %tmp0_38 = arith.constant 36864 : i32 loc(#loc251) + %tmp0_39 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc251) + %tmp0_40 = arith.muli %tmp0_39, %x1_15 : tensor<64x1xi32> loc(#loc251) + %tmp0_41 = tt.broadcast %tmp0_40 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc252) + %tmp0_42 = arith.addi %tmp0_36, %tmp0_41 : tensor<64x64xi32> loc(#loc252) + %tmp0_43 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc253) + %tmp0_44 = tt.addptr %tmp0_43, %tmp0_42 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc253) + %tmp0_45 = arith.constant 0.000000e+00 : f32 loc(#loc254) + %tmp0_46 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc254) + %tmp0_47 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc254) + %tmp0_48 = arith.truncf %tmp0_47 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc254) + %tmp0_49 = tt.load %tmp0_44, %tmp0_46, %tmp0_48 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc254) + %tmp0_50 = arith.extf %tmp0_49 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc255) + %tmp6 = arith.constant 128 : i32 loc(#loc256) + %tmp6_51 = arith.constant 128 : i32 loc(#loc256) + %tmp6_52 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc256) + %tmp6_53 = arith.muli %tmp6_52, %x0_12 : tensor<64x1xi32> loc(#loc256) + %tmp6_54 = tt.broadcast %r0_index_25 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc257) + %tmp6_55 = tt.broadcast %tmp6_53 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc257) + %tmp6_56 = arith.addi %tmp6_54, %tmp6_55 : tensor<64x64xi32> loc(#loc257) + %tmp6_57 = arith.constant 36864 : i32 loc(#loc258) + %tmp6_58 = arith.constant 36864 : i32 loc(#loc258) + %tmp6_59 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc258) + %tmp6_60 = arith.muli %tmp6_59, %x1_15 : tensor<64x1xi32> loc(#loc258) + %tmp6_61 = tt.broadcast %tmp6_60 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc259) + %tmp6_62 = arith.addi %tmp6_56, %tmp6_61 : tensor<64x64xi32> loc(#loc259) + %tmp6_63 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc260) + %tmp6_64 = tt.addptr %tmp6_63, %tmp6_62 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc260) + %tmp6_65 = arith.constant 0.000000e+00 : f32 loc(#loc261) + %tmp6_66 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc261) + %tmp6_67 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc261) + %tmp6_68 = arith.truncf %tmp6_67 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc261) + %tmp6_69 = tt.load %tmp6_64, %tmp6_66, %tmp6_68 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc261) + %tmp6_70 = arith.extf %tmp6_69 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc262) + %tmp2 = arith.mulf %tmp0_50, %tmp0_50 : tensor<64x64xf32> loc(#loc263) + %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<64x64xf32> loc(#loc264) + %_tmp4_71 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc265) + %_tmp4_72 = arith.select %_tmp4_71, %tmp5, %_tmp4_23 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc265) + %tmp8 = arith.mulf %tmp6_70, %tmp6_70 : tensor<64x64xf32> loc(#loc266) + %tmp11 = arith.addf %_tmp10_24, %tmp8 : tensor<64x64xf32> loc(#loc267) + %_tmp10_73 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc268) + %_tmp10_74 = arith.select %_tmp10_73, %tmp11, %_tmp10_24 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc268) + scf.yield %_tmp4_72, %_tmp10_74 : tensor<64x64xf32>, tensor<64x64xf32> loc(#loc39) + } loc(#loc435) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#0) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc269) + %tmp4_19 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc270) + %tmp10 = tt.call @"triton.language.standard.sum__fp32S64_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#1) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc271) + %tmp10_20 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc272) + %c0_i32_21 = arith.constant 0 : i32 loc(#loc44) + %c64_i32_22 = arith.constant 64 : i32 loc(#loc44) + %4 = arith.bitcast %c0_i32_21 : i32 to i32 loc(#loc44) + %5 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc44) + %6 = arith.bitcast %c64_i32_22 : i32 to i32 loc(#loc44) + %7 = ub.poison : i32 loc(#loc44) + scf.for %r0_offset = %4 to %5 step %6 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc273) + %r0_index_23 = arith.addi %r0_index, %r0_base_9 : tensor<1x64xi32> loc(#loc273) + %r0_mask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc274) + %r0_mask_24 = arith.cmpi slt, %r0_index_23, %r0_mask : tensor<1x64xi32> loc(#loc274) + %r0_3 = arith.constant 2 : i32 loc(#loc275) + %r0_3_25 = arith.constant 2 : i32 loc(#loc275) + %r0_3_26 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc275) + %r0_3_27 = arith.remsi %r0_index_23, %r0_3_26 : tensor<1x64xi32> loc(#loc275) + %r0_4 = arith.constant 2 : i32 loc(#loc276) + %r0_4_28 = arith.constant 2 : i32 loc(#loc276) + %r0_4_29 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc276) + %r0_4_30 = arith.divsi %r0_index_23, %r0_4_29 : tensor<1x64xi32> loc(#loc276) + %tmp50 = arith.constant 128 : i32 loc(#loc277) + %tmp50_31 = arith.constant 128 : i32 loc(#loc277) + %tmp50_32 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc277) + %tmp50_33 = arith.muli %tmp50_32, %x0_12 : tensor<64x1xi32> loc(#loc277) + %tmp50_34 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc278) + %tmp50_35 = tt.broadcast %tmp50_33 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc278) + %tmp50_36 = arith.addi %tmp50_34, %tmp50_35 : tensor<64x64xi32> loc(#loc278) + %tmp50_37 = arith.constant 36864 : i32 loc(#loc279) + %tmp50_38 = arith.constant 36864 : i32 loc(#loc279) + %tmp50_39 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc279) + %tmp50_40 = arith.muli %tmp50_39, %x1_15 : tensor<64x1xi32> loc(#loc279) + %tmp50_41 = tt.broadcast %tmp50_40 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc280) + %tmp50_42 = arith.addi %tmp50_36, %tmp50_41 : tensor<64x64xi32> loc(#loc280) + %tmp50_43 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc281) + %tmp50_44 = tt.addptr %tmp50_43, %tmp50_42 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc281) + %tmp50_45 = arith.constant 0.000000e+00 : f32 loc(#loc282) + %tmp50_46 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc282) + %tmp50_47 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc282) + %tmp50_48 = arith.truncf %tmp50_47 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc282) + %tmp50_49 = tt.load %tmp50_44, %tmp50_46, %tmp50_48 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc282) + %tmp50_50 = arith.extf %tmp50_49 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc283) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc284) + %tmp58_51 = tt.addptr %tmp58, %r0_index_23 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc284) + %tmp58_52 = arith.constant 0.000000e+00 : f32 loc(#loc285) + %tmp58_53 = arith.constant dense<0.000000e+00> : tensor<1x64xf32> loc(#loc285) + %tmp58_54 = arith.truncf %tmp58_53 : tensor<1x64xf32> to tensor<1x64xbf16> loc(#loc285) + %tmp58_55 = tt.load %tmp58_51, %r0_mask_24, %tmp58_54 evictionPolicy = evict_last : tensor<1x64x!tt.ptr> loc(#loc285) + %tmp58_56 = arith.extf %tmp58_55 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc286) + %tmp63 = arith.constant 128 : i32 loc(#loc287) + %tmp63_57 = arith.constant 128 : i32 loc(#loc287) + %tmp63_58 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc287) + %tmp63_59 = arith.muli %tmp63_58, %x1_15 : tensor<64x1xi32> loc(#loc287) + %tmp63_60 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc288) + %tmp63_61 = tt.broadcast %tmp63_59 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc288) + %tmp63_62 = arith.addi %tmp63_60, %tmp63_61 : tensor<64x64xi32> loc(#loc288) + %tmp63_63 = tt.splat %in_ptr2 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc289) + %tmp63_64 = tt.addptr %tmp63_63, %tmp63_62 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc289) + %tmp63_65 = arith.constant 0.000000e+00 : f32 loc(#loc290) + %tmp63_66 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc290) + %tmp63_67 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc290) + %tmp63_68 = tt.load %tmp63_64, %tmp63_66, %tmp63_67 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc290) + %tmp66 = arith.constant 128 : i32 loc(#loc291) + %tmp66_69 = arith.constant 128 : i32 loc(#loc291) + %tmp66_70 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc291) + %tmp66_71 = arith.muli %tmp66_70, %x1_15 : tensor<64x1xi32> loc(#loc291) + %tmp66_72 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc292) + %tmp66_73 = tt.broadcast %tmp66_71 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc292) + %tmp66_74 = arith.addi %tmp66_72, %tmp66_73 : tensor<64x64xi32> loc(#loc292) + %tmp66_75 = tt.splat %in_ptr3 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc293) + %tmp66_76 = tt.addptr %tmp66_75, %tmp66_74 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc293) + %tmp66_77 = arith.constant 0.000000e+00 : f32 loc(#loc294) + %tmp66_78 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc294) + %tmp66_79 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc294) + %tmp66_80 = tt.load %tmp66_76, %tmp66_78, %tmp66_79 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc294) + %tmp96 = arith.constant 4096 : i32 loc(#loc295) + %tmp96_81 = arith.constant 4096 : i32 loc(#loc295) + %tmp96_82 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc295) + %tmp96_83 = arith.addi %tmp96_82, %r0_index_23 : tensor<1x64xi32> loc(#loc295) + %tmp96_84 = arith.constant 128 : i32 loc(#loc296) + %tmp96_85 = arith.constant 128 : i32 loc(#loc296) + %tmp96_86 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc296) + %tmp96_87 = arith.muli %tmp96_86, %x0_12 : tensor<64x1xi32> loc(#loc296) + %tmp96_88 = tt.broadcast %tmp96_83 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc297) + %tmp96_89 = tt.broadcast %tmp96_87 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc297) + %tmp96_90 = arith.addi %tmp96_88, %tmp96_89 : tensor<64x64xi32> loc(#loc297) + %tmp96_91 = arith.constant 36864 : i32 loc(#loc298) + %tmp96_92 = arith.constant 36864 : i32 loc(#loc298) + %tmp96_93 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc298) + %tmp96_94 = arith.muli %tmp96_93, %x1_15 : tensor<64x1xi32> loc(#loc298) + %tmp96_95 = tt.broadcast %tmp96_94 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc299) + %tmp96_96 = arith.addi %tmp96_90, %tmp96_95 : tensor<64x64xi32> loc(#loc299) + %tmp96_97 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc300) + %tmp96_98 = tt.addptr %tmp96_97, %tmp96_96 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc300) + %tmp96_99 = arith.constant 0.000000e+00 : f32 loc(#loc301) + %tmp96_100 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc301) + %tmp96_101 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc301) + %tmp96_102 = arith.truncf %tmp96_101 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc301) + %tmp96_103 = tt.load %tmp96_98, %tmp96_100, %tmp96_102 evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc301) + %tmp96_104 = arith.extf %tmp96_103 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc302) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc303) + %tmp102_105 = tt.addptr %tmp102, %r0_index_23 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc303) + %tmp102_106 = arith.constant 0.000000e+00 : f32 loc(#loc304) + %tmp102_107 = arith.constant dense<0.000000e+00> : tensor<1x64xf32> loc(#loc304) + %tmp102_108 = arith.truncf %tmp102_107 : tensor<1x64xf32> to tensor<1x64xbf16> loc(#loc304) + %tmp102_109 = tt.load %tmp102_105, %r0_mask_24, %tmp102_108 evictionPolicy = evict_last : tensor<1x64x!tt.ptr> loc(#loc304) + %tmp102_110 = arith.extf %tmp102_109 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc305) + %tmp13 = arith.constant 0 : i64 loc(#loc306) + %tmp13_111 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc306) + %tmp14 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc307) + %tmp14_112 = arith.constant dense<0> : tensor<1x64xi64> loc(#loc307) + %tmp14_113 = arith.cmpi sge, %tmp14, %tmp14_112 : tensor<1x64xi64> loc(#loc307) + %tmp15 = arith.constant 1 : i64 loc(#loc308) + %tmp15_114 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc308) + %tmp16 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc309) + %tmp16_115 = arith.constant dense<1> : tensor<1x64xi64> loc(#loc309) + %tmp16_116 = arith.cmpi slt, %tmp16, %tmp16_115 : tensor<1x64xi64> loc(#loc309) + %tmp17 = arith.constant 2 : i32 loc(#loc310) + %tmp17_117 = arith.constant 2 : i32 loc(#loc310) + %tmp17_118 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc310) + %tmp17_119 = arith.muli %tmp17_118, %r0_4_30 : tensor<1x64xi32> loc(#loc310) + %tmp17_120 = arith.constant 1 : i32 loc(#loc311) + %tmp17_121 = arith.constant 1 : i32 loc(#loc311) + %tmp17_122 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc311) + %tmp17_123 = arith.addi %tmp17_122, %tmp17_119 : tensor<1x64xi32> loc(#loc311) + %tmp17_124 = arith.constant 128 : i32 loc(#loc312) + %tmp17_125 = arith.constant 128 : i32 loc(#loc312) + %tmp17_126 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc312) + %tmp17_127 = arith.muli %tmp17_126, %x0_12 : tensor<64x1xi32> loc(#loc312) + %tmp17_128 = tt.broadcast %tmp17_123 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc313) + %tmp17_129 = tt.broadcast %tmp17_127 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc313) + %tmp17_130 = arith.addi %tmp17_128, %tmp17_129 : tensor<64x64xi32> loc(#loc313) + %tmp17_131 = arith.constant 36864 : i32 loc(#loc314) + %tmp17_132 = arith.constant 36864 : i32 loc(#loc314) + %tmp17_133 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc314) + %tmp17_134 = arith.muli %tmp17_133, %x1_15 : tensor<64x1xi32> loc(#loc314) + %tmp17_135 = tt.broadcast %tmp17_134 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc315) + %tmp17_136 = arith.addi %tmp17_130, %tmp17_135 : tensor<64x64xi32> loc(#loc315) + %tmp17_137 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc316) + %tmp17_138 = tt.addptr %tmp17_137, %tmp17_136 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc316) + %tmp17_139 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc317) + %tmp17_140 = arith.constant 0.000000e+00 : f32 loc(#loc318) + %tmp17_141 = tt.broadcast %tmp17_139 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc318) + %tmp17_142 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc318) + %tmp17_143 = arith.truncf %tmp17_142 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc318) + %tmp17_144 = tt.load %tmp17_138, %tmp17_141, %tmp17_143 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc318) + %tmp17_145 = arith.extf %tmp17_144 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc319) + %tmp19 = arith.constant 1.280000e+02 : f32 loc(#loc320) + %tmp20 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc321) + %tmp20_146 = arith.divf %tmp10_20, %tmp20 : tensor<64x1xf32> loc(#loc321) + %tmp21 = arith.constant 9.99999997E-7 : f32 loc(#loc322) + %tmp22 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc323) + %tmp22_147 = arith.addf %tmp20_146, %tmp22 : tensor<64x1xf32> loc(#loc323) + %tmp23 = tt.extern_elementwise %tmp22_147 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc324) + %tmp24 = tt.broadcast %tmp23 : tensor<64x1xf32> -> tensor<64x64xf32> loc(#loc325) + %tmp24_148 = arith.mulf %tmp17_145, %tmp24 : tensor<64x64xf32> loc(#loc325) + %tmp25 = arith.constant 2 : i32 loc(#loc326) + %tmp25_149 = arith.constant 2 : i32 loc(#loc326) + %tmp25_150 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc326) + %tmp25_151 = arith.muli %tmp25_150, %r0_4_30 : tensor<1x64xi32> loc(#loc326) + %tmp25_152 = arith.constant 1 : i32 loc(#loc327) + %tmp25_153 = arith.constant 1 : i32 loc(#loc327) + %tmp25_154 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc327) + %tmp25_155 = arith.addi %tmp25_154, %tmp25_151 : tensor<1x64xi32> loc(#loc327) + %tmp25_156 = tt.broadcast %tmp25_155 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc328) + %tmp25_157 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc329) + %tmp25_158 = tt.addptr %tmp25_157, %tmp25_156 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc329) + %tmp25_159 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc330) + %tmp25_160 = arith.constant 0.000000e+00 : f32 loc(#loc331) + %tmp25_161 = tt.broadcast %tmp25_159 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc331) + %tmp25_162 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc331) + %tmp25_163 = arith.truncf %tmp25_162 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc331) + %tmp25_164 = tt.load %tmp25_158, %tmp25_161, %tmp25_163 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc331) + %tmp25_165 = arith.extf %tmp25_164 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc332) + %tmp27 = arith.mulf %tmp24_148, %tmp25_165 : tensor<64x64xf32> loc(#loc333) + %tmp29 = arith.constant 0.000000e+00 : f32 loc(#loc334) + %tmp29_166 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc334) + %tmp29_167 = arith.subf %tmp29_166, %tmp27 : tensor<64x64xf32> loc(#loc334) + %tmp30 = arith.constant 0.000000e+00 : f32 loc(#loc335) + %tmp30_168 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc335) + %tmp31 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc336) + %tmp31_169 = arith.select %tmp31, %tmp29_167, %tmp30_168 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc336) + %tmp32 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc337) + %tmp32_170 = arith.constant dense<1> : tensor<1x64xi64> loc(#loc337) + %tmp32_171 = arith.cmpi sge, %tmp32, %tmp32_170 : tensor<1x64xi64> loc(#loc337) + %tmp33 = arith.constant 2 : i64 loc(#loc338) + %tmp33_172 = arith.constant dense<2> : tensor<1x1xi64> loc(#loc338) + %tmp34 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc339) + %tmp34_173 = arith.constant dense<2> : tensor<1x64xi64> loc(#loc339) + %tmp34_174 = arith.cmpi slt, %tmp34, %tmp34_173 : tensor<1x64xi64> loc(#loc339) + %tmp35 = arith.constant 2 : i32 loc(#loc340) + %tmp35_175 = arith.constant 2 : i32 loc(#loc340) + %tmp35_176 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc340) + %tmp35_177 = arith.muli %tmp35_176, %r0_4_30 : tensor<1x64xi32> loc(#loc340) + %tmp35_178 = arith.constant 128 : i32 loc(#loc341) + %tmp35_179 = arith.constant 128 : i32 loc(#loc341) + %tmp35_180 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc341) + %tmp35_181 = arith.muli %tmp35_180, %x0_12 : tensor<64x1xi32> loc(#loc341) + %tmp35_182 = tt.broadcast %tmp35_177 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc342) + %tmp35_183 = tt.broadcast %tmp35_181 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc342) + %tmp35_184 = arith.addi %tmp35_182, %tmp35_183 : tensor<64x64xi32> loc(#loc342) + %tmp35_185 = arith.constant 36864 : i32 loc(#loc343) + %tmp35_186 = arith.constant 36864 : i32 loc(#loc343) + %tmp35_187 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc343) + %tmp35_188 = arith.muli %tmp35_187, %x1_15 : tensor<64x1xi32> loc(#loc343) + %tmp35_189 = tt.broadcast %tmp35_188 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc344) + %tmp35_190 = arith.addi %tmp35_184, %tmp35_189 : tensor<64x64xi32> loc(#loc344) + %tmp35_191 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc345) + %tmp35_192 = tt.addptr %tmp35_191, %tmp35_190 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc345) + %tmp35_193 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc346) + %tmp35_194 = arith.constant 0.000000e+00 : f32 loc(#loc347) + %tmp35_195 = tt.broadcast %tmp35_193 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc347) + %tmp35_196 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc347) + %tmp35_197 = arith.truncf %tmp35_196 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc347) + %tmp35_198 = tt.load %tmp35_192, %tmp35_195, %tmp35_197 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc347) + %tmp35_199 = arith.extf %tmp35_198 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc348) + %tmp37 = arith.constant 1.280000e+02 : f32 loc(#loc349) + %tmp38 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc350) + %tmp38_200 = arith.divf %tmp10_20, %tmp38 : tensor<64x1xf32> loc(#loc350) + %tmp39 = arith.constant 9.99999997E-7 : f32 loc(#loc351) + %tmp40 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc352) + %tmp40_201 = arith.addf %tmp38_200, %tmp40 : tensor<64x1xf32> loc(#loc352) + %tmp41 = tt.extern_elementwise %tmp40_201 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc353) + %tmp42 = tt.broadcast %tmp41 : tensor<64x1xf32> -> tensor<64x64xf32> loc(#loc354) + %tmp42_202 = arith.mulf %tmp35_199, %tmp42 : tensor<64x64xf32> loc(#loc354) + %tmp43 = arith.constant 2 : i32 loc(#loc355) + %tmp43_203 = arith.constant 2 : i32 loc(#loc355) + %tmp43_204 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc355) + %tmp43_205 = arith.muli %tmp43_204, %r0_4_30 : tensor<1x64xi32> loc(#loc355) + %tmp43_206 = tt.broadcast %tmp43_205 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc356) + %tmp43_207 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc357) + %tmp43_208 = tt.addptr %tmp43_207, %tmp43_206 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc357) + %tmp43_209 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc358) + %tmp43_210 = arith.constant 0.000000e+00 : f32 loc(#loc359) + %tmp43_211 = tt.broadcast %tmp43_209 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc359) + %tmp43_212 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc359) + %tmp43_213 = arith.truncf %tmp43_212 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc359) + %tmp43_214 = tt.load %tmp43_208, %tmp43_211, %tmp43_213 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc359) + %tmp43_215 = arith.extf %tmp43_214 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc360) + %tmp45 = arith.mulf %tmp42_202, %tmp43_215 : tensor<64x64xf32> loc(#loc361) + %tmp47 = arith.constant 0.000000e+00 : f32 loc(#loc362) + %tmp47_216 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc362) + %tmp48 = tt.broadcast %tmp32_171 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc363) + %tmp48_217 = arith.select %tmp48, %tmp45, %tmp47_216 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc363) + %tmp49 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc364) + %tmp49_218 = arith.select %tmp49, %tmp31_169, %tmp48_217 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc364) + %tmp52 = arith.constant 1.280000e+02 : f32 loc(#loc365) + %tmp53 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc366) + %tmp53_219 = arith.divf %tmp10_20, %tmp53 : tensor<64x1xf32> loc(#loc366) + %tmp54 = arith.constant 9.99999997E-7 : f32 loc(#loc367) + %tmp55 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc368) + %tmp55_220 = arith.addf %tmp53_219, %tmp55 : tensor<64x1xf32> loc(#loc368) + %tmp56 = tt.extern_elementwise %tmp55_220 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc369) + %tmp57 = tt.broadcast %tmp56 : tensor<64x1xf32> -> tensor<64x64xf32> loc(#loc370) + %tmp57_221 = arith.mulf %tmp50_50, %tmp57 : tensor<64x64xf32> loc(#loc370) + %tmp60 = tt.broadcast %tmp58_56 : tensor<1x64xf32> -> tensor<64x64xf32> loc(#loc371) + %tmp60_222 = arith.mulf %tmp57_221, %tmp60 : tensor<64x64xf32> loc(#loc371) + %tmp64 = arith.mulf %tmp60_222, %tmp63_68 : tensor<64x64xf32> loc(#loc372) + %tmp67 = arith.mulf %tmp49_218, %tmp66_80 : tensor<64x64xf32> loc(#loc373) + %tmp68 = arith.addf %tmp64, %tmp67 : tensor<64x64xf32> loc(#loc374) + %tmp70 = arith.constant 2 : i32 loc(#loc375) + %tmp70_223 = arith.constant 2 : i32 loc(#loc375) + %tmp70_224 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc375) + %tmp70_225 = arith.muli %tmp70_224, %r0_4_30 : tensor<1x64xi32> loc(#loc375) + %tmp70_226 = arith.constant 4097 : i32 loc(#loc376) + %tmp70_227 = arith.constant 4097 : i32 loc(#loc376) + %tmp70_228 = arith.constant dense<4097> : tensor<1x64xi32> loc(#loc376) + %tmp70_229 = arith.addi %tmp70_228, %tmp70_225 : tensor<1x64xi32> loc(#loc376) + %tmp70_230 = arith.constant 128 : i32 loc(#loc377) + %tmp70_231 = arith.constant 128 : i32 loc(#loc377) + %tmp70_232 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc377) + %tmp70_233 = arith.muli %tmp70_232, %x0_12 : tensor<64x1xi32> loc(#loc377) + %tmp70_234 = tt.broadcast %tmp70_229 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc378) + %tmp70_235 = tt.broadcast %tmp70_233 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc378) + %tmp70_236 = arith.addi %tmp70_234, %tmp70_235 : tensor<64x64xi32> loc(#loc378) + %tmp70_237 = arith.constant 36864 : i32 loc(#loc379) + %tmp70_238 = arith.constant 36864 : i32 loc(#loc379) + %tmp70_239 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc379) + %tmp70_240 = arith.muli %tmp70_239, %x1_15 : tensor<64x1xi32> loc(#loc379) + %tmp70_241 = tt.broadcast %tmp70_240 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc380) + %tmp70_242 = arith.addi %tmp70_236, %tmp70_241 : tensor<64x64xi32> loc(#loc380) + %tmp70_243 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc381) + %tmp70_244 = tt.addptr %tmp70_243, %tmp70_242 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc381) + %tmp70_245 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc382) + %tmp70_246 = arith.constant 0.000000e+00 : f32 loc(#loc383) + %tmp70_247 = tt.broadcast %tmp70_245 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc383) + %tmp70_248 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc383) + %tmp70_249 = arith.truncf %tmp70_248 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc383) + %tmp70_250 = tt.load %tmp70_244, %tmp70_247, %tmp70_249 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc383) + %tmp70_251 = arith.extf %tmp70_250 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc384) + %tmp72 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc385) + %tmp72_252 = arith.divf %tmp4_19, %tmp72 : tensor<64x1xf32> loc(#loc385) + %tmp73 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc386) + %tmp73_253 = arith.addf %tmp72_252, %tmp73 : tensor<64x1xf32> loc(#loc386) + %tmp74 = tt.extern_elementwise %tmp73_253 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc387) + %tmp75 = tt.broadcast %tmp74 : tensor<64x1xf32> -> tensor<64x64xf32> loc(#loc388) + %tmp75_254 = arith.mulf %tmp70_251, %tmp75 : tensor<64x64xf32> loc(#loc388) + %tmp76 = arith.constant 2 : i32 loc(#loc389) + %tmp76_255 = arith.constant 2 : i32 loc(#loc389) + %tmp76_256 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc389) + %tmp76_257 = arith.muli %tmp76_256, %r0_4_30 : tensor<1x64xi32> loc(#loc389) + %tmp76_258 = arith.constant 1 : i32 loc(#loc390) + %tmp76_259 = arith.constant 1 : i32 loc(#loc390) + %tmp76_260 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc390) + %tmp76_261 = arith.addi %tmp76_260, %tmp76_257 : tensor<1x64xi32> loc(#loc390) + %tmp76_262 = tt.broadcast %tmp76_261 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc391) + %tmp76_263 = tt.splat %in_ptr4 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc392) + %tmp76_264 = tt.addptr %tmp76_263, %tmp76_262 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc392) + %tmp76_265 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc393) + %tmp76_266 = arith.constant 0.000000e+00 : f32 loc(#loc394) + %tmp76_267 = tt.broadcast %tmp76_265 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc394) + %tmp76_268 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc394) + %tmp76_269 = arith.truncf %tmp76_268 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc394) + %tmp76_270 = tt.load %tmp76_264, %tmp76_267, %tmp76_269 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc394) + %tmp76_271 = arith.extf %tmp76_270 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc395) + %tmp78 = arith.mulf %tmp75_254, %tmp76_271 : tensor<64x64xf32> loc(#loc396) + %tmp80 = arith.constant 0.000000e+00 : f32 loc(#loc397) + %tmp80_272 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc397) + %tmp80_273 = arith.subf %tmp80_272, %tmp78 : tensor<64x64xf32> loc(#loc397) + %tmp81 = arith.constant 0.000000e+00 : f32 loc(#loc398) + %tmp81_274 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc398) + %tmp82 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc399) + %tmp82_275 = arith.select %tmp82, %tmp80_273, %tmp81_274 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc399) + %tmp83 = arith.constant 2 : i32 loc(#loc400) + %tmp83_276 = arith.constant 2 : i32 loc(#loc400) + %tmp83_277 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc400) + %tmp83_278 = arith.muli %tmp83_277, %r0_4_30 : tensor<1x64xi32> loc(#loc400) + %tmp83_279 = arith.constant 4096 : i32 loc(#loc401) + %tmp83_280 = arith.constant 4096 : i32 loc(#loc401) + %tmp83_281 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc401) + %tmp83_282 = arith.addi %tmp83_281, %tmp83_278 : tensor<1x64xi32> loc(#loc401) + %tmp83_283 = arith.constant 128 : i32 loc(#loc402) + %tmp83_284 = arith.constant 128 : i32 loc(#loc402) + %tmp83_285 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc402) + %tmp83_286 = arith.muli %tmp83_285, %x0_12 : tensor<64x1xi32> loc(#loc402) + %tmp83_287 = tt.broadcast %tmp83_282 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc403) + %tmp83_288 = tt.broadcast %tmp83_286 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc403) + %tmp83_289 = arith.addi %tmp83_287, %tmp83_288 : tensor<64x64xi32> loc(#loc403) + %tmp83_290 = arith.constant 36864 : i32 loc(#loc404) + %tmp83_291 = arith.constant 36864 : i32 loc(#loc404) + %tmp83_292 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc404) + %tmp83_293 = arith.muli %tmp83_292, %x1_15 : tensor<64x1xi32> loc(#loc404) + %tmp83_294 = tt.broadcast %tmp83_293 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc405) + %tmp83_295 = arith.addi %tmp83_289, %tmp83_294 : tensor<64x64xi32> loc(#loc405) + %tmp83_296 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc406) + %tmp83_297 = tt.addptr %tmp83_296, %tmp83_295 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc406) + %tmp83_298 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc407) + %tmp83_299 = arith.constant 0.000000e+00 : f32 loc(#loc408) + %tmp83_300 = tt.broadcast %tmp83_298 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc408) + %tmp83_301 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc408) + %tmp83_302 = arith.truncf %tmp83_301 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc408) + %tmp83_303 = tt.load %tmp83_297, %tmp83_300, %tmp83_302 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc408) + %tmp83_304 = arith.extf %tmp83_303 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc409) + %tmp85 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc410) + %tmp85_305 = arith.divf %tmp4_19, %tmp85 : tensor<64x1xf32> loc(#loc410) + %tmp86 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc411) + %tmp86_306 = arith.addf %tmp85_305, %tmp86 : tensor<64x1xf32> loc(#loc411) + %tmp87 = tt.extern_elementwise %tmp86_306 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc412) + %tmp88 = tt.broadcast %tmp87 : tensor<64x1xf32> -> tensor<64x64xf32> loc(#loc413) + %tmp88_307 = arith.mulf %tmp83_304, %tmp88 : tensor<64x64xf32> loc(#loc413) + %tmp89 = arith.constant 2 : i32 loc(#loc414) + %tmp89_308 = arith.constant 2 : i32 loc(#loc414) + %tmp89_309 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc414) + %tmp89_310 = arith.muli %tmp89_309, %r0_4_30 : tensor<1x64xi32> loc(#loc414) + %tmp89_311 = tt.broadcast %tmp89_310 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc415) + %tmp89_312 = tt.splat %in_ptr4 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc416) + %tmp89_313 = tt.addptr %tmp89_312, %tmp89_311 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc416) + %tmp89_314 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc417) + %tmp89_315 = arith.constant 0.000000e+00 : f32 loc(#loc418) + %tmp89_316 = tt.broadcast %tmp89_314 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc418) + %tmp89_317 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc418) + %tmp89_318 = arith.truncf %tmp89_317 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc418) + %tmp89_319 = tt.load %tmp89_313, %tmp89_316, %tmp89_318 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc418) + %tmp89_320 = arith.extf %tmp89_319 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc419) + %tmp91 = arith.mulf %tmp88_307, %tmp89_320 : tensor<64x64xf32> loc(#loc420) + %tmp93 = arith.constant 0.000000e+00 : f32 loc(#loc421) + %tmp93_321 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc421) + %tmp94 = tt.broadcast %tmp32_171 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc422) + %tmp94_322 = arith.select %tmp94, %tmp91, %tmp93_321 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc422) + %tmp95 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc423) + %tmp95_323 = arith.select %tmp95, %tmp82_275, %tmp94_322 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc423) + %tmp98 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc424) + %tmp98_324 = arith.divf %tmp4_19, %tmp98 : tensor<64x1xf32> loc(#loc424) + %tmp99 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc425) + %tmp99_325 = arith.addf %tmp98_324, %tmp99 : tensor<64x1xf32> loc(#loc425) + %tmp100 = tt.extern_elementwise %tmp99_325 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc426) + %tmp101 = tt.broadcast %tmp100 : tensor<64x1xf32> -> tensor<64x64xf32> loc(#loc427) + %tmp101_326 = arith.mulf %tmp96_104, %tmp101 : tensor<64x64xf32> loc(#loc427) + %tmp104 = tt.broadcast %tmp102_110 : tensor<1x64xf32> -> tensor<64x64xf32> loc(#loc428) + %tmp104_327 = arith.mulf %tmp101_326, %tmp104 : tensor<64x64xf32> loc(#loc428) + %tmp107 = arith.mulf %tmp104_327, %tmp63_68 : tensor<64x64xf32> loc(#loc429) + %tmp109 = arith.mulf %tmp95_323, %tmp66_80 : tensor<64x64xf32> loc(#loc430) + %tmp110 = arith.addf %tmp107, %tmp109 : tensor<64x64xf32> loc(#loc431) + %c128_i32 = arith.constant 128 : i32 loc(#loc204) + %c128_i32_328 = arith.constant 128 : i32 loc(#loc204) + %cst = arith.constant dense<128> : tensor<64x1xi32> loc(#loc204) + %8 = arith.muli %cst, %xindex_7 : tensor<64x1xi32> loc(#loc204) + %9 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc205) + %10 = tt.broadcast %8 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc205) + %11 = arith.addi %9, %10 : tensor<64x64xi32> loc(#loc205) + %12 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc206) + %13 = tt.addptr %12, %11 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc206) + %14 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc207) + %15 = arith.truncf %tmp68 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc207) + tt.store %13, %15, %14 : tensor<64x64x!tt.ptr> loc(#loc207) + %c128_i32_329 = arith.constant 128 : i32 loc(#loc208) + %c128_i32_330 = arith.constant 128 : i32 loc(#loc208) + %cst_331 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc208) + %16 = arith.muli %cst_331, %xindex_7 : tensor<64x1xi32> loc(#loc208) + %17 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc209) + %18 = tt.broadcast %16 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc209) + %19 = arith.addi %17, %18 : tensor<64x64xi32> loc(#loc209) + %20 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc210) + %21 = tt.addptr %20, %19 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc210) + %22 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc211) + %23 = arith.truncf %tmp110 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc211) + tt.store %21, %23, %22 : tensor<64x64x!tt.ptr> loc(#loc211) + } loc(#loc44) + tt.return loc(#loc212) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S64_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x64xf32> loc("input"(#loc213))) -> tensor<64xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc214) + tt.reduce.return %2 : f32 loc(#loc214) + }) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc214) + tt.return %0 : tensor<64xf32> loc(#loc216) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64xf32> loc(#loc217) + tt.return %1 : tensor<64xf32> loc(#loc217) + } loc(#loc213) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc218)), %b: f32 loc("b"(#loc218))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc219) + tt.return %0 : f32 loc(#loc220) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc221) + tt.return %1 : f32 loc(#loc221) + } loc(#loc218) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":25:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":30:43) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":32:44) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:45) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:56) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:46) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:42) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:53) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:64) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":68:35) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":69:25) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":70:35) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:52) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:63) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":74:16) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":76:16) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:57) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:55) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:63) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:95) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":85:42) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":88:35) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":89:24) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:37) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:48) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:59) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":92:16) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":93:25) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":94:16) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":95:24) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":96:32) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:53) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:59) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:91) +#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":102:42) +#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":106:16) +#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":107:25) +#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":108:16) +#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":109:24) +#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":110:32) +#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:44) +#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc149 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:55) +#loc150 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc151 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:66) +#loc152 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc153 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc154 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:81) +#loc155 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc156 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc157 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc158 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc159 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc160 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc161 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:57) +#loc162 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:55) +#loc163 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:63) +#loc164 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc165 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:95) +#loc166 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc167 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc168 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc169 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc170 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":132:42) +#loc171 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc172 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:44) +#loc173 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc174 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:55) +#loc175 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc176 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:66) +#loc177 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc178 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc179 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:81) +#loc180 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc181 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc182 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":136:24) +#loc183 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":137:24) +#loc184 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":138:32) +#loc185 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc186 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:53) +#loc187 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:59) +#loc188 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc189 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:91) +#loc190 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc191 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc192 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc193 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":144:42) +#loc194 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc195 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc196 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":148:24) +#loc197 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":149:24) +#loc198 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":150:33) +#loc199 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc200 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc201 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc202 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc203 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc204 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc205 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc206 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc207 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc208 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:43) +#loc209 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:39) +#loc210 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc211 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc212 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc214 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc216 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11) +#loc217 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4) +#loc219 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc220 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11) +#loc221 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4) +#loc231 = loc("xnumel"(#loc1)) +#loc232 = loc("r0_numel"(#loc2)) +#loc233 = loc("xoffset"(#loc3)) +#loc234 = loc("xoffset"(#loc4)) +#loc235 = loc("xindex"(#loc5)) +#loc236 = loc("xindex"(#loc6)) +#loc237 = loc("xindex"(#loc7)) +#loc238 = loc("xmask"(#loc8)) +#loc239 = loc("r0_base"(#loc9)) +#loc240 = loc("r0_base"(#loc10)) +#loc241 = loc("x0"(#loc11)) +#loc242 = loc("x1"(#loc12)) +#loc243 = loc("_tmp4"(#loc13)) +#loc244 = loc("_tmp10"(#loc14)) +#loc245 = loc("_tmp4"(#loc15)) +#loc246 = loc("r0_index"(#loc16)) +#loc247 = loc("r0_mask"(#loc17)) +#loc248 = loc("tmp0"(#loc18)) +#loc249 = loc("tmp0"(#loc19)) +#loc250 = loc("tmp0"(#loc20)) +#loc251 = loc("tmp0"(#loc21)) +#loc252 = loc("tmp0"(#loc22)) +#loc253 = loc("tmp0"(#loc23)) +#loc254 = loc("tmp0"(#loc24)) +#loc255 = loc("tmp0"(#loc25)) +#loc256 = loc("tmp6"(#loc26)) +#loc257 = loc("tmp6"(#loc27)) +#loc258 = loc("tmp6"(#loc28)) +#loc259 = loc("tmp6"(#loc29)) +#loc260 = loc("tmp6"(#loc30)) +#loc261 = loc("tmp6"(#loc31)) +#loc262 = loc("tmp6"(#loc32)) +#loc263 = loc("tmp2"(#loc33)) +#loc264 = loc("tmp5"(#loc34)) +#loc265 = loc("_tmp4"(#loc35)) +#loc266 = loc("tmp8"(#loc36)) +#loc267 = loc("tmp11"(#loc37)) +#loc268 = loc("_tmp10"(#loc38)) +#loc269 = loc("tmp4"(#loc40)) +#loc270 = loc("tmp4"(#loc41)) +#loc271 = loc("tmp10"(#loc42)) +#loc272 = loc("tmp10"(#loc43)) +#loc273 = loc("r0_index"(#loc45)) +#loc274 = loc("r0_mask"(#loc46)) +#loc275 = loc("r0_3"(#loc47)) +#loc276 = loc("r0_4"(#loc48)) +#loc277 = loc("tmp50"(#loc49)) +#loc278 = loc("tmp50"(#loc50)) +#loc279 = loc("tmp50"(#loc51)) +#loc280 = loc("tmp50"(#loc52)) +#loc281 = loc("tmp50"(#loc53)) +#loc282 = loc("tmp50"(#loc54)) +#loc283 = loc("tmp50"(#loc55)) +#loc284 = loc("tmp58"(#loc56)) +#loc285 = loc("tmp58"(#loc57)) +#loc286 = loc("tmp58"(#loc58)) +#loc287 = loc("tmp63"(#loc59)) +#loc288 = loc("tmp63"(#loc60)) +#loc289 = loc("tmp63"(#loc61)) +#loc290 = loc("tmp63"(#loc62)) +#loc291 = loc("tmp66"(#loc63)) +#loc292 = loc("tmp66"(#loc64)) +#loc293 = loc("tmp66"(#loc65)) +#loc294 = loc("tmp66"(#loc66)) +#loc295 = loc("tmp96"(#loc67)) +#loc296 = loc("tmp96"(#loc68)) +#loc297 = loc("tmp96"(#loc69)) +#loc298 = loc("tmp96"(#loc70)) +#loc299 = loc("tmp96"(#loc71)) +#loc300 = loc("tmp96"(#loc72)) +#loc301 = loc("tmp96"(#loc73)) +#loc302 = loc("tmp96"(#loc74)) +#loc303 = loc("tmp102"(#loc75)) +#loc304 = loc("tmp102"(#loc76)) +#loc305 = loc("tmp102"(#loc77)) +#loc306 = loc("tmp13"(#loc78)) +#loc307 = loc("tmp14"(#loc79)) +#loc308 = loc("tmp15"(#loc80)) +#loc309 = loc("tmp16"(#loc81)) +#loc310 = loc("tmp17"(#loc82)) +#loc311 = loc("tmp17"(#loc83)) +#loc312 = loc("tmp17"(#loc84)) +#loc313 = loc("tmp17"(#loc85)) +#loc314 = loc("tmp17"(#loc86)) +#loc315 = loc("tmp17"(#loc87)) +#loc316 = loc("tmp17"(#loc88)) +#loc317 = loc("tmp17"(#loc89)) +#loc318 = loc("tmp17"(#loc90)) +#loc319 = loc("tmp17"(#loc91)) +#loc320 = loc("tmp19"(#loc92)) +#loc321 = loc("tmp20"(#loc93)) +#loc322 = loc("tmp21"(#loc94)) +#loc323 = loc("tmp22"(#loc95)) +#loc324 = loc("tmp23"(#loc96)) +#loc325 = loc("tmp24"(#loc97)) +#loc326 = loc("tmp25"(#loc98)) +#loc327 = loc("tmp25"(#loc99)) +#loc328 = loc("tmp25"(#loc100)) +#loc329 = loc("tmp25"(#loc101)) +#loc330 = loc("tmp25"(#loc102)) +#loc331 = loc("tmp25"(#loc103)) +#loc332 = loc("tmp25"(#loc104)) +#loc333 = loc("tmp27"(#loc105)) +#loc334 = loc("tmp29"(#loc106)) +#loc335 = loc("tmp30"(#loc107)) +#loc336 = loc("tmp31"(#loc108)) +#loc337 = loc("tmp32"(#loc109)) +#loc338 = loc("tmp33"(#loc110)) +#loc339 = loc("tmp34"(#loc111)) +#loc340 = loc("tmp35"(#loc112)) +#loc341 = loc("tmp35"(#loc113)) +#loc342 = loc("tmp35"(#loc114)) +#loc343 = loc("tmp35"(#loc115)) +#loc344 = loc("tmp35"(#loc116)) +#loc345 = loc("tmp35"(#loc117)) +#loc346 = loc("tmp35"(#loc118)) +#loc347 = loc("tmp35"(#loc119)) +#loc348 = loc("tmp35"(#loc120)) +#loc349 = loc("tmp37"(#loc121)) +#loc350 = loc("tmp38"(#loc122)) +#loc351 = loc("tmp39"(#loc123)) +#loc352 = loc("tmp40"(#loc124)) +#loc353 = loc("tmp41"(#loc125)) +#loc354 = loc("tmp42"(#loc126)) +#loc355 = loc("tmp43"(#loc127)) +#loc356 = loc("tmp43"(#loc128)) +#loc357 = loc("tmp43"(#loc129)) +#loc358 = loc("tmp43"(#loc130)) +#loc359 = loc("tmp43"(#loc131)) +#loc360 = loc("tmp43"(#loc132)) +#loc361 = loc("tmp45"(#loc133)) +#loc362 = loc("tmp47"(#loc134)) +#loc363 = loc("tmp48"(#loc135)) +#loc364 = loc("tmp49"(#loc136)) +#loc365 = loc("tmp52"(#loc137)) +#loc366 = loc("tmp53"(#loc138)) +#loc367 = loc("tmp54"(#loc139)) +#loc368 = loc("tmp55"(#loc140)) +#loc369 = loc("tmp56"(#loc141)) +#loc370 = loc("tmp57"(#loc142)) +#loc371 = loc("tmp60"(#loc143)) +#loc372 = loc("tmp64"(#loc144)) +#loc373 = loc("tmp67"(#loc145)) +#loc374 = loc("tmp68"(#loc146)) +#loc375 = loc("tmp70"(#loc147)) +#loc376 = loc("tmp70"(#loc148)) +#loc377 = loc("tmp70"(#loc149)) +#loc378 = loc("tmp70"(#loc150)) +#loc379 = loc("tmp70"(#loc151)) +#loc380 = loc("tmp70"(#loc152)) +#loc381 = loc("tmp70"(#loc153)) +#loc382 = loc("tmp70"(#loc154)) +#loc383 = loc("tmp70"(#loc155)) +#loc384 = loc("tmp70"(#loc156)) +#loc385 = loc("tmp72"(#loc157)) +#loc386 = loc("tmp73"(#loc158)) +#loc387 = loc("tmp74"(#loc159)) +#loc388 = loc("tmp75"(#loc160)) +#loc389 = loc("tmp76"(#loc161)) +#loc390 = loc("tmp76"(#loc162)) +#loc391 = loc("tmp76"(#loc163)) +#loc392 = loc("tmp76"(#loc164)) +#loc393 = loc("tmp76"(#loc165)) +#loc394 = loc("tmp76"(#loc166)) +#loc395 = loc("tmp76"(#loc167)) +#loc396 = loc("tmp78"(#loc168)) +#loc397 = loc("tmp80"(#loc169)) +#loc398 = loc("tmp81"(#loc170)) +#loc399 = loc("tmp82"(#loc171)) +#loc400 = loc("tmp83"(#loc172)) +#loc401 = loc("tmp83"(#loc173)) +#loc402 = loc("tmp83"(#loc174)) +#loc403 = loc("tmp83"(#loc175)) +#loc404 = loc("tmp83"(#loc176)) +#loc405 = loc("tmp83"(#loc177)) +#loc406 = loc("tmp83"(#loc178)) +#loc407 = loc("tmp83"(#loc179)) +#loc408 = loc("tmp83"(#loc180)) +#loc409 = loc("tmp83"(#loc181)) +#loc410 = loc("tmp85"(#loc182)) +#loc411 = loc("tmp86"(#loc183)) +#loc412 = loc("tmp87"(#loc184)) +#loc413 = loc("tmp88"(#loc185)) +#loc414 = loc("tmp89"(#loc186)) +#loc415 = loc("tmp89"(#loc187)) +#loc416 = loc("tmp89"(#loc188)) +#loc417 = loc("tmp89"(#loc189)) +#loc418 = loc("tmp89"(#loc190)) +#loc419 = loc("tmp89"(#loc191)) +#loc420 = loc("tmp91"(#loc192)) +#loc421 = loc("tmp93"(#loc193)) +#loc422 = loc("tmp94"(#loc194)) +#loc423 = loc("tmp95"(#loc195)) +#loc424 = loc("tmp98"(#loc196)) +#loc425 = loc("tmp99"(#loc197)) +#loc426 = loc("tmp100"(#loc198)) +#loc427 = loc("tmp101"(#loc199)) +#loc428 = loc("tmp104"(#loc200)) +#loc429 = loc("tmp107"(#loc201)) +#loc430 = loc("tmp109"(#loc202)) +#loc431 = loc("tmp110"(#loc203)) +#loc435 = loc("_tmp10"(#loc245)) diff --git a/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..79669ec9185bdb64f7fa1a687f3989a52164dbbc --- /dev/null +++ b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir @@ -0,0 +1,546 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 8], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [16, 1], order = [1, 0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc1 = loc(unknown) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc147 = loc("in_out_ptr0"(#loc)) +#loc148 = loc("in_out_ptr1"(#loc)) +#loc149 = loc("in_ptr0"(#loc)) +#loc150 = loc("in_ptr1"(#loc)) +#loc151 = loc("in_ptr2"(#loc)) +#loc152 = loc("in_ptr3"(#loc)) +#loc153 = loc("in_ptr4"(#loc)) +#loc154 = loc("xnumel"(#loc)) +#loc155 = loc("r0_numel"(#loc)) +#loc185 = loc("tmp4"(#loc33)) +#loc187 = loc("tmp10"(#loc36)) +#loc292 = loc(callsite(#loc1 at #loc185)) +#loc294 = loc(callsite(#loc1 at #loc187)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4097> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x64xbf16, #blocked1> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<1> : tensor<1x64xi64, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<2> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<36864> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<36864> : tensor<64x1xi32, #blocked1> loc(#loc1) + %cst_6 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<128> : tensor<64x1xi32, #blocked1> loc(#loc1) + %cst_8 = arith.constant dense<4096> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_9 = arith.constant dense<4096> : tensor<1x64xi32, #blocked1> loc(#loc1) + %cst_10 = arith.constant dense<128> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_11 = arith.constant dense<128> : tensor<1x64xi32, #blocked1> loc(#loc1) + %cst_12 = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_13 = arith.constant dense<32> : tensor<64x1xi32, #blocked1> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %cst_14 = arith.constant dense<0.000000e+00> : tensor<64x64xbf16, #blocked1> loc(#loc1) + %cst_15 = arith.constant dense<0.000000e+00> : tensor<64x64xbf16, #blocked> loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_16 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32, #blocked1> loc(#loc1) + %cst_17 = arith.constant dense<1.280000e+02> : tensor<64x1xf32, #blocked1> loc(#loc1) + %cst_18 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked> loc(#loc1) + %cst_19 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked1> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc156) + %xoffset_20 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc157) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc158) + %xindex_21 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc158) + %xindex_22 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc158) + %xindex_23 = tt.expand_dims %xindex_21 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc158) + %xindex_24 = tt.splat %xoffset_20 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc159) + %xindex_25 = tt.splat %xoffset_20 : i32 -> tensor<64x1xi32, #blocked> loc(#loc159) + %xindex_26 = arith.addi %xindex_24, %xindex_22 : tensor<64x1xi32, #blocked1> loc(#loc159) + %xindex_27 = arith.addi %xindex_25, %xindex_23 : tensor<64x1xi32, #blocked> loc(#loc159) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc160) + %r0_base_28 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc160) + %r0_base_29 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc160) + %r0_base_30 = tt.expand_dims %r0_base_28 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> loc(#loc160) + %x0 = arith.remsi %xindex_26, %cst_13 : tensor<64x1xi32, #blocked1> loc(#loc161) + %x0_31 = arith.remsi %xindex_27, %cst_12 : tensor<64x1xi32, #blocked> loc(#loc161) + %x1 = arith.divsi %xindex_26, %cst_13 : tensor<64x1xi32, #blocked1> loc(#loc162) + %x1_32 = arith.divsi %xindex_27, %cst_12 : tensor<64x1xi32, #blocked> loc(#loc162) + %tmp0 = arith.muli %x0, %cst_7 : tensor<64x1xi32, #blocked1> loc(#loc163) + %tmp0_33 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc164) + %tmp0_34 = arith.muli %x1, %cst_5 : tensor<64x1xi32, #blocked1> loc(#loc165) + %tmp0_35 = tt.broadcast %tmp0_34 : tensor<64x1xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc166) + %tmp0_36 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked1> loc(#loc167) + %_tmp10:2 = scf.for %_tmp10_51 = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%arg10 = %cst_19, %arg11 = %cst_19) -> (tensor<64x64xf32, #blocked1>, tensor<64x64xf32, #blocked1>) : i32 { + %r0_index = tt.splat %_tmp10_51 : i32 -> tensor<1x64xi32, #blocked1> loc(#loc169) + %r0_index_52 = arith.addi %r0_index, %r0_base_29 : tensor<1x64xi32, #blocked1> loc(#loc169) + %r0_mask = arith.cmpi slt, %r0_index_52, %cst_11 : tensor<1x64xi32, #blocked1> loc(#loc170) + %tmp0_53 = arith.addi %r0_index_52, %cst_9 : tensor<1x64xi32, #blocked1> loc(#loc171) + %tmp0_54 = tt.broadcast %tmp0_53 : tensor<1x64xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc164) + %tmp0_55 = arith.addi %tmp0_54, %tmp0_33 : tensor<64x64xi32, #blocked1> loc(#loc164) + %tmp0_56 = arith.addi %tmp0_55, %tmp0_35 : tensor<64x64xi32, #blocked1> loc(#loc166) + %tmp0_57 = tt.addptr %tmp0_36, %tmp0_56 : tensor<64x64x!tt.ptr, #blocked1>, tensor<64x64xi32, #blocked1> loc(#loc167) + %tmp0_58 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked1> -> tensor<64x64xi1, #blocked1> loc(#loc172) + %tmp0_59 = tt.load %tmp0_57, %tmp0_58, %cst_14 evictionPolicy = evict_last : tensor<64x64x!tt.ptr, #blocked1> loc(#loc172) + %tmp0_60 = arith.extf %tmp0_59 : tensor<64x64xbf16, #blocked1> to tensor<64x64xf32, #blocked1> loc(#loc173) + %tmp6 = tt.broadcast %r0_index_52 : tensor<1x64xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc174) + %tmp6_61 = arith.addi %tmp6, %tmp0_33 : tensor<64x64xi32, #blocked1> loc(#loc174) + %tmp6_62 = arith.addi %tmp6_61, %tmp0_35 : tensor<64x64xi32, #blocked1> loc(#loc175) + %tmp6_63 = tt.addptr %tmp0_36, %tmp6_62 : tensor<64x64x!tt.ptr, #blocked1>, tensor<64x64xi32, #blocked1> loc(#loc176) + %tmp6_64 = tt.load %tmp6_63, %tmp0_58, %cst_14 evictionPolicy = evict_last : tensor<64x64x!tt.ptr, #blocked1> loc(#loc177) + %tmp6_65 = arith.extf %tmp6_64 : tensor<64x64xbf16, #blocked1> to tensor<64x64xf32, #blocked1> loc(#loc178) + %tmp2 = arith.mulf %tmp0_60, %tmp0_60 : tensor<64x64xf32, #blocked1> loc(#loc179) + %tmp5 = arith.addf %arg10, %tmp2 : tensor<64x64xf32, #blocked1> loc(#loc180) + %_tmp4 = arith.select %tmp0_58, %tmp5, %arg10 : tensor<64x64xi1, #blocked1>, tensor<64x64xf32, #blocked1> loc(#loc181) + %tmp8 = arith.mulf %tmp6_65, %tmp6_65 : tensor<64x64xf32, #blocked1> loc(#loc182) + %tmp11 = arith.addf %arg11, %tmp8 : tensor<64x64xf32, #blocked1> loc(#loc183) + %_tmp10_66 = arith.select %tmp0_58, %tmp11, %arg11 : tensor<64x64xi1, #blocked1>, tensor<64x64xf32, #blocked1> loc(#loc184) + scf.yield %_tmp4, %_tmp10_66 : tensor<64x64xf32, #blocked1>, tensor<64x64xf32, #blocked1> loc(#loc31) + } loc(#loc290) + %tmp4 = "tt.reduce"(%_tmp10#0) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_51: f32 loc(callsite(#loc1 at #loc185)), %tmp4_52: f32 loc(callsite(#loc1 at #loc185))): + %tmp4_53 = arith.addf %tmp4_51, %tmp4_52 : f32 loc(#loc297) + tt.reduce.return %tmp4_53 : f32 loc(#loc291) + }) : (tensor<64x64xf32, #blocked1>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc291) + %tmp4_37 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc186) + %tmp10 = "tt.reduce"(%_tmp10#1) <{axis = 1 : i32}> ({ + ^bb0(%tmp10_51: f32 loc(callsite(#loc1 at #loc187)), %tmp10_52: f32 loc(callsite(#loc1 at #loc187))): + %tmp10_53 = arith.addf %tmp10_51, %tmp10_52 : f32 loc(#loc298) + tt.reduce.return %tmp10_53 : f32 loc(#loc293) + }) : (tensor<64x64xf32, #blocked1>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc293) + %tmp10_38 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc188) + %tmp50 = arith.muli %x0_31, %cst_6 : tensor<64x1xi32, #blocked> loc(#loc189) + %tmp50_39 = tt.broadcast %tmp50 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc190) + %tmp50_40 = arith.muli %x1_32, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc191) + %tmp50_41 = tt.broadcast %tmp50_40 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc192) + %tmp50_42 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked> loc(#loc193) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked> loc(#loc194) + %tmp58_43 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked1> loc(#loc194) + %tmp63 = arith.muli %x1, %cst_7 : tensor<64x1xi32, #blocked1> loc(#loc195) + %tmp63_44 = tt.broadcast %tmp63 : tensor<64x1xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc196) + %tmp63_45 = tt.splat %in_ptr2 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked1> loc(#loc197) + %tmp66 = tt.splat %in_ptr3 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked1> loc(#loc198) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked> loc(#loc199) + %tmp102_46 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked1> loc(#loc199) + %tmp20 = arith.divf %tmp10_38, %cst_17 : tensor<64x1xf32, #blocked1> loc(#loc200) + %tmp22 = arith.addf %tmp20, %cst_16 : tensor<64x1xf32, #blocked1> loc(#loc201) + %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked1>) -> tensor<64x1xf32, #blocked1> loc(#loc202) + %tmp24 = ttg.convert_layout %tmp23 : tensor<64x1xf32, #blocked1> -> tensor<64x1xf32, #blocked> loc(#loc203) + %tmp24_47 = tt.broadcast %tmp24 : tensor<64x1xf32, #blocked> -> tensor<64x64xf32, #blocked> loc(#loc203) + %tmp24_48 = tt.broadcast %tmp23 : tensor<64x1xf32, #blocked1> -> tensor<64x64xf32, #blocked1> loc(#loc203) + %tmp72 = arith.divf %tmp4_37, %cst_17 : tensor<64x1xf32, #blocked1> loc(#loc204) + %tmp73 = arith.addf %tmp72, %cst_16 : tensor<64x1xf32, #blocked1> loc(#loc205) + %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked1>) -> tensor<64x1xf32, #blocked1> loc(#loc206) + %tmp75 = ttg.convert_layout %tmp74 : tensor<64x1xf32, #blocked1> -> tensor<64x1xf32, #blocked> loc(#loc207) + %tmp75_49 = tt.broadcast %tmp75 : tensor<64x1xf32, #blocked> -> tensor<64x64xf32, #blocked> loc(#loc207) + %tmp75_50 = tt.broadcast %tmp74 : tensor<64x1xf32, #blocked1> -> tensor<64x64xf32, #blocked1> loc(#loc207) + %0 = arith.muli %xindex_26, %cst_7 : tensor<64x1xi32, #blocked1> loc(#loc57) + %1 = tt.broadcast %0 : tensor<64x1xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc58) + %2 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked1> loc(#loc59) + %3 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked1> loc(#loc60) + scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32, #blocked1> loc(#loc208) + %r0_index_51 = tt.splat %r0_offset : i32 -> tensor<1x64xi32, #blocked> loc(#loc208) + %r0_index_52 = arith.addi %r0_index, %r0_base_29 : tensor<1x64xi32, #blocked1> loc(#loc208) + %r0_index_53 = arith.addi %r0_index_51, %r0_base_30 : tensor<1x64xi32, #blocked> loc(#loc208) + %r0_mask = arith.cmpi slt, %r0_index_52, %cst_11 : tensor<1x64xi32, #blocked1> loc(#loc209) + %r0_mask_54 = arith.cmpi slt, %r0_index_53, %cst_10 : tensor<1x64xi32, #blocked> loc(#loc209) + %r0_3 = arith.remsi %r0_index_53, %cst_3 : tensor<1x64xi32, #blocked> loc(#loc210) + %r0_4 = arith.divsi %r0_index_53, %cst_3 : tensor<1x64xi32, #blocked> loc(#loc211) + %tmp50_55 = tt.broadcast %r0_index_52 : tensor<1x64xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc190) + %tmp50_56 = arith.addi %tmp50_55, %tmp0_33 : tensor<64x64xi32, #blocked1> loc(#loc190) + %tmp50_57 = arith.addi %tmp50_56, %tmp0_35 : tensor<64x64xi32, #blocked1> loc(#loc192) + %tmp50_58 = tt.addptr %tmp0_36, %tmp50_57 : tensor<64x64x!tt.ptr, #blocked1>, tensor<64x64xi32, #blocked1> loc(#loc193) + %tmp50_59 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked1> -> tensor<64x64xi1, #blocked1> loc(#loc212) + %tmp50_60 = tt.load %tmp50_58, %tmp50_59, %cst_14 evictionPolicy = evict_last : tensor<64x64x!tt.ptr, #blocked1> loc(#loc212) + %tmp50_61 = arith.extf %tmp50_60 : tensor<64x64xbf16, #blocked1> to tensor<64x64xf32, #blocked1> loc(#loc213) + %tmp58_62 = tt.addptr %tmp58_43, %r0_index_52 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc194) + %tmp58_63 = tt.load %tmp58_62, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x64x!tt.ptr, #blocked1> loc(#loc214) + %tmp58_64 = arith.extf %tmp58_63 : tensor<1x64xbf16, #blocked1> to tensor<1x64xf32, #blocked1> loc(#loc215) + %tmp63_65 = arith.addi %tmp50_55, %tmp63_44 : tensor<64x64xi32, #blocked1> loc(#loc196) + %tmp63_66 = tt.addptr %tmp63_45, %tmp63_65 : tensor<64x64x!tt.ptr, #blocked1>, tensor<64x64xi32, #blocked1> loc(#loc197) + %tmp63_67 = tt.load %tmp63_66, %tmp50_59, %cst_19 evictionPolicy = evict_last : tensor<64x64x!tt.ptr, #blocked1> loc(#loc216) + %tmp66_68 = tt.addptr %tmp66, %tmp63_65 : tensor<64x64x!tt.ptr, #blocked1>, tensor<64x64xi32, #blocked1> loc(#loc198) + %tmp66_69 = tt.load %tmp66_68, %tmp50_59, %cst_19 evictionPolicy = evict_last : tensor<64x64x!tt.ptr, #blocked1> loc(#loc217) + %tmp66_70 = ttg.convert_layout %tmp66_69 : tensor<64x64xf32, #blocked1> -> tensor<64x64xf32, #blocked> loc(#loc217) + %tmp96 = arith.addi %r0_index_52, %cst_9 : tensor<1x64xi32, #blocked1> loc(#loc218) + %tmp96_71 = tt.broadcast %tmp96 : tensor<1x64xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc219) + %tmp96_72 = arith.addi %tmp96_71, %tmp0_33 : tensor<64x64xi32, #blocked1> loc(#loc219) + %tmp96_73 = arith.addi %tmp96_72, %tmp0_35 : tensor<64x64xi32, #blocked1> loc(#loc220) + %tmp96_74 = tt.addptr %tmp0_36, %tmp96_73 : tensor<64x64x!tt.ptr, #blocked1>, tensor<64x64xi32, #blocked1> loc(#loc221) + %tmp96_75 = tt.load %tmp96_74, %tmp50_59, %cst_14 evictionPolicy = evict_first : tensor<64x64x!tt.ptr, #blocked1> loc(#loc222) + %tmp96_76 = arith.extf %tmp96_75 : tensor<64x64xbf16, #blocked1> to tensor<64x64xf32, #blocked1> loc(#loc223) + %tmp102_77 = tt.addptr %tmp102_46, %r0_index_52 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc199) + %tmp102_78 = tt.load %tmp102_77, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x64x!tt.ptr, #blocked1> loc(#loc224) + %tmp102_79 = arith.extf %tmp102_78 : tensor<1x64xbf16, #blocked1> to tensor<1x64xf32, #blocked1> loc(#loc225) + %tmp16 = arith.extsi %r0_3 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked> loc(#loc226) + %tmp16_80 = arith.cmpi slt, %tmp16, %cst_2 : tensor<1x64xi64, #blocked> loc(#loc226) + %tmp17 = arith.muli %r0_4, %cst_3 : tensor<1x64xi32, #blocked> loc(#loc227) + %tmp17_81 = arith.addi %tmp17, %cst_1 : tensor<1x64xi32, #blocked> loc(#loc228) + %tmp17_82 = tt.broadcast %tmp17_81 : tensor<1x64xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc229) + %tmp17_83 = arith.addi %tmp17_82, %tmp50_39 : tensor<64x64xi32, #blocked> loc(#loc229) + %tmp17_84 = arith.addi %tmp17_83, %tmp50_41 : tensor<64x64xi32, #blocked> loc(#loc230) + %tmp17_85 = tt.addptr %tmp50_42, %tmp17_84 : tensor<64x64x!tt.ptr, #blocked>, tensor<64x64xi32, #blocked> loc(#loc231) + %tmp17_86 = arith.andi %r0_mask_54, %tmp16_80 : tensor<1x64xi1, #blocked> loc(#loc232) + %tmp17_87 = tt.broadcast %tmp17_86 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc233) + %tmp17_88 = tt.load %tmp17_85, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<64x64x!tt.ptr, #blocked> loc(#loc233) + %tmp17_89 = arith.extf %tmp17_88 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc234) + %tmp24_90 = arith.mulf %tmp17_89, %tmp24_47 : tensor<64x64xf32, #blocked> loc(#loc203) + %tmp25 = tt.addptr %tmp58, %tmp17_81 : tensor<1x64x!tt.ptr, #blocked>, tensor<1x64xi32, #blocked> loc(#loc235) + %tmp25_91 = tt.broadcast %tmp25 : tensor<1x64x!tt.ptr, #blocked> -> tensor<64x64x!tt.ptr, #blocked> loc(#loc235) + %tmp25_92 = tt.load %tmp25_91, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<64x64x!tt.ptr, #blocked> loc(#loc236) + %tmp25_93 = arith.extf %tmp25_92 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc237) + %tmp27 = arith.mulf %tmp24_90, %tmp25_93 : tensor<64x64xf32, #blocked> loc(#loc238) + %tmp29 = arith.subf %cst_18, %tmp27 : tensor<64x64xf32, #blocked> loc(#loc239) + %tmp31 = tt.broadcast %tmp16_80 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc240) + %tmp32 = arith.cmpi sge, %tmp16, %cst_2 : tensor<1x64xi64, #blocked> loc(#loc241) + %tmp35 = tt.broadcast %tmp17 : tensor<1x64xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc242) + %tmp35_94 = arith.addi %tmp35, %tmp50_39 : tensor<64x64xi32, #blocked> loc(#loc242) + %tmp35_95 = arith.addi %tmp35_94, %tmp50_41 : tensor<64x64xi32, #blocked> loc(#loc243) + %tmp35_96 = tt.addptr %tmp50_42, %tmp35_95 : tensor<64x64x!tt.ptr, #blocked>, tensor<64x64xi32, #blocked> loc(#loc244) + %tmp35_97 = arith.andi %r0_mask_54, %tmp32 : tensor<1x64xi1, #blocked> loc(#loc245) + %tmp35_98 = tt.broadcast %tmp35_97 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc246) + %tmp35_99 = tt.load %tmp35_96, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<64x64x!tt.ptr, #blocked> loc(#loc246) + %tmp35_100 = arith.extf %tmp35_99 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc247) + %tmp42 = arith.mulf %tmp35_100, %tmp24_47 : tensor<64x64xf32, #blocked> loc(#loc248) + %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x64x!tt.ptr, #blocked>, tensor<1x64xi32, #blocked> loc(#loc249) + %tmp43_101 = tt.broadcast %tmp43 : tensor<1x64x!tt.ptr, #blocked> -> tensor<64x64x!tt.ptr, #blocked> loc(#loc249) + %tmp43_102 = tt.load %tmp43_101, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<64x64x!tt.ptr, #blocked> loc(#loc250) + %tmp43_103 = arith.extf %tmp43_102 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc251) + %tmp45 = arith.mulf %tmp42, %tmp43_103 : tensor<64x64xf32, #blocked> loc(#loc252) + %tmp48 = tt.broadcast %tmp32 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc253) + %tmp48_104 = arith.select %tmp48, %tmp45, %cst_18 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked> loc(#loc253) + %tmp49 = arith.select %tmp31, %tmp29, %tmp48_104 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked> loc(#loc295) + %tmp57 = arith.mulf %tmp50_61, %tmp24_48 : tensor<64x64xf32, #blocked1> loc(#loc255) + %tmp60 = tt.broadcast %tmp58_64 : tensor<1x64xf32, #blocked1> -> tensor<64x64xf32, #blocked1> loc(#loc256) + %tmp60_105 = arith.mulf %tmp57, %tmp60 : tensor<64x64xf32, #blocked1> loc(#loc256) + %tmp64 = arith.mulf %tmp60_105, %tmp63_67 : tensor<64x64xf32, #blocked1> loc(#loc257) + %tmp64_106 = ttg.convert_layout %tmp64 : tensor<64x64xf32, #blocked1> -> tensor<64x64xf32, #blocked> loc(#loc257) + %tmp67 = arith.mulf %tmp49, %tmp66_70 : tensor<64x64xf32, #blocked> loc(#loc258) + %tmp68 = arith.addf %tmp64_106, %tmp67 : tensor<64x64xf32, #blocked> loc(#loc259) + %tmp70 = arith.addi %tmp17, %cst : tensor<1x64xi32, #blocked> loc(#loc260) + %tmp70_107 = tt.broadcast %tmp70 : tensor<1x64xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc261) + %tmp70_108 = arith.addi %tmp70_107, %tmp50_39 : tensor<64x64xi32, #blocked> loc(#loc261) + %tmp70_109 = arith.addi %tmp70_108, %tmp50_41 : tensor<64x64xi32, #blocked> loc(#loc262) + %tmp70_110 = tt.addptr %tmp50_42, %tmp70_109 : tensor<64x64x!tt.ptr, #blocked>, tensor<64x64xi32, #blocked> loc(#loc263) + %tmp70_111 = tt.load %tmp70_110, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<64x64x!tt.ptr, #blocked> loc(#loc264) + %tmp70_112 = arith.extf %tmp70_111 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc265) + %tmp75_113 = arith.mulf %tmp70_112, %tmp75_49 : tensor<64x64xf32, #blocked> loc(#loc207) + %tmp76 = tt.addptr %tmp102, %tmp17_81 : tensor<1x64x!tt.ptr, #blocked>, tensor<1x64xi32, #blocked> loc(#loc266) + %tmp76_114 = tt.broadcast %tmp76 : tensor<1x64x!tt.ptr, #blocked> -> tensor<64x64x!tt.ptr, #blocked> loc(#loc266) + %tmp76_115 = tt.load %tmp76_114, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<64x64x!tt.ptr, #blocked> loc(#loc267) + %tmp76_116 = arith.extf %tmp76_115 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc268) + %tmp78 = arith.mulf %tmp75_113, %tmp76_116 : tensor<64x64xf32, #blocked> loc(#loc269) + %tmp80 = arith.subf %cst_18, %tmp78 : tensor<64x64xf32, #blocked> loc(#loc270) + %tmp83 = arith.addi %tmp17, %cst_8 : tensor<1x64xi32, #blocked> loc(#loc271) + %tmp83_117 = tt.broadcast %tmp83 : tensor<1x64xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc272) + %tmp83_118 = arith.addi %tmp83_117, %tmp50_39 : tensor<64x64xi32, #blocked> loc(#loc272) + %tmp83_119 = arith.addi %tmp83_118, %tmp50_41 : tensor<64x64xi32, #blocked> loc(#loc273) + %tmp83_120 = tt.addptr %tmp50_42, %tmp83_119 : tensor<64x64x!tt.ptr, #blocked>, tensor<64x64xi32, #blocked> loc(#loc274) + %tmp83_121 = tt.load %tmp83_120, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<64x64x!tt.ptr, #blocked> loc(#loc275) + %tmp83_122 = arith.extf %tmp83_121 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc276) + %tmp88 = arith.mulf %tmp83_122, %tmp75_49 : tensor<64x64xf32, #blocked> loc(#loc277) + %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x64x!tt.ptr, #blocked>, tensor<1x64xi32, #blocked> loc(#loc278) + %tmp89_123 = tt.broadcast %tmp89 : tensor<1x64x!tt.ptr, #blocked> -> tensor<64x64x!tt.ptr, #blocked> loc(#loc278) + %tmp89_124 = tt.load %tmp89_123, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<64x64x!tt.ptr, #blocked> loc(#loc279) + %tmp89_125 = arith.extf %tmp89_124 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc280) + %tmp91 = arith.mulf %tmp88, %tmp89_125 : tensor<64x64xf32, #blocked> loc(#loc281) + %tmp94 = arith.select %tmp48, %tmp91, %cst_18 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked> loc(#loc282) + %tmp95 = arith.select %tmp31, %tmp80, %tmp94 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked> loc(#loc296) + %tmp101 = arith.mulf %tmp96_76, %tmp75_50 : tensor<64x64xf32, #blocked1> loc(#loc285) + %tmp104 = tt.broadcast %tmp102_79 : tensor<1x64xf32, #blocked1> -> tensor<64x64xf32, #blocked1> loc(#loc286) + %tmp104_126 = arith.mulf %tmp101, %tmp104 : tensor<64x64xf32, #blocked1> loc(#loc286) + %tmp107 = arith.mulf %tmp104_126, %tmp63_67 : tensor<64x64xf32, #blocked1> loc(#loc287) + %tmp107_127 = ttg.convert_layout %tmp107 : tensor<64x64xf32, #blocked1> -> tensor<64x64xf32, #blocked> loc(#loc287) + %tmp109 = arith.mulf %tmp95, %tmp66_70 : tensor<64x64xf32, #blocked> loc(#loc288) + %tmp110 = arith.addf %tmp107_127, %tmp109 : tensor<64x64xf32, #blocked> loc(#loc289) + %4 = arith.addi %tmp50_55, %1 : tensor<64x64xi32, #blocked1> loc(#loc58) + %5 = tt.addptr %2, %4 : tensor<64x64x!tt.ptr, #blocked1>, tensor<64x64xi32, #blocked1> loc(#loc59) + %6 = arith.truncf %tmp68 : tensor<64x64xf32, #blocked> to tensor<64x64xbf16, #blocked> loc(#loc144) + %7 = ttg.convert_layout %6 : tensor<64x64xbf16, #blocked> -> tensor<64x64xbf16, #blocked1> loc(#loc144) + tt.store %5, %7, %tmp50_59 : tensor<64x64x!tt.ptr, #blocked1> loc(#loc144) + %8 = tt.addptr %3, %4 : tensor<64x64x!tt.ptr, #blocked1>, tensor<64x64xi32, #blocked1> loc(#loc60) + %9 = arith.truncf %tmp110 : tensor<64x64xf32, #blocked> to tensor<64x64xbf16, #blocked> loc(#loc145) + %10 = ttg.convert_layout %9 : tensor<64x64xbf16, #blocked> -> tensor<64x64xbf16, #blocked1> loc(#loc145) + tt.store %8, %10, %tmp50_59 : tensor<64x64x!tt.ptr, #blocked1> loc(#loc145) + } loc(#loc61) + tt.return loc(#loc146) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8) +#loc32 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc156 = loc("xoffset"(#loc2)) +#loc157 = loc("xoffset"(#loc3)) +#loc158 = loc("xindex"(#loc4)) +#loc159 = loc("xindex"(#loc5)) +#loc160 = loc("r0_base"(#loc6)) +#loc161 = loc("x0"(#loc7)) +#loc162 = loc("x1"(#loc8)) +#loc163 = loc("tmp0"(#loc9)) +#loc164 = loc("tmp0"(#loc10)) +#loc165 = loc("tmp0"(#loc11)) +#loc166 = loc("tmp0"(#loc12)) +#loc167 = loc("tmp0"(#loc13)) +#loc168 = loc("_tmp4"(#loc14)) +#loc169 = loc("r0_index"(#loc15)) +#loc170 = loc("r0_mask"(#loc16)) +#loc171 = loc("tmp0"(#loc17)) +#loc172 = loc("tmp0"(#loc18)) +#loc173 = loc("tmp0"(#loc19)) +#loc174 = loc("tmp6"(#loc20)) +#loc175 = loc("tmp6"(#loc21)) +#loc176 = loc("tmp6"(#loc22)) +#loc177 = loc("tmp6"(#loc23)) +#loc178 = loc("tmp6"(#loc24)) +#loc179 = loc("tmp2"(#loc25)) +#loc180 = loc("tmp5"(#loc26)) +#loc181 = loc("_tmp4"(#loc27)) +#loc182 = loc("tmp8"(#loc28)) +#loc183 = loc("tmp11"(#loc29)) +#loc184 = loc("_tmp10"(#loc30)) +#loc186 = loc("tmp4"(#loc35)) +#loc188 = loc("tmp10"(#loc37)) +#loc189 = loc("tmp50"(#loc38)) +#loc190 = loc("tmp50"(#loc39)) +#loc191 = loc("tmp50"(#loc40)) +#loc192 = loc("tmp50"(#loc41)) +#loc193 = loc("tmp50"(#loc42)) +#loc194 = loc("tmp58"(#loc43)) +#loc195 = loc("tmp63"(#loc44)) +#loc196 = loc("tmp63"(#loc45)) +#loc197 = loc("tmp63"(#loc46)) +#loc198 = loc("tmp66"(#loc47)) +#loc199 = loc("tmp102"(#loc48)) +#loc200 = loc("tmp20"(#loc49)) +#loc201 = loc("tmp22"(#loc50)) +#loc202 = loc("tmp23"(#loc51)) +#loc203 = loc("tmp24"(#loc52)) +#loc204 = loc("tmp72"(#loc53)) +#loc205 = loc("tmp73"(#loc54)) +#loc206 = loc("tmp74"(#loc55)) +#loc207 = loc("tmp75"(#loc56)) +#loc208 = loc("r0_index"(#loc62)) +#loc209 = loc("r0_mask"(#loc63)) +#loc210 = loc("r0_3"(#loc64)) +#loc211 = loc("r0_4"(#loc65)) +#loc212 = loc("tmp50"(#loc66)) +#loc213 = loc("tmp50"(#loc67)) +#loc214 = loc("tmp58"(#loc68)) +#loc215 = loc("tmp58"(#loc69)) +#loc216 = loc("tmp63"(#loc70)) +#loc217 = loc("tmp66"(#loc71)) +#loc218 = loc("tmp96"(#loc72)) +#loc219 = loc("tmp96"(#loc73)) +#loc220 = loc("tmp96"(#loc74)) +#loc221 = loc("tmp96"(#loc75)) +#loc222 = loc("tmp96"(#loc76)) +#loc223 = loc("tmp96"(#loc77)) +#loc224 = loc("tmp102"(#loc78)) +#loc225 = loc("tmp102"(#loc79)) +#loc226 = loc("tmp16"(#loc80)) +#loc227 = loc("tmp17"(#loc81)) +#loc228 = loc("tmp17"(#loc82)) +#loc229 = loc("tmp17"(#loc83)) +#loc230 = loc("tmp17"(#loc84)) +#loc231 = loc("tmp17"(#loc85)) +#loc232 = loc("tmp17"(#loc86)) +#loc233 = loc("tmp17"(#loc87)) +#loc234 = loc("tmp17"(#loc88)) +#loc235 = loc("tmp25"(#loc89)) +#loc236 = loc("tmp25"(#loc90)) +#loc237 = loc("tmp25"(#loc91)) +#loc238 = loc("tmp27"(#loc92)) +#loc239 = loc("tmp29"(#loc93)) +#loc240 = loc("tmp31"(#loc94)) +#loc241 = loc("tmp32"(#loc95)) +#loc242 = loc("tmp35"(#loc96)) +#loc243 = loc("tmp35"(#loc97)) +#loc244 = loc("tmp35"(#loc98)) +#loc245 = loc("tmp35"(#loc99)) +#loc246 = loc("tmp35"(#loc100)) +#loc247 = loc("tmp35"(#loc101)) +#loc248 = loc("tmp42"(#loc102)) +#loc249 = loc("tmp43"(#loc103)) +#loc250 = loc("tmp43"(#loc104)) +#loc251 = loc("tmp43"(#loc105)) +#loc252 = loc("tmp45"(#loc106)) +#loc253 = loc("tmp48"(#loc107)) +#loc254 = loc("tmp49"(#loc108)) +#loc255 = loc("tmp57"(#loc109)) +#loc256 = loc("tmp60"(#loc110)) +#loc257 = loc("tmp64"(#loc111)) +#loc258 = loc("tmp67"(#loc112)) +#loc259 = loc("tmp68"(#loc113)) +#loc260 = loc("tmp70"(#loc114)) +#loc261 = loc("tmp70"(#loc115)) +#loc262 = loc("tmp70"(#loc116)) +#loc263 = loc("tmp70"(#loc117)) +#loc264 = loc("tmp70"(#loc118)) +#loc265 = loc("tmp70"(#loc119)) +#loc266 = loc("tmp76"(#loc120)) +#loc267 = loc("tmp76"(#loc121)) +#loc268 = loc("tmp76"(#loc122)) +#loc269 = loc("tmp78"(#loc123)) +#loc270 = loc("tmp80"(#loc124)) +#loc271 = loc("tmp83"(#loc125)) +#loc272 = loc("tmp83"(#loc126)) +#loc273 = loc("tmp83"(#loc127)) +#loc274 = loc("tmp83"(#loc128)) +#loc275 = loc("tmp83"(#loc129)) +#loc276 = loc("tmp83"(#loc130)) +#loc277 = loc("tmp88"(#loc131)) +#loc278 = loc("tmp89"(#loc132)) +#loc279 = loc("tmp89"(#loc133)) +#loc280 = loc("tmp89"(#loc134)) +#loc281 = loc("tmp91"(#loc135)) +#loc282 = loc("tmp94"(#loc136)) +#loc283 = loc("tmp95"(#loc137)) +#loc284 = loc("tmp82"(#loc138)) +#loc285 = loc("tmp101"(#loc139)) +#loc286 = loc("tmp104"(#loc140)) +#loc287 = loc("tmp107"(#loc141)) +#loc288 = loc("tmp109"(#loc142)) +#loc289 = loc("tmp110"(#loc143)) +#loc290 = loc("_tmp10"(#loc168)) +#loc291 = loc(callsite(#loc32 at #loc185)) +#loc293 = loc(callsite(#loc32 at #loc187)) +#loc295 = loc(fused[#loc254, #loc240]) +#loc296 = loc(fused[#loc283, #loc284]) +#loc297 = loc(callsite(#loc34 at #loc291)) +#loc298 = loc(callsite(#loc34 at #loc293)) diff --git a/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..c0b6bc3aae425465c1ff99634180a3fd4c3adacf --- /dev/null +++ b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir @@ -0,0 +1,516 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc1 = loc(unknown) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc148 = loc("in_out_ptr0"(#loc)) +#loc149 = loc("in_out_ptr1"(#loc)) +#loc150 = loc("in_ptr0"(#loc)) +#loc151 = loc("in_ptr1"(#loc)) +#loc152 = loc("in_ptr2"(#loc)) +#loc153 = loc("in_ptr3"(#loc)) +#loc154 = loc("in_ptr4"(#loc)) +#loc155 = loc("xnumel"(#loc)) +#loc156 = loc("r0_numel"(#loc)) +#loc187 = loc("tmp4"(#loc34)) +#loc189 = loc("tmp10"(#loc37)) +#loc294 = loc(callsite(#loc1 at #loc187)) +#loc296 = loc(callsite(#loc1 at #loc189)) +module { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<1x64xbf16> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x64xbf16> loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_1 = arith.constant dense<4097> : tensor<1x64xi32> loc(#loc1) + %cst_2 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc1) + %cst_3 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc1) + %cst_5 = arith.constant dense<1> : tensor<1x64xi64> loc(#loc1) + %cst_6 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc1) + %cst_7 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc1) + %cst_8 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1) + %cst_9 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc1) + %cst_10 = arith.constant dense<128> : tensor<1x64xi32> loc(#loc1) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc1) + %cst_12 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc157) + %xoffset_13 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc158) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc159) + %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc160) + %xindex_15 = tt.splat %xoffset_13 : i32 -> tensor<64x1xi32> loc(#loc161) + %xindex_16 = arith.addi %xindex_15, %xindex_14 : tensor<64x1xi32> loc(#loc161) + %r0_base = tt.expand_dims %xindex {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc162) + %x0 = arith.remsi %xindex_16, %cst_12 : tensor<64x1xi32> loc(#loc163) + %x1 = arith.divsi %xindex_16, %cst_12 : tensor<64x1xi32> loc(#loc164) + %_tmp10:2 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%_tmp4 = %cst_11, %_tmp10_19 = %cst_11) -> (tensor<64x64xf32>, tensor<64x64xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc166) + %r0_index_20 = arith.addi %r0_index, %r0_base : tensor<1x64xi32> loc(#loc166) + %r0_mask = arith.cmpi slt, %r0_index_20, %cst_10 : tensor<1x64xi32> loc(#loc167) + %tmp0 = arith.addi %r0_index_20, %cst_9 : tensor<1x64xi32> loc(#loc168) + %tmp0_21 = arith.muli %x0, %cst_8 : tensor<64x1xi32> loc(#loc169) + %tmp0_22 = tt.broadcast %tmp0 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc170) + %tmp0_23 = tt.broadcast %tmp0_21 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc170) + %tmp0_24 = arith.addi %tmp0_22, %tmp0_23 : tensor<64x64xi32> loc(#loc170) + %tmp0_25 = arith.muli %x1, %cst_7 : tensor<64x1xi32> loc(#loc171) + %tmp0_26 = tt.broadcast %tmp0_25 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc172) + %tmp0_27 = arith.addi %tmp0_24, %tmp0_26 : tensor<64x64xi32> loc(#loc172) + %tmp0_28 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc173) + %tmp0_29 = tt.addptr %tmp0_28, %tmp0_27 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc173) + %tmp0_30 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc174) + %tmp0_31 = tt.load %tmp0_29, %tmp0_30, %cst_0 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc174) + %tmp0_32 = arith.extf %tmp0_31 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc175) + %tmp6 = tt.broadcast %r0_index_20 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc176) + %tmp6_33 = arith.addi %tmp6, %tmp0_23 : tensor<64x64xi32> loc(#loc176) + %tmp6_34 = arith.addi %tmp6_33, %tmp0_26 : tensor<64x64xi32> loc(#loc177) + %tmp6_35 = tt.addptr %tmp0_28, %tmp6_34 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc178) + %tmp6_36 = tt.load %tmp6_35, %tmp0_30, %cst_0 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc179) + %tmp6_37 = arith.extf %tmp6_36 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc180) + %tmp2 = arith.mulf %tmp0_32, %tmp0_32 : tensor<64x64xf32> loc(#loc181) + %tmp5 = arith.addf %_tmp4, %tmp2 : tensor<64x64xf32> loc(#loc182) + %_tmp4_38 = arith.select %tmp0_30, %tmp5, %_tmp4 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc183) + %tmp8 = arith.mulf %tmp6_37, %tmp6_37 : tensor<64x64xf32> loc(#loc184) + %tmp11 = arith.addf %_tmp10_19, %tmp8 : tensor<64x64xf32> loc(#loc185) + %_tmp10_39 = arith.select %tmp0_30, %tmp11, %_tmp10_19 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc186) + scf.yield %_tmp4_38, %_tmp10_39 : tensor<64x64xf32>, tensor<64x64xf32> loc(#loc32) + } loc(#loc292) + %tmp4 = "tt.reduce"(%_tmp10#0) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_19: f32 loc(callsite(#loc1 at #loc187)), %tmp4_20: f32 loc(callsite(#loc1 at #loc187))): + %tmp4_21 = arith.addf %tmp4_19, %tmp4_20 : f32 loc(#loc297) + tt.reduce.return %tmp4_21 : f32 loc(#loc293) + }) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc293) + %tmp4_17 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc188) + %tmp10 = "tt.reduce"(%_tmp10#1) <{axis = 1 : i32}> ({ + ^bb0(%tmp10_19: f32 loc(callsite(#loc1 at #loc189)), %tmp10_20: f32 loc(callsite(#loc1 at #loc189))): + %tmp10_21 = arith.addf %tmp10_19, %tmp10_20 : f32 loc(#loc298) + tt.reduce.return %tmp10_21 : f32 loc(#loc295) + }) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc295) + %tmp10_18 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc190) + scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc191) + %r0_index_19 = arith.addi %r0_index, %r0_base : tensor<1x64xi32> loc(#loc191) + %r0_mask = arith.cmpi slt, %r0_index_19, %cst_10 : tensor<1x64xi32> loc(#loc192) + %r0_3 = arith.remsi %r0_index_19, %cst_6 : tensor<1x64xi32> loc(#loc193) + %r0_4 = arith.divsi %r0_index_19, %cst_6 : tensor<1x64xi32> loc(#loc194) + %tmp50 = arith.muli %x0, %cst_8 : tensor<64x1xi32> loc(#loc195) + %tmp50_20 = tt.broadcast %r0_index_19 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc196) + %tmp50_21 = tt.broadcast %tmp50 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc196) + %tmp50_22 = arith.addi %tmp50_20, %tmp50_21 : tensor<64x64xi32> loc(#loc196) + %tmp50_23 = arith.muli %x1, %cst_7 : tensor<64x1xi32> loc(#loc197) + %tmp50_24 = tt.broadcast %tmp50_23 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc198) + %tmp50_25 = arith.addi %tmp50_22, %tmp50_24 : tensor<64x64xi32> loc(#loc198) + %tmp50_26 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc199) + %tmp50_27 = tt.addptr %tmp50_26, %tmp50_25 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc199) + %tmp50_28 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc200) + %tmp50_29 = tt.load %tmp50_27, %tmp50_28, %cst_0 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc200) + %tmp50_30 = arith.extf %tmp50_29 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc201) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc202) + %tmp58_31 = tt.addptr %tmp58, %r0_index_19 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc202) + %tmp58_32 = tt.load %tmp58_31, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x64x!tt.ptr> loc(#loc203) + %tmp58_33 = arith.extf %tmp58_32 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc204) + %tmp63 = arith.muli %x1, %cst_8 : tensor<64x1xi32> loc(#loc205) + %tmp63_34 = tt.broadcast %tmp63 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc206) + %tmp63_35 = arith.addi %tmp50_20, %tmp63_34 : tensor<64x64xi32> loc(#loc206) + %tmp63_36 = tt.splat %in_ptr2 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc207) + %tmp63_37 = tt.addptr %tmp63_36, %tmp63_35 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc207) + %tmp63_38 = tt.load %tmp63_37, %tmp50_28, %cst_11 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc208) + %tmp66 = tt.splat %in_ptr3 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc209) + %tmp66_39 = tt.addptr %tmp66, %tmp63_35 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc209) + %tmp66_40 = tt.load %tmp66_39, %tmp50_28, %cst_11 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc210) + %tmp96 = arith.addi %r0_index_19, %cst_9 : tensor<1x64xi32> loc(#loc211) + %tmp96_41 = tt.broadcast %tmp96 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc212) + %tmp96_42 = arith.addi %tmp96_41, %tmp50_21 : tensor<64x64xi32> loc(#loc212) + %tmp96_43 = arith.addi %tmp96_42, %tmp50_24 : tensor<64x64xi32> loc(#loc213) + %tmp96_44 = tt.addptr %tmp50_26, %tmp96_43 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc214) + %tmp96_45 = tt.load %tmp96_44, %tmp50_28, %cst_0 evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc215) + %tmp96_46 = arith.extf %tmp96_45 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc216) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc217) + %tmp102_47 = tt.addptr %tmp102, %r0_index_19 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc217) + %tmp102_48 = tt.load %tmp102_47, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x64x!tt.ptr> loc(#loc218) + %tmp102_49 = arith.extf %tmp102_48 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc219) + %tmp16 = arith.extsi %r0_3 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc220) + %tmp16_50 = arith.cmpi slt, %tmp16, %cst_5 : tensor<1x64xi64> loc(#loc220) + %tmp17 = arith.muli %r0_4, %cst_6 : tensor<1x64xi32> loc(#loc221) + %tmp17_51 = arith.addi %tmp17, %cst_4 : tensor<1x64xi32> loc(#loc222) + %tmp17_52 = tt.broadcast %tmp17_51 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc223) + %tmp17_53 = arith.addi %tmp17_52, %tmp50_21 : tensor<64x64xi32> loc(#loc223) + %tmp17_54 = arith.addi %tmp17_53, %tmp50_24 : tensor<64x64xi32> loc(#loc224) + %tmp17_55 = tt.addptr %tmp50_26, %tmp17_54 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc225) + %tmp17_56 = arith.andi %r0_mask, %tmp16_50 : tensor<1x64xi1> loc(#loc226) + %tmp17_57 = tt.broadcast %tmp17_56 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc227) + %tmp17_58 = tt.load %tmp17_55, %tmp17_57, %cst_0 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc227) + %tmp17_59 = arith.extf %tmp17_58 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc228) + %tmp20 = arith.divf %tmp10_18, %cst_3 : tensor<64x1xf32> loc(#loc229) + %tmp22 = arith.addf %tmp20, %cst_2 : tensor<64x1xf32> loc(#loc230) + %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc231) + %tmp24 = tt.broadcast %tmp23 : tensor<64x1xf32> -> tensor<64x64xf32> loc(#loc232) + %tmp24_60 = arith.mulf %tmp17_59, %tmp24 : tensor<64x64xf32> loc(#loc232) + %tmp25 = tt.addptr %tmp58, %tmp17_51 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc233) + %tmp25_61 = tt.broadcast %tmp25 : tensor<1x64x!tt.ptr> -> tensor<64x64x!tt.ptr> loc(#loc233) + %tmp25_62 = tt.load %tmp25_61, %tmp17_57, %cst_0 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc234) + %tmp25_63 = arith.extf %tmp25_62 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc235) + %tmp27 = arith.mulf %tmp24_60, %tmp25_63 : tensor<64x64xf32> loc(#loc236) + %tmp29 = arith.subf %cst_11, %tmp27 : tensor<64x64xf32> loc(#loc237) + %tmp31 = tt.broadcast %tmp16_50 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc238) + %tmp31_64 = arith.select %tmp31, %tmp29, %cst_11 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc238) + %tmp32 = arith.cmpi sge, %tmp16, %cst_5 : tensor<1x64xi64> loc(#loc239) + %tmp35 = tt.broadcast %tmp17 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc240) + %tmp35_65 = arith.addi %tmp35, %tmp50_21 : tensor<64x64xi32> loc(#loc240) + %tmp35_66 = arith.addi %tmp35_65, %tmp50_24 : tensor<64x64xi32> loc(#loc241) + %tmp35_67 = tt.addptr %tmp50_26, %tmp35_66 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc242) + %tmp35_68 = arith.andi %r0_mask, %tmp32 : tensor<1x64xi1> loc(#loc243) + %tmp35_69 = tt.broadcast %tmp35_68 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc244) + %tmp35_70 = tt.load %tmp35_67, %tmp35_69, %cst_0 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc244) + %tmp35_71 = arith.extf %tmp35_70 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc245) + %tmp42 = arith.mulf %tmp35_71, %tmp24 : tensor<64x64xf32> loc(#loc246) + %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc247) + %tmp43_72 = tt.broadcast %tmp43 : tensor<1x64x!tt.ptr> -> tensor<64x64x!tt.ptr> loc(#loc247) + %tmp43_73 = tt.load %tmp43_72, %tmp35_69, %cst_0 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc248) + %tmp43_74 = arith.extf %tmp43_73 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc249) + %tmp45 = arith.mulf %tmp42, %tmp43_74 : tensor<64x64xf32> loc(#loc250) + %tmp48 = tt.broadcast %tmp32 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc251) + %tmp48_75 = arith.select %tmp48, %tmp45, %cst_11 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc251) + %tmp49 = arith.select %tmp31, %tmp31_64, %tmp48_75 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc252) + %tmp57 = arith.mulf %tmp50_30, %tmp24 : tensor<64x64xf32> loc(#loc253) + %tmp60 = tt.broadcast %tmp58_33 : tensor<1x64xf32> -> tensor<64x64xf32> loc(#loc254) + %tmp60_76 = arith.mulf %tmp57, %tmp60 : tensor<64x64xf32> loc(#loc254) + %tmp64 = arith.mulf %tmp60_76, %tmp63_38 : tensor<64x64xf32> loc(#loc255) + %tmp67 = arith.mulf %tmp49, %tmp66_40 : tensor<64x64xf32> loc(#loc256) + %tmp68 = arith.addf %tmp64, %tmp67 : tensor<64x64xf32> loc(#loc257) + %tmp70 = arith.addi %tmp17, %cst_1 : tensor<1x64xi32> loc(#loc258) + %tmp70_77 = tt.broadcast %tmp70 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc259) + %tmp70_78 = arith.addi %tmp70_77, %tmp50_21 : tensor<64x64xi32> loc(#loc259) + %tmp70_79 = arith.addi %tmp70_78, %tmp50_24 : tensor<64x64xi32> loc(#loc260) + %tmp70_80 = tt.addptr %tmp50_26, %tmp70_79 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc261) + %tmp70_81 = tt.load %tmp70_80, %tmp17_57, %cst_0 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc262) + %tmp70_82 = arith.extf %tmp70_81 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc263) + %tmp72 = arith.divf %tmp4_17, %cst_3 : tensor<64x1xf32> loc(#loc264) + %tmp73 = arith.addf %tmp72, %cst_2 : tensor<64x1xf32> loc(#loc265) + %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc266) + %tmp75 = tt.broadcast %tmp74 : tensor<64x1xf32> -> tensor<64x64xf32> loc(#loc267) + %tmp75_83 = arith.mulf %tmp70_82, %tmp75 : tensor<64x64xf32> loc(#loc267) + %tmp76 = tt.addptr %tmp102, %tmp17_51 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc268) + %tmp76_84 = tt.broadcast %tmp76 : tensor<1x64x!tt.ptr> -> tensor<64x64x!tt.ptr> loc(#loc268) + %tmp76_85 = tt.load %tmp76_84, %tmp17_57, %cst_0 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc269) + %tmp76_86 = arith.extf %tmp76_85 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc270) + %tmp78 = arith.mulf %tmp75_83, %tmp76_86 : tensor<64x64xf32> loc(#loc271) + %tmp80 = arith.subf %cst_11, %tmp78 : tensor<64x64xf32> loc(#loc272) + %tmp82 = arith.select %tmp31, %tmp80, %cst_11 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc273) + %tmp83 = arith.addi %tmp17, %cst_9 : tensor<1x64xi32> loc(#loc274) + %tmp83_87 = tt.broadcast %tmp83 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc275) + %tmp83_88 = arith.addi %tmp83_87, %tmp50_21 : tensor<64x64xi32> loc(#loc275) + %tmp83_89 = arith.addi %tmp83_88, %tmp50_24 : tensor<64x64xi32> loc(#loc276) + %tmp83_90 = tt.addptr %tmp50_26, %tmp83_89 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc277) + %tmp83_91 = tt.load %tmp83_90, %tmp35_69, %cst_0 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc278) + %tmp83_92 = arith.extf %tmp83_91 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc279) + %tmp88 = arith.mulf %tmp83_92, %tmp75 : tensor<64x64xf32> loc(#loc280) + %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc281) + %tmp89_93 = tt.broadcast %tmp89 : tensor<1x64x!tt.ptr> -> tensor<64x64x!tt.ptr> loc(#loc281) + %tmp89_94 = tt.load %tmp89_93, %tmp35_69, %cst_0 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc282) + %tmp89_95 = arith.extf %tmp89_94 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc283) + %tmp91 = arith.mulf %tmp88, %tmp89_95 : tensor<64x64xf32> loc(#loc284) + %tmp94 = arith.select %tmp48, %tmp91, %cst_11 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc285) + %tmp95 = arith.select %tmp31, %tmp82, %tmp94 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc286) + %tmp101 = arith.mulf %tmp96_46, %tmp75 : tensor<64x64xf32> loc(#loc287) + %tmp104 = tt.broadcast %tmp102_49 : tensor<1x64xf32> -> tensor<64x64xf32> loc(#loc288) + %tmp104_96 = arith.mulf %tmp101, %tmp104 : tensor<64x64xf32> loc(#loc288) + %tmp107 = arith.mulf %tmp104_96, %tmp63_38 : tensor<64x64xf32> loc(#loc289) + %tmp109 = arith.mulf %tmp95, %tmp66_40 : tensor<64x64xf32> loc(#loc290) + %tmp110 = arith.addf %tmp107, %tmp109 : tensor<64x64xf32> loc(#loc291) + %0 = arith.muli %xindex_16, %cst_8 : tensor<64x1xi32> loc(#loc141) + %1 = tt.broadcast %0 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc142) + %2 = arith.addi %tmp50_20, %1 : tensor<64x64xi32> loc(#loc142) + %3 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc143) + %4 = tt.addptr %3, %2 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc143) + %5 = arith.truncf %tmp68 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc144) + tt.store %4, %5, %tmp50_28 : tensor<64x64x!tt.ptr> loc(#loc144) + %6 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc145) + %7 = tt.addptr %6, %2 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc145) + %8 = arith.truncf %tmp110 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc146) + tt.store %7, %8, %tmp50_28 : tensor<64x64x!tt.ptr> loc(#loc146) + } loc(#loc39) + tt.return loc(#loc147) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc35 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc157 = loc("xoffset"(#loc2)) +#loc158 = loc("xoffset"(#loc3)) +#loc159 = loc("xindex"(#loc4)) +#loc160 = loc("xindex"(#loc5)) +#loc161 = loc("xindex"(#loc6)) +#loc162 = loc("r0_base"(#loc7)) +#loc163 = loc("x0"(#loc8)) +#loc164 = loc("x1"(#loc9)) +#loc165 = loc("_tmp4"(#loc10)) +#loc166 = loc("r0_index"(#loc11)) +#loc167 = loc("r0_mask"(#loc12)) +#loc168 = loc("tmp0"(#loc13)) +#loc169 = loc("tmp0"(#loc14)) +#loc170 = loc("tmp0"(#loc15)) +#loc171 = loc("tmp0"(#loc16)) +#loc172 = loc("tmp0"(#loc17)) +#loc173 = loc("tmp0"(#loc18)) +#loc174 = loc("tmp0"(#loc19)) +#loc175 = loc("tmp0"(#loc20)) +#loc176 = loc("tmp6"(#loc21)) +#loc177 = loc("tmp6"(#loc22)) +#loc178 = loc("tmp6"(#loc23)) +#loc179 = loc("tmp6"(#loc24)) +#loc180 = loc("tmp6"(#loc25)) +#loc181 = loc("tmp2"(#loc26)) +#loc182 = loc("tmp5"(#loc27)) +#loc183 = loc("_tmp4"(#loc28)) +#loc184 = loc("tmp8"(#loc29)) +#loc185 = loc("tmp11"(#loc30)) +#loc186 = loc("_tmp10"(#loc31)) +#loc188 = loc("tmp4"(#loc36)) +#loc190 = loc("tmp10"(#loc38)) +#loc191 = loc("r0_index"(#loc40)) +#loc192 = loc("r0_mask"(#loc41)) +#loc193 = loc("r0_3"(#loc42)) +#loc194 = loc("r0_4"(#loc43)) +#loc195 = loc("tmp50"(#loc44)) +#loc196 = loc("tmp50"(#loc45)) +#loc197 = loc("tmp50"(#loc46)) +#loc198 = loc("tmp50"(#loc47)) +#loc199 = loc("tmp50"(#loc48)) +#loc200 = loc("tmp50"(#loc49)) +#loc201 = loc("tmp50"(#loc50)) +#loc202 = loc("tmp58"(#loc51)) +#loc203 = loc("tmp58"(#loc52)) +#loc204 = loc("tmp58"(#loc53)) +#loc205 = loc("tmp63"(#loc54)) +#loc206 = loc("tmp63"(#loc55)) +#loc207 = loc("tmp63"(#loc56)) +#loc208 = loc("tmp63"(#loc57)) +#loc209 = loc("tmp66"(#loc58)) +#loc210 = loc("tmp66"(#loc59)) +#loc211 = loc("tmp96"(#loc60)) +#loc212 = loc("tmp96"(#loc61)) +#loc213 = loc("tmp96"(#loc62)) +#loc214 = loc("tmp96"(#loc63)) +#loc215 = loc("tmp96"(#loc64)) +#loc216 = loc("tmp96"(#loc65)) +#loc217 = loc("tmp102"(#loc66)) +#loc218 = loc("tmp102"(#loc67)) +#loc219 = loc("tmp102"(#loc68)) +#loc220 = loc("tmp16"(#loc69)) +#loc221 = loc("tmp17"(#loc70)) +#loc222 = loc("tmp17"(#loc71)) +#loc223 = loc("tmp17"(#loc72)) +#loc224 = loc("tmp17"(#loc73)) +#loc225 = loc("tmp17"(#loc74)) +#loc226 = loc("tmp17"(#loc75)) +#loc227 = loc("tmp17"(#loc76)) +#loc228 = loc("tmp17"(#loc77)) +#loc229 = loc("tmp20"(#loc78)) +#loc230 = loc("tmp22"(#loc79)) +#loc231 = loc("tmp23"(#loc80)) +#loc232 = loc("tmp24"(#loc81)) +#loc233 = loc("tmp25"(#loc82)) +#loc234 = loc("tmp25"(#loc83)) +#loc235 = loc("tmp25"(#loc84)) +#loc236 = loc("tmp27"(#loc85)) +#loc237 = loc("tmp29"(#loc86)) +#loc238 = loc("tmp31"(#loc87)) +#loc239 = loc("tmp32"(#loc88)) +#loc240 = loc("tmp35"(#loc89)) +#loc241 = loc("tmp35"(#loc90)) +#loc242 = loc("tmp35"(#loc91)) +#loc243 = loc("tmp35"(#loc92)) +#loc244 = loc("tmp35"(#loc93)) +#loc245 = loc("tmp35"(#loc94)) +#loc246 = loc("tmp42"(#loc95)) +#loc247 = loc("tmp43"(#loc96)) +#loc248 = loc("tmp43"(#loc97)) +#loc249 = loc("tmp43"(#loc98)) +#loc250 = loc("tmp45"(#loc99)) +#loc251 = loc("tmp48"(#loc100)) +#loc252 = loc("tmp49"(#loc101)) +#loc253 = loc("tmp57"(#loc102)) +#loc254 = loc("tmp60"(#loc103)) +#loc255 = loc("tmp64"(#loc104)) +#loc256 = loc("tmp67"(#loc105)) +#loc257 = loc("tmp68"(#loc106)) +#loc258 = loc("tmp70"(#loc107)) +#loc259 = loc("tmp70"(#loc108)) +#loc260 = loc("tmp70"(#loc109)) +#loc261 = loc("tmp70"(#loc110)) +#loc262 = loc("tmp70"(#loc111)) +#loc263 = loc("tmp70"(#loc112)) +#loc264 = loc("tmp72"(#loc113)) +#loc265 = loc("tmp73"(#loc114)) +#loc266 = loc("tmp74"(#loc115)) +#loc267 = loc("tmp75"(#loc116)) +#loc268 = loc("tmp76"(#loc117)) +#loc269 = loc("tmp76"(#loc118)) +#loc270 = loc("tmp76"(#loc119)) +#loc271 = loc("tmp78"(#loc120)) +#loc272 = loc("tmp80"(#loc121)) +#loc273 = loc("tmp82"(#loc122)) +#loc274 = loc("tmp83"(#loc123)) +#loc275 = loc("tmp83"(#loc124)) +#loc276 = loc("tmp83"(#loc125)) +#loc277 = loc("tmp83"(#loc126)) +#loc278 = loc("tmp83"(#loc127)) +#loc279 = loc("tmp83"(#loc128)) +#loc280 = loc("tmp88"(#loc129)) +#loc281 = loc("tmp89"(#loc130)) +#loc282 = loc("tmp89"(#loc131)) +#loc283 = loc("tmp89"(#loc132)) +#loc284 = loc("tmp91"(#loc133)) +#loc285 = loc("tmp94"(#loc134)) +#loc286 = loc("tmp95"(#loc135)) +#loc287 = loc("tmp101"(#loc136)) +#loc288 = loc("tmp104"(#loc137)) +#loc289 = loc("tmp107"(#loc138)) +#loc290 = loc("tmp109"(#loc139)) +#loc291 = loc("tmp110"(#loc140)) +#loc292 = loc("_tmp10"(#loc165)) +#loc293 = loc(callsite(#loc33 at #loc187)) +#loc295 = loc(callsite(#loc33 at #loc189)) +#loc297 = loc(callsite(#loc35 at #loc293)) +#loc298 = loc(callsite(#loc35 at #loc295)) diff --git a/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/__grp__triton_red_fused__fused_rms_norm_view_1.json b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/__grp__triton_red_fused__fused_rms_norm_view_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1bcac25deb8f4c0f2844e8c053ac818a34f9ede5 --- /dev/null +++ b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/__grp__triton_red_fused__fused_rms_norm_view_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__fused_rms_norm_view_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.source", "triton_red_fused__fused_rms_norm_view_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.ttir", "triton_red_fused__fused_rms_norm_view_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.ttgir", "triton_red_fused__fused_rms_norm_view_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.llir", "triton_red_fused__fused_rms_norm_view_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.ptx", "triton_red_fused__fused_rms_norm_view_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.cubin", "triton_red_fused__fused_rms_norm_view_1.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.json"}} \ No newline at end of file diff --git a/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.cubin b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..70bf519b261812a409ae16a50350985ad0159570 Binary files /dev/null and b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.cubin differ diff --git a/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.json b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.json new file mode 100644 index 0000000000000000000000000000000000000000..83e0ebd0e54693c260fc02493452983647f848ba --- /dev/null +++ b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.json @@ -0,0 +1 @@ +{"hash": "10ca8e4b4982bd4e4f9030475d84ce5adc1ad2d514c413139d7a54bcdc665eab", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 32, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm_view_1"} \ No newline at end of file diff --git a/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.llir b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..02f3ff04eebd305ef59d35ebae5455700069b478 --- /dev/null +++ b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.llir @@ -0,0 +1,136 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__fused_rms_norm_view_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = shl i32 %7, 3, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = and i32 %9, 224, !dbg !9 + %11 = lshr exact i32 %10, 5, !dbg !9 + %12 = and i32 %9, 7, !dbg !9 + %13 = or disjoint i32 %11, %8, !dbg !10 + %14 = or disjoint i32 %8, %12, !dbg !10 + %15 = shl nuw nsw i32 %9, 2, !dbg !11 + %16 = and i32 %15, 124, !dbg !11 + %17 = sdiv i32 %13, 32, !dbg !12 + %18 = mul i32 %17, 32, !dbg !13 + %.decomposed = sub i32 %13, %18, !dbg !13 + %19 = shl nsw i32 %.decomposed, 7, !dbg !14 + %20 = or disjoint i32 %19, %16, !dbg !15 + %21 = mul i32 %17, 12288, !dbg !16 + %22 = add i32 %20, %21, !dbg !17 + %23 = sext i32 %22 to i64, !dbg !18 + %24 = getelementptr bfloat, ptr addrspace(1) %0, i64 %23, !dbg !18 + %25 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %26 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %24, i64 %25, i1 true) #4, !dbg !19 + %27 = extractvalue { i32, i32 } %26, 0, !dbg !19 + %28 = bitcast i32 %27 to <2 x bfloat>, !dbg !19 + %29 = extractvalue { i32, i32 } %26, 1, !dbg !19 + %30 = bitcast i32 %29 to <2 x bfloat>, !dbg !19 + %31 = extractelement <2 x bfloat> %28, i64 0, !dbg !19 + %32 = extractelement <2 x bfloat> %28, i64 1, !dbg !19 + %33 = extractelement <2 x bfloat> %30, i64 0, !dbg !19 + %34 = extractelement <2 x bfloat> %30, i64 1, !dbg !19 + %35 = fpext bfloat %31 to float, !dbg !20 + %36 = fpext bfloat %32 to float, !dbg !20 + %37 = fpext bfloat %33 to float, !dbg !20 + %38 = fpext bfloat %34 to float, !dbg !20 + %39 = fmul float %35, %35, !dbg !21 + %40 = fmul float %36, %36, !dbg !21 + %41 = fmul float %37, %37, !dbg !21 + %42 = fmul float %38, %38, !dbg !21 + %43 = fadd float %39, %40, !dbg !22 + %44 = fadd float %41, %43, !dbg !22 + %45 = fadd float %42, %44, !dbg !22 + %46 = bitcast float %45 to i32, !dbg !25 + %47 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %46, i32 16, i32 31), !dbg !25 + %48 = bitcast i32 %47 to float, !dbg !25 + %49 = fadd float %45, %48, !dbg !22 + %50 = bitcast float %49 to i32, !dbg !25 + %51 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %50, i32 8, i32 31), !dbg !25 + %52 = bitcast i32 %51 to float, !dbg !25 + %53 = fadd float %49, %52, !dbg !22 + %54 = bitcast float %53 to i32, !dbg !25 + %55 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %54, i32 4, i32 31), !dbg !25 + %56 = bitcast i32 %55 to float, !dbg !25 + %57 = fadd float %53, %56, !dbg !22 + %58 = bitcast float %57 to i32, !dbg !25 + %59 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %58, i32 2, i32 31), !dbg !25 + %60 = bitcast i32 %59 to float, !dbg !25 + %61 = fadd float %57, %60, !dbg !22 + %62 = bitcast float %61 to i32, !dbg !25 + %63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %62, i32 1, i32 31), !dbg !25 + %64 = bitcast i32 %63 to float, !dbg !25 + %65 = fadd float %61, %64, !dbg !22 + %66 = lshr exact i32 %10, 3, !dbg !28 + %67 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %66, !dbg !28 + store float %65, ptr addrspace(3) %67, align 4, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %68 = shl nuw nsw i32 %12, 2, !dbg !28 + %69 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %68, !dbg !28 + %70 = load i32, ptr addrspace(3) %69, align 4, !dbg !28 + %71 = sext i32 %14 to i64, !dbg !29 + %72 = getelementptr float, ptr addrspace(1) %1, i64 %71, !dbg !29 + %73 = and i32 %9, 248, !dbg !30 + %74 = icmp eq i32 %73, 0, !dbg !30 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %70, ptr addrspace(1) %72, i1 %74) #4, !dbg !30 + ret void, !dbg !31 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm_view_1", linkageName: "triton_red_fused__fused_rms_norm_view_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 19, scope: !4) +!13 = !DILocation(line: 28, column: 19, scope: !4) +!14 = !DILocation(line: 38, column: 45, scope: !4) +!15 = !DILocation(line: 38, column: 41, scope: !4) +!16 = !DILocation(line: 38, column: 56, scope: !4) +!17 = !DILocation(line: 38, column: 50, scope: !4) +!18 = !DILocation(line: 38, column: 34, scope: !4) +!19 = !DILocation(line: 38, column: 61, scope: !4) +!20 = !DILocation(line: 38, column: 115, scope: !4) +!21 = !DILocation(line: 40, column: 22, scope: !4) +!22 = !DILocation(line: 263, column: 15, scope: !23, inlinedAt: !25) +!23 = distinct !DILexicalBlockFile(scope: !4, file: !24, discriminator: 0) +!24 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!25 = !DILocation(line: 293, column: 36, scope: !23, inlinedAt: !26) +!26 = !DILocation(line: 44, column: 25, scope: !27) +!27 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0) +!28 = !DILocation(line: 44, column: 28, scope: !4) +!29 = !DILocation(line: 45, column: 25, scope: !4) +!30 = !DILocation(line: 45, column: 36, scope: !4) +!31 = !DILocation(line: 45, column: 4, scope: !4) diff --git a/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.ptx b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..4e426c60e1b24ea0ec9550ecbdada64d09301a6b --- /dev/null +++ b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.ptx @@ -0,0 +1,506 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused__fused_rms_norm_view_1 // -- Begin function triton_red_fused__fused_rms_norm_view_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__fused_rms_norm_view_1 +.visible .entry triton_red_fused__fused_rms_norm_view_1( + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_1, + .param .u32 triton_red_fused__fused_rms_norm_view_1_param_2, + .param .u32 triton_red_fused__fused_rms_norm_view_1_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_5 +) +.reqntid 256 +{ + .reg .pred %p<3>; + .reg .b16 %rs<5>; + .reg .b32 %r<48>; + .reg .b64 %rd<6>; + .loc 1 18 0 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:18:0 + +// %bb.0: + ld.param.b64 %rd4, [triton_red_fused__fused_rms_norm_view_1_param_0]; + ld.param.b64 %rd5, [triton_red_fused__fused_rms_norm_view_1_param_1]; +$L__tmp0: + .loc 1 23 28 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:23:28 + mov.u32 %r5, %ctaid.x; + .loc 1 23 33 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:23:33 + shl.b32 %r6, %r5, 3; + .loc 1 24 44 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:44 + mov.u32 %r7, %tid.x; + and.b32 %r8, %r7, 224; + bfe.u32 %r9, %r7, 5, 3; + and.b32 %r10, %r7, 7; + .loc 1 24 23 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:23 + or.b32 %r11, %r9, %r6; + or.b32 %r12, %r6, %r10; + .loc 1 26 37 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:26:37 + shl.b32 %r13, %r7, 2; + and.b32 %r14, %r13, 124; + .loc 1 29 19 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:29:19 + bfe.s32 %r15, %r5, 28, 1; + shr.u32 %r16, %r15, 27; + add.s32 %r17, %r11, %r16; + shr.u32 %r18, %r17, 5; + .loc 1 28 19 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:28:19 + and.b32 %r19, %r17, 33554400; + sub.s32 %r20, %r11, %r19; + .loc 1 38 45 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:45 + shl.b32 %r21, %r20, 7; + .loc 1 38 41 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:41 + or.b32 %r22, %r21, %r14; + .loc 1 38 50 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:50 + mad.lo.s32 %r23, %r18, 12288, %r22; + .loc 1 38 34 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:34 + mad.wide.s32 %rd1, %r23, 2, %rd4; + .loc 1 38 61 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:61 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r3, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r1, %r3; + mov.u32 %r2, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2; + // end inline asm + mov.b32 {%rs1, %rs2}, %r1; + mov.b32 {%rs3, %rs4}, %r2; + .loc 1 38 115 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:115 + cvt.f32.bf16 %r24, %rs1; + cvt.f32.bf16 %r25, %rs2; + cvt.f32.bf16 %r26, %rs3; + cvt.f32.bf16 %r27, %rs4; + .loc 1 40 22 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:40:22 + mul.f32 %r28, %r25, %r25; +$L__tmp1: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + fma.rn.f32 %r29, %r24, %r24, %r28; + fma.rn.f32 %r30, %r26, %r26, %r29; + fma.rn.f32 %r31, %r27, %r27, %r30; +$L__tmp2: + .loc 2 293 36 // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] + shfl.sync.bfly.b32 %r32, %r31, 16, 31, -1; +$L__tmp3: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + add.f32 %r33, %r31, %r32; +$L__tmp4: + .loc 2 293 36 // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] + shfl.sync.bfly.b32 %r34, %r33, 8, 31, -1; +$L__tmp5: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + add.f32 %r35, %r33, %r34; +$L__tmp6: + .loc 2 293 36 // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] + shfl.sync.bfly.b32 %r36, %r35, 4, 31, -1; +$L__tmp7: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + add.f32 %r37, %r35, %r36; +$L__tmp8: + .loc 2 293 36 // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] + shfl.sync.bfly.b32 %r38, %r37, 2, 31, -1; +$L__tmp9: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + add.f32 %r39, %r37, %r38; +$L__tmp10: + .loc 2 293 36 // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] + shfl.sync.bfly.b32 %r40, %r39, 1, 31, -1; +$L__tmp11: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + add.f32 %r41, %r39, %r40; +$L__tmp12: + .loc 1 44 28 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:28 + shr.u32 %r42, %r8, 3; + mov.b32 %r43, global_smem; + add.s32 %r44, %r43, %r42; + st.shared.b32 [%r44], %r41; + bar.sync 0; + shl.b32 %r45, %r10, 2; + add.s32 %r46, %r43, %r45; + ld.shared.b32 %r4, [%r46]; + .loc 1 45 25 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:25 + mad.wide.s32 %rd3, %r12, 4, %rd5; + .loc 1 45 36 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:36 + and.b32 %r47, %r7, 248; + setp.eq.b32 %p2, %r47, 0; + // begin inline asm + @%p2 st.global.b32 [ %rd3 + 0 ], { %r4 }; + // end inline asm + .loc 1 45 4 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:4 + ret; +$L__tmp13: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 339 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x14c DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 113 +.b8 105 +.b8 116 +.b8 120 +.b8 53 +.b8 104 +.b8 119 +.b8 117 +.b8 112 +.b8 107 +.b8 98 +.b8 106 +.b8 109 +.b8 99 +.b8 115 +.b8 111 +.b8 121 +.b8 107 +.b8 113 +.b8 101 +.b8 112 +.b8 122 +.b8 113 +.b8 99 +.b8 55 +.b8 122 +.b8 99 +.b8 120 +.b8 106 +.b8 99 +.b8 98 +.b8 53 +.b8 97 +.b8 99 +.b8 113 +.b8 107 +.b8 105 +.b8 55 +.b8 122 +.b8 99 +.b8 115 +.b8 106 +.b8 105 +.b8 102 +.b8 114 +.b8 110 +.b8 114 +.b8 122 +.b8 99 +.b8 114 +.b8 114 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 113 +.b8 105 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x2a DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 114 +.b8 109 +.b8 115 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x10e:0x48 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x123:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 44 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x13b:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.source b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.source new file mode 100644 index 0000000000000000000000000000000000000000..ff7cf9f915b20d3434f66ead436e51762c5b04b2 --- /dev/null +++ b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.source @@ -0,0 +1,167 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0) +#loc35 = loc(unknown) +#loc38 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0) +#loc42 = loc("in_ptr0"(#loc)) +#loc43 = loc("out_ptr0"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc74 = loc("input"(#loc33)) +#loc75 = loc("a"(#loc38)) +#loc76 = loc("b"(#loc38)) +module { + tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 65536 : i32 loc(#loc46) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc47) + %xoffset = tt.get_program_id x : i32 loc(#loc48) + %xoffset_2 = arith.constant 8 : i32 loc(#loc49) + %xoffset_3 = arith.constant 8 : i32 loc(#loc49) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc49) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc50) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc51) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<8x1xi32> loc(#loc52) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<8x1xi32> loc(#loc52) + %xmask = arith.constant true loc(#loc53) + %xmask_8 = arith.constant dense : tensor<8x128xi1> loc(#loc53) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc54) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc55) + %x0 = arith.constant 32 : i32 loc(#loc56) + %x0_10 = arith.constant 32 : i32 loc(#loc56) + %x0_11 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc56) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<8x1xi32> loc(#loc56) + %x1 = arith.constant 32 : i32 loc(#loc57) + %x1_13 = arith.constant 32 : i32 loc(#loc57) + %x1_14 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc57) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<8x1xi32> loc(#loc57) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc58) + %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc58) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c128_i32 = arith.constant 128 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_19 = %_tmp4_16) -> (tensor<8x128xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc60) + %r0_index_20 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc60) + %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc61) + %r0_mask_21 = arith.cmpi slt, %r0_index_20, %r0_mask : tensor<1x128xi32> loc(#loc61) + %tmp0 = arith.constant 128 : i32 loc(#loc62) + %tmp0_22 = arith.constant 128 : i32 loc(#loc62) + %tmp0_23 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc62) + %tmp0_24 = arith.muli %tmp0_23, %x0_12 : tensor<8x1xi32> loc(#loc62) + %tmp0_25 = tt.broadcast %r0_index_20 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc63) + %tmp0_26 = tt.broadcast %tmp0_24 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc63) + %tmp0_27 = arith.addi %tmp0_25, %tmp0_26 : tensor<8x128xi32> loc(#loc63) + %tmp0_28 = arith.constant 12288 : i32 loc(#loc64) + %tmp0_29 = arith.constant 12288 : i32 loc(#loc64) + %tmp0_30 = arith.constant dense<12288> : tensor<8x1xi32> loc(#loc64) + %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<8x1xi32> loc(#loc64) + %tmp0_32 = tt.broadcast %tmp0_31 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc65) + %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<8x128xi32> loc(#loc65) + %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc66) + %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc66) + %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc67) + %tmp0_37 = tt.broadcast %r0_mask_21 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc67) + %tmp0_38 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc67) + %tmp0_39 = arith.truncf %tmp0_38 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc67) + %tmp0_40 = tt.load %tmp0_35, %tmp0_37, %tmp0_39 evictionPolicy = evict_first : tensor<8x128x!tt.ptr> loc(#loc67) + %tmp0_41 = arith.extf %tmp0_40 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc68) + %tmp2 = arith.mulf %tmp0_41, %tmp0_41 : tensor<8x128xf32> loc(#loc69) + %tmp5 = arith.addf %_tmp4_19, %tmp2 : tensor<8x128xf32> loc(#loc70) + %_tmp4_42 = tt.broadcast %r0_mask_21 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc71) + %_tmp4_43 = arith.select %_tmp4_42, %tmp5, %_tmp4_19 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc71) + scf.yield %_tmp4_43 : tensor<8x128xf32> loc(#loc27) + } loc(#loc59) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S8_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc72) + %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc73) + %4 = tt.splat %out_ptr0 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc30) + %5 = tt.addptr %4, %xindex_7 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc30) + tt.store %5, %tmp4_18 : tensor<8x1x!tt.ptr> loc(#loc31) + tt.return loc(#loc32) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S8_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x128xf32> loc("input"(#loc33))) -> tensor<8xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc34) + tt.reduce.return %2 : f32 loc(#loc34) + }) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc34) + tt.return %0 : tensor<8xf32> loc(#loc36) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8xf32> loc(#loc37) + tt.return %1 : tensor<8xf32> loc(#loc37) + } loc(#loc33) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc38)), %b: f32 loc("b"(#loc38))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc39) + tt.return %0 : f32 loc(#loc40) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc41) + tt.return %1 : f32 loc(#loc41) + } loc(#loc38) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":25:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":30:43) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:8) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11) +#loc37 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4) +#loc39 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc40 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11) +#loc41 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4) +#loc46 = loc("xnumel"(#loc1)) +#loc47 = loc("r0_numel"(#loc2)) +#loc48 = loc("xoffset"(#loc3)) +#loc49 = loc("xoffset"(#loc4)) +#loc50 = loc("xindex"(#loc5)) +#loc51 = loc("xindex"(#loc6)) +#loc52 = loc("xindex"(#loc7)) +#loc53 = loc("xmask"(#loc8)) +#loc54 = loc("r0_base"(#loc9)) +#loc55 = loc("r0_base"(#loc10)) +#loc56 = loc("x0"(#loc11)) +#loc57 = loc("x1"(#loc12)) +#loc58 = loc("_tmp4"(#loc13)) +#loc59 = loc("_tmp4"(#loc14)) +#loc60 = loc("r0_index"(#loc15)) +#loc61 = loc("r0_mask"(#loc16)) +#loc62 = loc("tmp0"(#loc17)) +#loc63 = loc("tmp0"(#loc18)) +#loc64 = loc("tmp0"(#loc19)) +#loc65 = loc("tmp0"(#loc20)) +#loc66 = loc("tmp0"(#loc21)) +#loc67 = loc("tmp0"(#loc22)) +#loc68 = loc("tmp0"(#loc23)) +#loc69 = loc("tmp2"(#loc24)) +#loc70 = loc("tmp5"(#loc25)) +#loc71 = loc("_tmp4"(#loc26)) +#loc72 = loc("tmp4"(#loc28)) +#loc73 = loc("tmp4"(#loc29)) diff --git a/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.ttgir b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..1a04917e573a412ea65a59022bd735e3a8366458 --- /dev/null +++ b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.ttgir @@ -0,0 +1,108 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 8], order = [0, 1]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0) +#loc1 = loc(unknown) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25) +#loc27 = loc("in_ptr0"(#loc)) +#loc28 = loc("out_ptr0"(#loc)) +#loc29 = loc("xnumel"(#loc)) +#loc30 = loc("r0_numel"(#loc)) +#loc49 = loc("tmp4"(#loc21)) +#loc52 = loc(callsite(#loc1 at #loc49)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<12288> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<32> : tensor<8x1xi32, #blocked> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<8x128xbf16, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<8x128xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc31) + %xoffset_5 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc32) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc33) + %xindex_6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc33) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc33) + %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xi32, #blocked1> loc(#loc33) + %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<8x1xi32, #blocked> loc(#loc34) + %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<8x1xi32, #blocked1> loc(#loc34) + %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<8x1xi32, #blocked> loc(#loc34) + %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<8x1xi32, #blocked1> loc(#loc34) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc35) + %r0_base_13 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc35) + %x0 = arith.remsi %xindex_11, %cst_2 : tensor<8x1xi32, #blocked> loc(#loc36) + %x1 = arith.divsi %xindex_11, %cst_2 : tensor<8x1xi32, #blocked> loc(#loc37) + %r0_mask = arith.cmpi slt, %r0_base_13, %cst : tensor<1x128xi32, #blocked> loc(#loc38) + %tmp0 = arith.muli %x0, %cst_0 : tensor<8x1xi32, #blocked> loc(#loc39) + %tmp0_14 = tt.broadcast %r0_base_13 : tensor<1x128xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc40) + %tmp0_15 = tt.broadcast %tmp0 : tensor<8x1xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc40) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<8x128xi32, #blocked> loc(#loc40) + %tmp0_17 = arith.muli %x1, %cst_1 : tensor<8x1xi32, #blocked> loc(#loc41) + %tmp0_18 = tt.broadcast %tmp0_17 : tensor<8x1xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc42) + %tmp0_19 = arith.addi %tmp0_16, %tmp0_18 : tensor<8x128xi32, #blocked> loc(#loc42) + %tmp0_20 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr, #blocked> loc(#loc43) + %tmp0_21 = tt.addptr %tmp0_20, %tmp0_19 : tensor<8x128x!tt.ptr, #blocked>, tensor<8x128xi32, #blocked> loc(#loc43) + %tmp0_22 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked> -> tensor<8x128xi1, #blocked> loc(#loc44) + %tmp0_23 = tt.load %tmp0_21, %tmp0_22, %cst_3 evictionPolicy = evict_first : tensor<8x128x!tt.ptr, #blocked> loc(#loc44) + %tmp0_24 = arith.extf %tmp0_23 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc45) + %tmp2 = arith.mulf %tmp0_24, %tmp0_24 : tensor<8x128xf32, #blocked> loc(#loc46) + %tmp5 = arith.addf %tmp2, %cst_4 : tensor<8x128xf32, #blocked> loc(#loc47) + %_tmp4 = arith.select %tmp0_22, %tmp5, %cst_4 : tensor<8x128xi1, #blocked>, tensor<8x128xf32, #blocked> loc(#loc48) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_27: f32 loc(callsite(#loc1 at #loc49)), %tmp4_28: f32 loc(callsite(#loc1 at #loc49))): + %tmp4_29 = arith.addf %tmp4_27, %tmp4_28 : f32 loc(#loc53) + tt.reduce.return %tmp4_29 : f32 loc(#loc51) + }) : (tensor<8x128xf32, #blocked>) -> tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc51) + %tmp4_25 = ttg.convert_layout %tmp4 : tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc50) + %tmp4_26 = tt.expand_dims %tmp4_25 {axis = 1 : i32} : tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xf32, #blocked1> loc(#loc50) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<8x1x!tt.ptr, #blocked1> loc(#loc24) + %1 = tt.addptr %0, %xindex_12 : tensor<8x1x!tt.ptr, #blocked1>, tensor<8x1xi32, #blocked1> loc(#loc24) + tt.store %1, %tmp4_26 : tensor<8x1x!tt.ptr, #blocked1> loc(#loc25) + tt.return loc(#loc26) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4) +#loc31 = loc("xoffset"(#loc2)) +#loc32 = loc("xoffset"(#loc3)) +#loc33 = loc("xindex"(#loc4)) +#loc34 = loc("xindex"(#loc5)) +#loc35 = loc("r0_base"(#loc6)) +#loc36 = loc("x0"(#loc7)) +#loc37 = loc("x1"(#loc8)) +#loc38 = loc("r0_mask"(#loc9)) +#loc39 = loc("tmp0"(#loc10)) +#loc40 = loc("tmp0"(#loc11)) +#loc41 = loc("tmp0"(#loc12)) +#loc42 = loc("tmp0"(#loc13)) +#loc43 = loc("tmp0"(#loc14)) +#loc44 = loc("tmp0"(#loc15)) +#loc45 = loc("tmp0"(#loc16)) +#loc46 = loc("tmp2"(#loc17)) +#loc47 = loc("tmp5"(#loc18)) +#loc48 = loc("_tmp4"(#loc19)) +#loc50 = loc("tmp4"(#loc23)) +#loc51 = loc(callsite(#loc20 at #loc49)) +#loc53 = loc(callsite(#loc22 at #loc51)) diff --git a/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.ttir b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..1adeb6b9b8cc2ffcd18ba5d1da5475760b71c0fd --- /dev/null +++ b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.ttir @@ -0,0 +1,105 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0) +#loc2 = loc(unknown) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25) +#loc29 = loc("in_ptr0"(#loc)) +#loc30 = loc("out_ptr0"(#loc)) +#loc31 = loc("xnumel"(#loc)) +#loc32 = loc("r0_numel"(#loc)) +#loc53 = loc("tmp4"(#loc23)) +#loc56 = loc(callsite(#loc2 at #loc53)) +module { + tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %tmp0 = arith.constant dense<0.000000e+00> : tensor<8x128xbf16> loc(#loc33) + %cst = arith.constant dense<12288> : tensor<8x1xi32> loc(#loc2) + %cst_0 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc2) + %cst_1 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc2) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc2) + %cst_3 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc2) + %c8_i32 = arith.constant 8 : i32 loc(#loc2) + %xoffset = tt.get_program_id x : i32 loc(#loc34) + %xoffset_4 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc35) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc36) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc37) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<8x1xi32> loc(#loc38) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<8x1xi32> loc(#loc38) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc39) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc40) + %x0 = arith.remsi %xindex_7, %cst_3 : tensor<8x1xi32> loc(#loc41) + %x1 = arith.divsi %xindex_7, %cst_3 : tensor<8x1xi32> loc(#loc42) + %r0_mask = arith.cmpi slt, %r0_base_8, %cst_1 : tensor<1x128xi32> loc(#loc43) + %tmp0_9 = arith.muli %x0, %cst_0 : tensor<8x1xi32> loc(#loc44) + %tmp0_10 = tt.broadcast %r0_base_8 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc45) + %tmp0_11 = tt.broadcast %tmp0_9 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc45) + %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<8x128xi32> loc(#loc45) + %tmp0_13 = arith.muli %x1, %cst : tensor<8x1xi32> loc(#loc46) + %tmp0_14 = tt.broadcast %tmp0_13 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc47) + %tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<8x128xi32> loc(#loc47) + %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc48) + %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc48) + %tmp0_18 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc33) + %tmp0_19 = tt.load %tmp0_17, %tmp0_18, %tmp0 evictionPolicy = evict_first : tensor<8x128x!tt.ptr> loc(#loc33) + %tmp0_20 = arith.extf %tmp0_19 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc49) + %tmp2 = arith.mulf %tmp0_20, %tmp0_20 : tensor<8x128xf32> loc(#loc50) + %tmp5 = arith.addf %tmp2, %cst_2 : tensor<8x128xf32> loc(#loc51) + %_tmp4 = arith.select %tmp0_18, %tmp5, %cst_2 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc52) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_22: f32 loc(callsite(#loc2 at #loc53)), %tmp4_23: f32 loc(callsite(#loc2 at #loc53))): + %tmp4_24 = arith.addf %tmp4_22, %tmp4_23 : f32 loc(#loc57) + tt.reduce.return %tmp4_24 : f32 loc(#loc55) + }) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc55) + %tmp4_21 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc54) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc26) + %1 = tt.addptr %0, %xindex_7 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc26) + tt.store %1, %tmp4_21 : tensor<8x1x!tt.ptr> loc(#loc27) + tt.return loc(#loc28) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:27) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4) +#loc33 = loc("tmp0"(#loc1)) +#loc34 = loc("xoffset"(#loc3)) +#loc35 = loc("xoffset"(#loc4)) +#loc36 = loc("xindex"(#loc5)) +#loc37 = loc("xindex"(#loc6)) +#loc38 = loc("xindex"(#loc7)) +#loc39 = loc("r0_base"(#loc8)) +#loc40 = loc("r0_base"(#loc9)) +#loc41 = loc("x0"(#loc10)) +#loc42 = loc("x1"(#loc11)) +#loc43 = loc("r0_mask"(#loc12)) +#loc44 = loc("tmp0"(#loc13)) +#loc45 = loc("tmp0"(#loc14)) +#loc46 = loc("tmp0"(#loc15)) +#loc47 = loc("tmp0"(#loc16)) +#loc48 = loc("tmp0"(#loc17)) +#loc49 = loc("tmp0"(#loc18)) +#loc50 = loc("tmp2"(#loc19)) +#loc51 = loc("tmp5"(#loc20)) +#loc52 = loc("_tmp4"(#loc21)) +#loc54 = loc("tmp4"(#loc25)) +#loc55 = loc(callsite(#loc22 at #loc53)) +#loc57 = loc(callsite(#loc24 at #loc55)) diff --git a/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/__grp__triton_red_fused__fused_rms_norm_view_0.json b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/__grp__triton_red_fused__fused_rms_norm_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3d88bd0450098b079e90ecc9b3951d5c6a5e4d5e --- /dev/null +++ b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/__grp__triton_red_fused__fused_rms_norm_view_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__fused_rms_norm_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.source", "triton_red_fused__fused_rms_norm_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.ttir", "triton_red_fused__fused_rms_norm_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.ttgir", "triton_red_fused__fused_rms_norm_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.llir", "triton_red_fused__fused_rms_norm_view_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.ptx", "triton_red_fused__fused_rms_norm_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.cubin", "triton_red_fused__fused_rms_norm_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.json"}} \ No newline at end of file diff --git a/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.cubin b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..b723a7e5a84d80ffa8521c60de48e498cabb7cc8 Binary files /dev/null and b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.cubin differ diff --git a/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.json b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c2493755f701964bbb1485f4f3da23a47155b0d8 --- /dev/null +++ b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.json @@ -0,0 +1 @@ +{"hash": "148596f4fe2c8bab1adec0d1740cced2c5152fc51d191c7e9c9a8f38a0c27f30", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 64, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm_view_0"} \ No newline at end of file diff --git a/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.llir b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..3758792f084c13163ef06cef2e8c29759a22293a --- /dev/null +++ b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.llir @@ -0,0 +1,167 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__fused_rms_norm_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = shl i32 %7, 4, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = and i32 %9, 120, !dbg !9 + %11 = lshr exact i32 %10, 3, !dbg !9 + %12 = or disjoint i32 %11, %8, !dbg !10 + %13 = shl nuw nsw i32 %9, 2, !dbg !11 + %14 = and i32 %13, 28, !dbg !11 + %15 = sdiv i32 %12, 32, !dbg !12 + %16 = mul i32 %15, 32, !dbg !13 + %.decomposed = sub i32 %12, %16, !dbg !13 + %17 = shl nsw i32 %.decomposed, 7, !dbg !14 + %18 = mul i32 %15, 12288, !dbg !15 + %19 = or disjoint i32 %17, %14 + %20 = add i32 %19, %18 + %21 = sext i32 %20 to i64, !dbg !16 + %22 = getelementptr bfloat, ptr addrspace(1) %0, i64 %21, !dbg !16 + %23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %24 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %22, i64 %23, i1 true) #4, !dbg !17 + %25 = extractvalue { i32, i32 } %24, 0, !dbg !17 + %26 = bitcast i32 %25 to <2 x bfloat>, !dbg !17 + %27 = extractvalue { i32, i32 } %24, 1, !dbg !17 + %28 = bitcast i32 %27 to <2 x bfloat>, !dbg !17 + %29 = sext i32 %20 to i64, !dbg !16 + %30 = getelementptr bfloat, ptr addrspace(1) %0, i64 %29, !dbg !16 + %31 = getelementptr i8, ptr addrspace(1) %30, i64 64, !dbg !16 + %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %33 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %31, i64 %32, i1 true) #4, !dbg !17 + %34 = extractvalue { i32, i32 } %33, 0, !dbg !17 + %35 = bitcast i32 %34 to <2 x bfloat>, !dbg !17 + %36 = extractvalue { i32, i32 } %33, 1, !dbg !17 + %37 = bitcast i32 %36 to <2 x bfloat>, !dbg !17 + %38 = sext i32 %20 to i64, !dbg !16 + %39 = getelementptr bfloat, ptr addrspace(1) %0, i64 %38, !dbg !16 + %40 = getelementptr i8, ptr addrspace(1) %39, i64 128, !dbg !16 + %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %42 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %40, i64 %41, i1 true) #4, !dbg !17 + %43 = extractvalue { i32, i32 } %42, 0, !dbg !17 + %44 = bitcast i32 %43 to <2 x bfloat>, !dbg !17 + %45 = extractvalue { i32, i32 } %42, 1, !dbg !17 + %46 = bitcast i32 %45 to <2 x bfloat>, !dbg !17 + %47 = sext i32 %20 to i64, !dbg !16 + %48 = getelementptr bfloat, ptr addrspace(1) %0, i64 %47, !dbg !16 + %49 = getelementptr i8, ptr addrspace(1) %48, i64 192, !dbg !16 + %50 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %51 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %49, i64 %50, i1 true) #4, !dbg !17 + %52 = extractvalue { i32, i32 } %51, 0, !dbg !17 + %53 = bitcast i32 %52 to <2 x bfloat>, !dbg !17 + %54 = extractvalue { i32, i32 } %51, 1, !dbg !17 + %55 = bitcast i32 %54 to <2 x bfloat>, !dbg !17 + %56 = fpext <2 x bfloat> %26 to <2 x float>, !dbg !18 + %57 = fmul <2 x float> %56, %56, !dbg !19 + %58 = fpext <2 x bfloat> %35 to <2 x float>, !dbg !18 + %59 = fmul <2 x float> %58, %58, !dbg !19 + %60 = fadd <2 x float> %57, %59, !dbg !20 + %61 = fpext <2 x bfloat> %44 to <2 x float>, !dbg !18 + %62 = fmul <2 x float> %61, %61, !dbg !19 + %63 = fadd <2 x float> %60, %62, !dbg !20 + %64 = fpext <2 x bfloat> %53 to <2 x float>, !dbg !18 + %65 = fmul <2 x float> %64, %64, !dbg !19 + %66 = fadd <2 x float> %63, %65, !dbg !20 + %67 = fpext <2 x bfloat> %28 to <2 x float>, !dbg !18 + %68 = fmul <2 x float> %67, %67, !dbg !19 + %69 = fpext <2 x bfloat> %37 to <2 x float>, !dbg !18 + %70 = fmul <2 x float> %69, %69, !dbg !19 + %71 = fadd <2 x float> %68, %70, !dbg !20 + %72 = fpext <2 x bfloat> %46 to <2 x float>, !dbg !18 + %73 = fmul <2 x float> %72, %72, !dbg !19 + %74 = fadd <2 x float> %71, %73, !dbg !20 + %75 = fpext <2 x bfloat> %55 to <2 x float>, !dbg !18 + %76 = fmul <2 x float> %75, %75, !dbg !19 + %77 = fadd <2 x float> %74, %76, !dbg !20 + %78 = and i32 %9, 15, !dbg !9 + %79 = or disjoint i32 %8, %78, !dbg !10 + %shift = shufflevector <2 x float> %66, <2 x float> poison, <2 x i32> , !dbg !21 + %foldExtExtBinop = fadd <2 x float> %66, %shift, !dbg !21 + %foldExtExtBinop5 = fadd <2 x float> %77, %foldExtExtBinop, !dbg !21 + %shift7 = shufflevector <2 x float> %77, <2 x float> poison, <2 x i32> , !dbg !21 + %foldExtExtBinop8 = fadd <2 x float> %shift7, %foldExtExtBinop5, !dbg !21 + %80 = extractelement <2 x float> %foldExtExtBinop8, i64 0, !dbg !21 + %81 = bitcast float %80 to i32, !dbg !24 + %82 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %81, i32 4, i32 31), !dbg !24 + %83 = bitcast i32 %82 to float, !dbg !24 + %84 = fadd float %80, %83, !dbg !21 + %85 = bitcast float %84 to i32, !dbg !24 + %86 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %85, i32 2, i32 31), !dbg !24 + %87 = bitcast i32 %86 to float, !dbg !24 + %88 = fadd float %84, %87, !dbg !21 + %89 = bitcast float %88 to i32, !dbg !24 + %90 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %89, i32 1, i32 31), !dbg !24 + %91 = bitcast i32 %90 to float, !dbg !24 + %92 = fadd float %88, %91, !dbg !21 + %93 = lshr exact i32 %10, 1, !dbg !27 + %94 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %93, !dbg !27 + store float %92, ptr addrspace(3) %94, align 4, !dbg !27 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !27 + %95 = shl nuw nsw i32 %78, 2, !dbg !27 + %96 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %95, !dbg !27 + %97 = load i32, ptr addrspace(3) %96, align 4, !dbg !27 + %98 = sext i32 %79 to i64, !dbg !28 + %99 = getelementptr float, ptr addrspace(1) %1, i64 %98, !dbg !28 + %100 = and i32 %9, 112, !dbg !29 + %101 = icmp eq i32 %100, 0, !dbg !29 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %97, ptr addrspace(1) %99, i1 %101) #4, !dbg !29 + ret void, !dbg !30 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm_view_0", linkageName: "triton_red_fused__fused_rms_norm_view_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 19, scope: !4) +!13 = !DILocation(line: 28, column: 19, scope: !4) +!14 = !DILocation(line: 38, column: 45, scope: !4) +!15 = !DILocation(line: 38, column: 56, scope: !4) +!16 = !DILocation(line: 38, column: 34, scope: !4) +!17 = !DILocation(line: 38, column: 61, scope: !4) +!18 = !DILocation(line: 38, column: 115, scope: !4) +!19 = !DILocation(line: 40, column: 22, scope: !4) +!20 = !DILocation(line: 42, column: 23, scope: !4) +!21 = !DILocation(line: 263, column: 15, scope: !22, inlinedAt: !24) +!22 = distinct !DILexicalBlockFile(scope: !4, file: !23, discriminator: 0) +!23 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!24 = !DILocation(line: 293, column: 36, scope: !22, inlinedAt: !25) +!25 = !DILocation(line: 44, column: 25, scope: !26) +!26 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0) +!27 = !DILocation(line: 44, column: 28, scope: !4) +!28 = !DILocation(line: 45, column: 25, scope: !4) +!29 = !DILocation(line: 45, column: 36, scope: !4) +!30 = !DILocation(line: 45, column: 4, scope: !4) diff --git a/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.ptx b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..7df8aa7762d21a7b9423be449161b2124bc9de79 --- /dev/null +++ b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.ptx @@ -0,0 +1,575 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused__fused_rms_norm_view_0 // -- Begin function triton_red_fused__fused_rms_norm_view_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__fused_rms_norm_view_0 +.visible .entry triton_red_fused__fused_rms_norm_view_0( + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_1, + .param .u32 triton_red_fused__fused_rms_norm_view_0_param_2, + .param .u32 triton_red_fused__fused_rms_norm_view_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_5 +) +.reqntid 128 +{ + .reg .pred %p<3>; + .reg .b16 %rs<17>; + .reg .b32 %r<77>; + .reg .b64 %rd<12>; + .loc 1 18 0 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:18:0 + +// %bb.0: + ld.param.b64 %rd10, [triton_red_fused__fused_rms_norm_view_0_param_0]; + ld.param.b64 %rd11, [triton_red_fused__fused_rms_norm_view_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:23:28 + mov.u32 %r11, %ctaid.x; + .loc 1 23 33 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:23:33 + shl.b32 %r12, %r11, 4; + .loc 1 24 44 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:44 + mov.u32 %r13, %tid.x; + and.b32 %r14, %r13, 120; + bfe.u32 %r15, %r13, 3, 4; + .loc 1 24 23 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:23 + or.b32 %r16, %r15, %r12; + .loc 1 26 37 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:26:37 + shl.b32 %r17, %r13, 2; + and.b32 %r18, %r17, 28; + .loc 1 29 19 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:29:19 + bfe.s32 %r19, %r11, 27, 1; + shr.u32 %r20, %r19, 27; + add.s32 %r21, %r16, %r20; + shr.u32 %r22, %r21, 5; + .loc 1 28 19 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:28:19 + and.b32 %r23, %r21, 33554400; + sub.s32 %r24, %r16, %r23; + .loc 1 38 45 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:45 + shl.b32 %r25, %r24, 7; + or.b32 %r26, %r25, %r18; + mad.lo.s32 %r27, %r22, 12288, %r26; + .loc 1 38 34 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:34 + mad.wide.s32 %rd1, %r27, 2, %rd10; + .loc 1 38 61 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:61 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r3, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r1, %r3; + mov.u32 %r2, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2; + // end inline asm + .loc 1 38 34 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:34 + add.s64 %rd3, %rd1, 64; + .loc 1 38 61 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:61 + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd4, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r4, %r3; + mov.u32 %r5, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r4, %r5 }, [ %rd3 + 0 ], %rd4; + // end inline asm + .loc 1 38 34 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:34 + add.s64 %rd5, %rd1, 128; + .loc 1 38 61 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:61 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r6, %r3; + mov.u32 %r7, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r6, %r7 }, [ %rd5 + 0 ], %rd6; + // end inline asm + .loc 1 38 34 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:34 + add.s64 %rd7, %rd1, 192; + .loc 1 38 61 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:61 + // begin inline asm + mov.u64 %rd8, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd8, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r8, %r3; + mov.u32 %r9, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r8, %r9 }, [ %rd7 + 0 ], %rd8; + // end inline asm + .loc 1 38 115 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r28, %rs1; + cvt.f32.bf16 %r29, %rs2; + mov.b32 {%rs3, %rs4}, %r4; + cvt.f32.bf16 %r30, %rs4; + cvt.f32.bf16 %r31, %rs3; + .loc 1 40 22 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:40:22 + mul.f32 %r32, %r31, %r31; + mul.f32 %r33, %r30, %r30; + .loc 1 42 23 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:42:23 + fma.rn.f32 %r34, %r29, %r29, %r33; + fma.rn.f32 %r35, %r28, %r28, %r32; + .loc 1 38 115 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115 + mov.b32 {%rs5, %rs6}, %r6; + cvt.f32.bf16 %r36, %rs6; + cvt.f32.bf16 %r37, %rs5; + .loc 1 42 23 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:42:23 + fma.rn.f32 %r38, %r37, %r37, %r35; + fma.rn.f32 %r39, %r36, %r36, %r34; + .loc 1 38 115 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115 + mov.b32 {%rs7, %rs8}, %r8; + cvt.f32.bf16 %r40, %rs7; + cvt.f32.bf16 %r41, %rs8; + .loc 1 42 23 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:42:23 + fma.rn.f32 %r42, %r41, %r41, %r39; + fma.rn.f32 %r43, %r40, %r40, %r38; + .loc 1 38 115 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115 + mov.b32 {%rs9, %rs10}, %r2; + cvt.f32.bf16 %r44, %rs9; + cvt.f32.bf16 %r45, %rs10; + mov.b32 {%rs11, %rs12}, %r5; + cvt.f32.bf16 %r46, %rs12; + cvt.f32.bf16 %r47, %rs11; + .loc 1 40 22 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:40:22 + mul.f32 %r48, %r47, %r47; + mul.f32 %r49, %r46, %r46; + .loc 1 42 23 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:42:23 + fma.rn.f32 %r50, %r45, %r45, %r49; + fma.rn.f32 %r51, %r44, %r44, %r48; + .loc 1 38 115 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115 + mov.b32 {%rs13, %rs14}, %r7; + cvt.f32.bf16 %r52, %rs14; + cvt.f32.bf16 %r53, %rs13; + .loc 1 42 23 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:42:23 + fma.rn.f32 %r54, %r53, %r53, %r51; + fma.rn.f32 %r55, %r52, %r52, %r50; + .loc 1 38 115 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115 + mov.b32 {%rs15, %rs16}, %r9; + cvt.f32.bf16 %r56, %rs15; + cvt.f32.bf16 %r57, %rs16; + .loc 1 42 23 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:42:23 + fma.rn.f32 %r58, %r57, %r57, %r55; + fma.rn.f32 %r59, %r56, %r56, %r54; + .loc 1 24 44 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:44 + and.b32 %r60, %r13, 15; + .loc 1 24 23 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:23 + or.b32 %r61, %r12, %r60; +$L__tmp1: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + add.f32 %r62, %r43, %r42; + add.f32 %r63, %r59, %r62; + add.f32 %r64, %r58, %r63; +$L__tmp2: + .loc 2 293 36 // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] + shfl.sync.bfly.b32 %r65, %r64, 4, 31, -1; +$L__tmp3: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + add.f32 %r66, %r64, %r65; +$L__tmp4: + .loc 2 293 36 // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] + shfl.sync.bfly.b32 %r67, %r66, 2, 31, -1; +$L__tmp5: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + add.f32 %r68, %r66, %r67; +$L__tmp6: + .loc 2 293 36 // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] + shfl.sync.bfly.b32 %r69, %r68, 1, 31, -1; +$L__tmp7: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + add.f32 %r70, %r68, %r69; +$L__tmp8: + .loc 1 44 28 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:28 + shr.u32 %r71, %r14, 1; + mov.b32 %r72, global_smem; + add.s32 %r73, %r72, %r71; + st.shared.b32 [%r73], %r70; + bar.sync 0; + shl.b32 %r74, %r60, 2; + add.s32 %r75, %r72, %r74; + ld.shared.b32 %r10, [%r75]; + .loc 1 45 25 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:25 + mad.wide.s32 %rd9, %r61, 4, %rd11; + .loc 1 45 36 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:36 + and.b32 %r76, %r13, 112; + setp.eq.b32 %p2, %r76, 0; + // begin inline asm + @%p2 st.global.b32 [ %rd9 + 0 ], { %r10 }; + // end inline asm + .loc 1 45 4 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:4 + ret; +$L__tmp9: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 339 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x14c DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 119 +.b8 118 +.b8 121 +.b8 116 +.b8 52 +.b8 50 +.b8 55 +.b8 51 +.b8 105 +.b8 117 +.b8 51 +.b8 51 +.b8 109 +.b8 112 +.b8 101 +.b8 101 +.b8 55 +.b8 104 +.b8 98 +.b8 101 +.b8 116 +.b8 53 +.b8 106 +.b8 53 +.b8 101 +.b8 113 +.b8 52 +.b8 52 +.b8 100 +.b8 54 +.b8 102 +.b8 115 +.b8 104 +.b8 103 +.b8 119 +.b8 107 +.b8 121 +.b8 120 +.b8 107 +.b8 110 +.b8 53 +.b8 50 +.b8 103 +.b8 103 +.b8 103 +.b8 107 +.b8 105 +.b8 113 +.b8 104 +.b8 106 +.b8 53 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 119 +.b8 118 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x2a DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 114 +.b8 109 +.b8 115 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x10e:0x48 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x123:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 44 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x13b:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.source b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.source new file mode 100644 index 0000000000000000000000000000000000000000..7d4df7810576bc882ca137edfa01aaa181302c79 --- /dev/null +++ b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.source @@ -0,0 +1,167 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0) +#loc35 = loc(unknown) +#loc38 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0) +#loc42 = loc("in_ptr0"(#loc)) +#loc43 = loc("out_ptr0"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc74 = loc("input"(#loc33)) +#loc75 = loc("a"(#loc38)) +#loc76 = loc("b"(#loc38)) +module { + tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 8192 : i32 loc(#loc46) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc47) + %xoffset = tt.get_program_id x : i32 loc(#loc48) + %xoffset_2 = arith.constant 16 : i32 loc(#loc49) + %xoffset_3 = arith.constant 16 : i32 loc(#loc49) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc49) + %xindex = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc50) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<16xi32> -> tensor<16x1xi32> loc(#loc51) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<16x1xi32> loc(#loc52) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<16x1xi32> loc(#loc52) + %xmask = arith.constant true loc(#loc53) + %xmask_8 = arith.constant dense : tensor<16x32xi1> loc(#loc53) + %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc54) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc55) + %x0 = arith.constant 32 : i32 loc(#loc56) + %x0_10 = arith.constant 32 : i32 loc(#loc56) + %x0_11 = arith.constant dense<32> : tensor<16x1xi32> loc(#loc56) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<16x1xi32> loc(#loc56) + %x1 = arith.constant 32 : i32 loc(#loc57) + %x1_13 = arith.constant 32 : i32 loc(#loc57) + %x1_14 = arith.constant dense<32> : tensor<16x1xi32> loc(#loc57) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<16x1xi32> loc(#loc57) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc58) + %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<16x32xf32> loc(#loc58) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c32_i32 = arith.constant 32 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c32_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_19 = %_tmp4_16) -> (tensor<16x32xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc60) + %r0_index_20 = arith.addi %r0_index, %r0_base_9 : tensor<1x32xi32> loc(#loc60) + %r0_mask = arith.constant dense<128> : tensor<1x32xi32> loc(#loc61) + %r0_mask_21 = arith.cmpi slt, %r0_index_20, %r0_mask : tensor<1x32xi32> loc(#loc61) + %tmp0 = arith.constant 128 : i32 loc(#loc62) + %tmp0_22 = arith.constant 128 : i32 loc(#loc62) + %tmp0_23 = arith.constant dense<128> : tensor<16x1xi32> loc(#loc62) + %tmp0_24 = arith.muli %tmp0_23, %x0_12 : tensor<16x1xi32> loc(#loc62) + %tmp0_25 = tt.broadcast %r0_index_20 : tensor<1x32xi32> -> tensor<16x32xi32> loc(#loc63) + %tmp0_26 = tt.broadcast %tmp0_24 : tensor<16x1xi32> -> tensor<16x32xi32> loc(#loc63) + %tmp0_27 = arith.addi %tmp0_25, %tmp0_26 : tensor<16x32xi32> loc(#loc63) + %tmp0_28 = arith.constant 12288 : i32 loc(#loc64) + %tmp0_29 = arith.constant 12288 : i32 loc(#loc64) + %tmp0_30 = arith.constant dense<12288> : tensor<16x1xi32> loc(#loc64) + %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<16x1xi32> loc(#loc64) + %tmp0_32 = tt.broadcast %tmp0_31 : tensor<16x1xi32> -> tensor<16x32xi32> loc(#loc65) + %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<16x32xi32> loc(#loc65) + %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr -> tensor<16x32x!tt.ptr> loc(#loc66) + %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<16x32x!tt.ptr>, tensor<16x32xi32> loc(#loc66) + %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc67) + %tmp0_37 = tt.broadcast %r0_mask_21 : tensor<1x32xi1> -> tensor<16x32xi1> loc(#loc67) + %tmp0_38 = arith.constant dense<0.000000e+00> : tensor<16x32xf32> loc(#loc67) + %tmp0_39 = arith.truncf %tmp0_38 : tensor<16x32xf32> to tensor<16x32xbf16> loc(#loc67) + %tmp0_40 = tt.load %tmp0_35, %tmp0_37, %tmp0_39 evictionPolicy = evict_first : tensor<16x32x!tt.ptr> loc(#loc67) + %tmp0_41 = arith.extf %tmp0_40 : tensor<16x32xbf16> to tensor<16x32xf32> loc(#loc68) + %tmp2 = arith.mulf %tmp0_41, %tmp0_41 : tensor<16x32xf32> loc(#loc69) + %tmp5 = arith.addf %_tmp4_19, %tmp2 : tensor<16x32xf32> loc(#loc70) + %_tmp4_42 = tt.broadcast %r0_mask_21 : tensor<1x32xi1> -> tensor<16x32xi1> loc(#loc71) + %_tmp4_43 = arith.select %_tmp4_42, %tmp5, %_tmp4_19 : tensor<16x32xi1>, tensor<16x32xf32> loc(#loc71) + scf.yield %_tmp4_43 : tensor<16x32xf32> loc(#loc27) + } loc(#loc59) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S16_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<16x32xf32>) -> tensor<16xf32> loc(#loc72) + %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<16xf32> -> tensor<16x1xf32> loc(#loc73) + %4 = tt.splat %out_ptr0 : !tt.ptr -> tensor<16x1x!tt.ptr> loc(#loc30) + %5 = tt.addptr %4, %xindex_7 : tensor<16x1x!tt.ptr>, tensor<16x1xi32> loc(#loc30) + tt.store %5, %tmp4_18 : tensor<16x1x!tt.ptr> loc(#loc31) + tt.return loc(#loc32) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S16_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<16x32xf32> loc("input"(#loc33))) -> tensor<16xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc34) + tt.reduce.return %2 : f32 loc(#loc34) + }) : (tensor<16x32xf32>) -> tensor<16xf32> loc(#loc34) + tt.return %0 : tensor<16xf32> loc(#loc36) + ^bb1: // no predecessors + %1 = ub.poison : tensor<16xf32> loc(#loc37) + tt.return %1 : tensor<16xf32> loc(#loc37) + } loc(#loc33) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc38)), %b: f32 loc("b"(#loc38))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc39) + tt.return %0 : f32 loc(#loc40) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc41) + tt.return %1 : f32 loc(#loc41) + } loc(#loc38) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":25:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":30:43) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:8) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11) +#loc37 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4) +#loc39 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc40 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11) +#loc41 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4) +#loc46 = loc("xnumel"(#loc1)) +#loc47 = loc("r0_numel"(#loc2)) +#loc48 = loc("xoffset"(#loc3)) +#loc49 = loc("xoffset"(#loc4)) +#loc50 = loc("xindex"(#loc5)) +#loc51 = loc("xindex"(#loc6)) +#loc52 = loc("xindex"(#loc7)) +#loc53 = loc("xmask"(#loc8)) +#loc54 = loc("r0_base"(#loc9)) +#loc55 = loc("r0_base"(#loc10)) +#loc56 = loc("x0"(#loc11)) +#loc57 = loc("x1"(#loc12)) +#loc58 = loc("_tmp4"(#loc13)) +#loc59 = loc("_tmp4"(#loc14)) +#loc60 = loc("r0_index"(#loc15)) +#loc61 = loc("r0_mask"(#loc16)) +#loc62 = loc("tmp0"(#loc17)) +#loc63 = loc("tmp0"(#loc18)) +#loc64 = loc("tmp0"(#loc19)) +#loc65 = loc("tmp0"(#loc20)) +#loc66 = loc("tmp0"(#loc21)) +#loc67 = loc("tmp0"(#loc22)) +#loc68 = loc("tmp0"(#loc23)) +#loc69 = loc("tmp2"(#loc24)) +#loc70 = loc("tmp5"(#loc25)) +#loc71 = loc("_tmp4"(#loc26)) +#loc72 = loc("tmp4"(#loc28)) +#loc73 = loc("tmp4"(#loc29)) diff --git a/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.ttgir b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..c9700d21ed54bc8b623b0cc4efb423bce24f12fc --- /dev/null +++ b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.ttgir @@ -0,0 +1,121 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0) +#loc1 = loc(unknown) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25) +#loc30 = loc("in_ptr0"(#loc)) +#loc31 = loc("out_ptr0"(#loc)) +#loc32 = loc("xnumel"(#loc)) +#loc33 = loc("r0_numel"(#loc)) +#loc54 = loc("tmp4"(#loc24)) +#loc57 = loc(callsite(#loc1 at #loc54)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x32xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<16x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<12288> : tensor<16x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<32> : tensor<16x1xi32, #blocked> loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<16x32xbf16, #blocked> loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<16x32xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc34) + %xoffset_5 = arith.muli %xoffset, %c16_i32 : i32 loc(#loc35) + %xindex = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc36) + %xindex_6 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc36) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi32, #blocked> loc(#loc36) + %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<16x1xi32, #blocked1> loc(#loc36) + %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<16x1xi32, #blocked> loc(#loc37) + %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<16x1xi32, #blocked1> loc(#loc37) + %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<16x1xi32, #blocked> loc(#loc37) + %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<16x1xi32, #blocked1> loc(#loc37) + %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc38) + %r0_base_13 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> loc(#loc38) + %x0 = arith.remsi %xindex_11, %cst_2 : tensor<16x1xi32, #blocked> loc(#loc39) + %x1 = arith.divsi %xindex_11, %cst_2 : tensor<16x1xi32, #blocked> loc(#loc40) + %tmp0 = arith.muli %x0, %cst_0 : tensor<16x1xi32, #blocked> loc(#loc41) + %tmp0_14 = tt.broadcast %tmp0 : tensor<16x1xi32, #blocked> -> tensor<16x32xi32, #blocked> loc(#loc42) + %tmp0_15 = arith.muli %x1, %cst_1 : tensor<16x1xi32, #blocked> loc(#loc43) + %tmp0_16 = tt.broadcast %tmp0_15 : tensor<16x1xi32, #blocked> -> tensor<16x32xi32, #blocked> loc(#loc44) + %tmp0_17 = tt.splat %in_ptr0 : !tt.ptr -> tensor<16x32x!tt.ptr, #blocked> loc(#loc45) + %_tmp4 = scf.for %_tmp4_20 = %c0_i32 to %c128_i32 step %c32_i32 iter_args(%arg5 = %cst_4) -> (tensor<16x32xf32, #blocked>) : i32 { + %r0_index = tt.splat %_tmp4_20 : i32 -> tensor<1x32xi32, #blocked> loc(#loc47) + %r0_index_21 = arith.addi %r0_index, %r0_base_13 : tensor<1x32xi32, #blocked> loc(#loc47) + %r0_mask = arith.cmpi slt, %r0_index_21, %cst : tensor<1x32xi32, #blocked> loc(#loc48) + %tmp0_22 = tt.broadcast %r0_index_21 : tensor<1x32xi32, #blocked> -> tensor<16x32xi32, #blocked> loc(#loc42) + %tmp0_23 = arith.addi %tmp0_22, %tmp0_14 : tensor<16x32xi32, #blocked> loc(#loc42) + %tmp0_24 = arith.addi %tmp0_23, %tmp0_16 : tensor<16x32xi32, #blocked> loc(#loc44) + %tmp0_25 = tt.addptr %tmp0_17, %tmp0_24 : tensor<16x32x!tt.ptr, #blocked>, tensor<16x32xi32, #blocked> loc(#loc45) + %tmp0_26 = tt.broadcast %r0_mask : tensor<1x32xi1, #blocked> -> tensor<16x32xi1, #blocked> loc(#loc49) + %tmp0_27 = tt.load %tmp0_25, %tmp0_26, %cst_3 evictionPolicy = evict_first : tensor<16x32x!tt.ptr, #blocked> loc(#loc49) + %tmp0_28 = arith.extf %tmp0_27 : tensor<16x32xbf16, #blocked> to tensor<16x32xf32, #blocked> loc(#loc50) + %tmp2 = arith.mulf %tmp0_28, %tmp0_28 : tensor<16x32xf32, #blocked> loc(#loc51) + %tmp5 = arith.addf %arg5, %tmp2 : tensor<16x32xf32, #blocked> loc(#loc52) + %_tmp4_29 = arith.select %tmp0_26, %tmp5, %arg5 : tensor<16x32xi1, #blocked>, tensor<16x32xf32, #blocked> loc(#loc53) + scf.yield %_tmp4_29 : tensor<16x32xf32, #blocked> loc(#loc22) + } loc(#loc46) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_20: f32 loc(callsite(#loc1 at #loc54)), %tmp4_21: f32 loc(callsite(#loc1 at #loc54))): + %tmp4_22 = arith.addf %tmp4_20, %tmp4_21 : f32 loc(#loc58) + tt.reduce.return %tmp4_22 : f32 loc(#loc56) + }) : (tensor<16x32xf32, #blocked>) -> tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc56) + %tmp4_18 = ttg.convert_layout %tmp4 : tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc55) + %tmp4_19 = tt.expand_dims %tmp4_18 {axis = 1 : i32} : tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<16x1xf32, #blocked1> loc(#loc55) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<16x1x!tt.ptr, #blocked1> loc(#loc27) + %1 = tt.addptr %0, %xindex_12 : tensor<16x1x!tt.ptr, #blocked1>, tensor<16x1xi32, #blocked1> loc(#loc27) + tt.store %1, %tmp4_19 : tensor<16x1x!tt.ptr, #blocked1> loc(#loc28) + tt.return loc(#loc29) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:8) +#loc23 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4) +#loc34 = loc("xoffset"(#loc2)) +#loc35 = loc("xoffset"(#loc3)) +#loc36 = loc("xindex"(#loc4)) +#loc37 = loc("xindex"(#loc5)) +#loc38 = loc("r0_base"(#loc6)) +#loc39 = loc("x0"(#loc7)) +#loc40 = loc("x1"(#loc8)) +#loc41 = loc("tmp0"(#loc9)) +#loc42 = loc("tmp0"(#loc10)) +#loc43 = loc("tmp0"(#loc11)) +#loc44 = loc("tmp0"(#loc12)) +#loc45 = loc("tmp0"(#loc13)) +#loc46 = loc("_tmp4"(#loc14)) +#loc47 = loc("r0_index"(#loc15)) +#loc48 = loc("r0_mask"(#loc16)) +#loc49 = loc("tmp0"(#loc17)) +#loc50 = loc("tmp0"(#loc18)) +#loc51 = loc("tmp2"(#loc19)) +#loc52 = loc("tmp5"(#loc20)) +#loc53 = loc("_tmp4"(#loc21)) +#loc55 = loc("tmp4"(#loc26)) +#loc56 = loc(callsite(#loc23 at #loc54)) +#loc58 = loc(callsite(#loc25 at #loc56)) diff --git a/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.ttir b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..5c4affefef87b57008d4da1392fc66ccbe413873 --- /dev/null +++ b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.ttir @@ -0,0 +1,118 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0) +#loc1 = loc(unknown) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25) +#loc32 = loc("in_ptr0"(#loc)) +#loc33 = loc("out_ptr0"(#loc)) +#loc34 = loc("xnumel"(#loc)) +#loc35 = loc("r0_numel"(#loc)) +#loc58 = loc("tmp4"(#loc26)) +#loc61 = loc(callsite(#loc1 at #loc58)) +module { + tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<16x32xbf16> loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc2) + %c128_i32 = arith.constant 128 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %cst_0 = arith.constant dense<12288> : tensor<16x1xi32> loc(#loc1) + %cst_1 = arith.constant dense<128> : tensor<16x1xi32> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<1x32xi32> loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<16x32xf32> loc(#loc1) + %cst_4 = arith.constant dense<32> : tensor<16x1xi32> loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc36) + %xoffset_5 = arith.muli %xoffset, %c16_i32 : i32 loc(#loc37) + %xindex = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc38) + %xindex_6 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<16xi32> -> tensor<16x1xi32> loc(#loc39) + %xindex_7 = tt.splat %xoffset_5 : i32 -> tensor<16x1xi32> loc(#loc40) + %xindex_8 = arith.addi %xindex_7, %xindex_6 : tensor<16x1xi32> loc(#loc40) + %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc41) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc42) + %x0 = arith.remsi %xindex_8, %cst_4 : tensor<16x1xi32> loc(#loc43) + %x1 = arith.divsi %xindex_8, %cst_4 : tensor<16x1xi32> loc(#loc44) + %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c32_i32 iter_args(%_tmp4_11 = %cst_3) -> (tensor<16x32xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc46) + %r0_index_12 = arith.addi %r0_index, %r0_base_9 : tensor<1x32xi32> loc(#loc46) + %r0_mask = arith.cmpi slt, %r0_index_12, %cst_2 : tensor<1x32xi32> loc(#loc47) + %tmp0 = arith.muli %x0, %cst_1 : tensor<16x1xi32> loc(#loc48) + %tmp0_13 = tt.broadcast %r0_index_12 : tensor<1x32xi32> -> tensor<16x32xi32> loc(#loc49) + %tmp0_14 = tt.broadcast %tmp0 : tensor<16x1xi32> -> tensor<16x32xi32> loc(#loc49) + %tmp0_15 = arith.addi %tmp0_13, %tmp0_14 : tensor<16x32xi32> loc(#loc49) + %tmp0_16 = arith.muli %x1, %cst_0 : tensor<16x1xi32> loc(#loc50) + %tmp0_17 = tt.broadcast %tmp0_16 : tensor<16x1xi32> -> tensor<16x32xi32> loc(#loc51) + %tmp0_18 = arith.addi %tmp0_15, %tmp0_17 : tensor<16x32xi32> loc(#loc51) + %tmp0_19 = tt.splat %in_ptr0 : !tt.ptr -> tensor<16x32x!tt.ptr> loc(#loc52) + %tmp0_20 = tt.addptr %tmp0_19, %tmp0_18 : tensor<16x32x!tt.ptr>, tensor<16x32xi32> loc(#loc52) + %tmp0_21 = tt.broadcast %r0_mask : tensor<1x32xi1> -> tensor<16x32xi1> loc(#loc53) + %tmp0_22 = tt.load %tmp0_20, %tmp0_21, %cst evictionPolicy = evict_first : tensor<16x32x!tt.ptr> loc(#loc53) + %tmp0_23 = arith.extf %tmp0_22 : tensor<16x32xbf16> to tensor<16x32xf32> loc(#loc54) + %tmp2 = arith.mulf %tmp0_23, %tmp0_23 : tensor<16x32xf32> loc(#loc55) + %tmp5 = arith.addf %_tmp4_11, %tmp2 : tensor<16x32xf32> loc(#loc56) + %_tmp4_24 = arith.select %tmp0_21, %tmp5, %_tmp4_11 : tensor<16x32xi1>, tensor<16x32xf32> loc(#loc57) + scf.yield %_tmp4_24 : tensor<16x32xf32> loc(#loc24) + } loc(#loc45) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_11: f32 loc(callsite(#loc1 at #loc58)), %tmp4_12: f32 loc(callsite(#loc1 at #loc58))): + %tmp4_13 = arith.addf %tmp4_11, %tmp4_12 : f32 loc(#loc62) + tt.reduce.return %tmp4_13 : f32 loc(#loc60) + }) : (tensor<16x32xf32>) -> tensor<16xf32> loc(#loc60) + %tmp4_10 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<16xf32> -> tensor<16x1xf32> loc(#loc59) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<16x1x!tt.ptr> loc(#loc29) + %1 = tt.addptr %0, %xindex_8 : tensor<16x1x!tt.ptr>, tensor<16x1xi32> loc(#loc29) + tt.store %1, %tmp4_10 : tensor<16x1x!tt.ptr> loc(#loc30) + tt.return loc(#loc31) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":32:43) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:27) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":33:31) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:8) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc27 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4) +#loc36 = loc("xoffset"(#loc3)) +#loc37 = loc("xoffset"(#loc4)) +#loc38 = loc("xindex"(#loc5)) +#loc39 = loc("xindex"(#loc6)) +#loc40 = loc("xindex"(#loc7)) +#loc41 = loc("r0_base"(#loc8)) +#loc42 = loc("r0_base"(#loc9)) +#loc43 = loc("x0"(#loc10)) +#loc44 = loc("x1"(#loc11)) +#loc45 = loc("_tmp4"(#loc2)) +#loc46 = loc("r0_index"(#loc12)) +#loc47 = loc("r0_mask"(#loc13)) +#loc48 = loc("tmp0"(#loc14)) +#loc49 = loc("tmp0"(#loc15)) +#loc50 = loc("tmp0"(#loc16)) +#loc51 = loc("tmp0"(#loc17)) +#loc52 = loc("tmp0"(#loc18)) +#loc53 = loc("tmp0"(#loc19)) +#loc54 = loc("tmp0"(#loc20)) +#loc55 = loc("tmp2"(#loc21)) +#loc56 = loc("tmp5"(#loc22)) +#loc57 = loc("_tmp4"(#loc23)) +#loc59 = loc("tmp4"(#loc28)) +#loc60 = loc(callsite(#loc25 at #loc58)) +#loc62 = loc(callsite(#loc27 at #loc60)) diff --git a/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/__grp__triton_red_fused_add_mul_native_layer_norm_1.json b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/__grp__triton_red_fused_add_mul_native_layer_norm_1.json new file mode 100644 index 0000000000000000000000000000000000000000..91a69b37b4ccd50c04e860f4ec28f38680470589 --- /dev/null +++ b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/__grp__triton_red_fused_add_mul_native_layer_norm_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_add_mul_native_layer_norm_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.source", "triton_red_fused_add_mul_native_layer_norm_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.ttir", "triton_red_fused_add_mul_native_layer_norm_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.ttgir", "triton_red_fused_add_mul_native_layer_norm_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.llir", "triton_red_fused_add_mul_native_layer_norm_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.ptx", "triton_red_fused_add_mul_native_layer_norm_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.cubin", "triton_red_fused_add_mul_native_layer_norm_1.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.json"}} \ No newline at end of file diff --git a/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.cubin b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..a0ca56e1a5a8f43f1f466021f9afeeaec0bdd343 Binary files /dev/null and b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.cubin differ diff --git a/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.json b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b70a39a781f007bdcca5603ec4f9a88fc00926fe --- /dev/null +++ b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.json @@ -0,0 +1 @@ +{"hash": "1614d3038e63e28f92bf3f4ace7c6aa7ea5cf21d6e2067b6deca2b80ac367d58", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 192, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_add_mul_native_layer_norm_1"} \ No newline at end of file diff --git a/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.llir b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..3739a86fa49e0d35f289cbdc95b65bf354b3384e --- /dev/null +++ b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.llir @@ -0,0 +1,547 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_add_mul_native_layer_norm_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !5 { +__nv_rsqrtf.exit: + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %9 = icmp samesign ult i32 %8, 256, !dbg !9 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %11 = and i32 %10, 511, !dbg !10 + %12 = and i32 %10, 31, !dbg !10 + %13 = lshr i32 %11, 5, !dbg !10 + %14 = shl nuw nsw i32 %10, 3, !dbg !10 + %15 = and i32 %14, 4088, !dbg !10 + %16 = shl i32 %8, 12, !dbg !11 + %17 = or disjoint i32 %15, %16, !dbg !12 + %18 = sext i32 %17 to i64, !dbg !13 + %19 = getelementptr bfloat, ptr addrspace(1) %0, i64 %18, !dbg !13 + %20 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %21 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %19, i64 %20, i1 %9) #6, !dbg !14 + %22 = extractvalue { i32, i32, i32, i32 } %21, 0, !dbg !14 + %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !14 + %24 = extractvalue { i32, i32, i32, i32 } %21, 1, !dbg !14 + %25 = bitcast i32 %24 to <2 x bfloat>, !dbg !14 + %26 = extractvalue { i32, i32, i32, i32 } %21, 2, !dbg !14 + %27 = bitcast i32 %26 to <2 x bfloat>, !dbg !14 + %28 = extractvalue { i32, i32, i32, i32 } %21, 3, !dbg !14 + %29 = bitcast i32 %28 to <2 x bfloat>, !dbg !14 + %30 = extractelement <2 x bfloat> %23, i64 0, !dbg !14 + %31 = extractelement <2 x bfloat> %23, i64 1, !dbg !14 + %32 = extractelement <2 x bfloat> %25, i64 0, !dbg !14 + %33 = extractelement <2 x bfloat> %25, i64 1, !dbg !14 + %34 = extractelement <2 x bfloat> %27, i64 0, !dbg !14 + %35 = extractelement <2 x bfloat> %27, i64 1, !dbg !14 + %36 = extractelement <2 x bfloat> %29, i64 0, !dbg !14 + %37 = extractelement <2 x bfloat> %29, i64 1, !dbg !14 + %38 = fpext bfloat %30 to float, !dbg !15 + %39 = fpext bfloat %31 to float, !dbg !15 + %40 = fpext bfloat %32 to float, !dbg !15 + %41 = fpext bfloat %33 to float, !dbg !15 + %42 = fpext bfloat %34 to float, !dbg !15 + %43 = fpext bfloat %35 to float, !dbg !15 + %44 = fpext bfloat %36 to float, !dbg !15 + %45 = fpext bfloat %37 to float, !dbg !15 + %46 = select i1 %9, float %38, float 0.000000e+00, !dbg !16 + %47 = select i1 %9, float %39, float 0.000000e+00, !dbg !16 + %48 = select i1 %9, float %40, float 0.000000e+00, !dbg !16 + %49 = select i1 %9, float %41, float 0.000000e+00, !dbg !16 + %50 = select i1 %9, float %42, float 0.000000e+00, !dbg !16 + %51 = select i1 %9, float %43, float 0.000000e+00, !dbg !16 + %52 = select i1 %9, float %44, float 0.000000e+00, !dbg !16 + %53 = select i1 %9, float %45, float 0.000000e+00, !dbg !16 + %54 = select i1 %9, float 1.000000e+00, float 0.000000e+00, !dbg !17 + %55 = fsub float %47, %46, !dbg !18 + %56 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !24 + %57 = fcmp oeq float %56, 0.000000e+00, !dbg !25 + %58 = tail call float @llvm.nvvm.div.full(float %54, float %56), !dbg !26 + %59 = select i1 %57, float 0.000000e+00, float %58, !dbg !27 + %60 = fmul float %59, %55, !dbg !28 + %61 = fadd float %46, %60, !dbg !29 + %62 = fmul float %55, %55, !dbg !30 + %63 = fmul float %54, %62, !dbg !31 + %64 = fmul float %59, %63, !dbg !32 + %65 = fadd float %64, 0.000000e+00, !dbg !33 + %66 = fsub float %48, %61, !dbg !18 + %67 = select i1 %9, float 3.000000e+00, float 0.000000e+00, !dbg !24 + %68 = fcmp oeq float %67, 0.000000e+00, !dbg !25 + %69 = tail call float @llvm.nvvm.div.full(float %54, float %67), !dbg !26 + %70 = select i1 %68, float 0.000000e+00, float %69, !dbg !27 + %71 = fmul float %70, %66, !dbg !28 + %72 = fadd float %61, %71, !dbg !29 + %73 = fmul float %66, %66, !dbg !30 + %74 = fmul float %56, %73, !dbg !31 + %75 = fmul float %70, %74, !dbg !32 + %76 = fadd float %65, %75, !dbg !33 + %77 = fsub float %49, %72, !dbg !18 + %78 = select i1 %9, float 4.000000e+00, float 0.000000e+00, !dbg !24 + %79 = fcmp oeq float %78, 0.000000e+00, !dbg !25 + %80 = tail call float @llvm.nvvm.div.full(float %54, float %78), !dbg !26 + %81 = select i1 %79, float 0.000000e+00, float %80, !dbg !27 + %82 = fmul float %81, %77, !dbg !28 + %83 = fadd float %72, %82, !dbg !29 + %84 = fmul float %77, %77, !dbg !30 + %85 = fmul float %67, %84, !dbg !31 + %86 = fmul float %81, %85, !dbg !32 + %87 = fadd float %76, %86, !dbg !33 + %88 = fsub float %50, %83, !dbg !18 + %89 = select i1 %9, float 5.000000e+00, float 0.000000e+00, !dbg !24 + %90 = fcmp oeq float %89, 0.000000e+00, !dbg !25 + %91 = tail call float @llvm.nvvm.div.full(float %54, float %89), !dbg !26 + %92 = select i1 %90, float 0.000000e+00, float %91, !dbg !27 + %93 = fmul float %92, %88, !dbg !28 + %94 = fadd float %83, %93, !dbg !29 + %95 = fmul float %88, %88, !dbg !30 + %96 = fmul float %78, %95, !dbg !31 + %97 = fmul float %92, %96, !dbg !32 + %98 = fadd float %87, %97, !dbg !33 + %99 = fsub float %51, %94, !dbg !18 + %100 = select i1 %9, float 6.000000e+00, float 0.000000e+00, !dbg !24 + %101 = fcmp oeq float %100, 0.000000e+00, !dbg !25 + %102 = tail call float @llvm.nvvm.div.full(float %54, float %100), !dbg !26 + %103 = select i1 %101, float 0.000000e+00, float %102, !dbg !27 + %104 = fmul float %103, %99, !dbg !28 + %105 = fadd float %94, %104, !dbg !29 + %106 = fmul float %99, %99, !dbg !30 + %107 = fmul float %89, %106, !dbg !31 + %108 = fmul float %103, %107, !dbg !32 + %109 = fadd float %98, %108, !dbg !33 + %110 = fsub float %52, %105, !dbg !18 + %111 = select i1 %9, float 7.000000e+00, float 0.000000e+00, !dbg !24 + %112 = fcmp oeq float %111, 0.000000e+00, !dbg !25 + %113 = tail call float @llvm.nvvm.div.full(float %54, float %111), !dbg !26 + %114 = select i1 %112, float 0.000000e+00, float %113, !dbg !27 + %115 = fmul float %114, %110, !dbg !28 + %116 = fadd float %105, %115, !dbg !29 + %117 = fmul float %110, %110, !dbg !30 + %118 = fmul float %100, %117, !dbg !31 + %119 = fmul float %114, %118, !dbg !32 + %120 = fadd float %109, %119, !dbg !33 + %121 = fsub float %53, %116, !dbg !18 + %122 = select i1 %9, float 8.000000e+00, float 0.000000e+00, !dbg !24 + %123 = fcmp oeq float %122, 0.000000e+00, !dbg !25 + %124 = tail call float @llvm.nvvm.div.full(float %54, float %122), !dbg !26 + %125 = select i1 %123, float 0.000000e+00, float %124, !dbg !27 + %126 = fmul float %125, %121, !dbg !28 + %127 = fadd float %116, %126, !dbg !29 + %128 = fmul float %121, %121, !dbg !30 + %129 = fmul float %111, %128, !dbg !31 + %130 = fmul float %125, %129, !dbg !32 + %131 = fadd float %120, %130, !dbg !33 + %132 = bitcast float %127 to i32, !dbg !21 + %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 16, i32 31), !dbg !21 + %134 = bitcast i32 %133 to float, !dbg !21 + %135 = bitcast float %131 to i32, !dbg !21 + %136 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %135, i32 16, i32 31), !dbg !21 + %137 = bitcast i32 %136 to float, !dbg !21 + %138 = bitcast float %122 to i32, !dbg !21 + %139 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %138, i32 16, i32 31), !dbg !21 + %140 = bitcast i32 %139 to float, !dbg !21 + %141 = fsub float %134, %127, !dbg !18 + %142 = fadd float %122, %140, !dbg !24 + %143 = fcmp oeq float %142, 0.000000e+00, !dbg !25 + %144 = tail call float @llvm.nvvm.div.full(float %140, float %142), !dbg !26 + %145 = select i1 %143, float 0.000000e+00, float %144, !dbg !27 + %146 = fmul float %145, %141, !dbg !28 + %147 = fadd float %127, %146, !dbg !29 + %148 = fadd float %131, %137, !dbg !34 + %149 = fmul float %141, %141, !dbg !30 + %150 = fmul float %122, %149, !dbg !31 + %151 = fmul float %145, %150, !dbg !32 + %152 = fadd float %148, %151, !dbg !33 + %153 = bitcast float %147 to i32, !dbg !21 + %154 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 8, i32 31), !dbg !21 + %155 = bitcast i32 %154 to float, !dbg !21 + %156 = bitcast float %152 to i32, !dbg !21 + %157 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %156, i32 8, i32 31), !dbg !21 + %158 = bitcast i32 %157 to float, !dbg !21 + %159 = bitcast float %142 to i32, !dbg !21 + %160 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %159, i32 8, i32 31), !dbg !21 + %161 = bitcast i32 %160 to float, !dbg !21 + %162 = fsub float %155, %147, !dbg !18 + %163 = fadd float %142, %161, !dbg !24 + %164 = fcmp oeq float %163, 0.000000e+00, !dbg !25 + %165 = tail call float @llvm.nvvm.div.full(float %161, float %163), !dbg !26 + %166 = select i1 %164, float 0.000000e+00, float %165, !dbg !27 + %167 = fmul float %166, %162, !dbg !28 + %168 = fadd float %147, %167, !dbg !29 + %169 = fadd float %152, %158, !dbg !34 + %170 = fmul float %162, %162, !dbg !30 + %171 = fmul float %142, %170, !dbg !31 + %172 = fmul float %166, %171, !dbg !32 + %173 = fadd float %169, %172, !dbg !33 + %174 = bitcast float %168 to i32, !dbg !21 + %175 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %174, i32 4, i32 31), !dbg !21 + %176 = bitcast i32 %175 to float, !dbg !21 + %177 = bitcast float %173 to i32, !dbg !21 + %178 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %177, i32 4, i32 31), !dbg !21 + %179 = bitcast i32 %178 to float, !dbg !21 + %180 = bitcast float %163 to i32, !dbg !21 + %181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %180, i32 4, i32 31), !dbg !21 + %182 = bitcast i32 %181 to float, !dbg !21 + %183 = fsub float %176, %168, !dbg !18 + %184 = fadd float %163, %182, !dbg !24 + %185 = fcmp oeq float %184, 0.000000e+00, !dbg !25 + %186 = tail call float @llvm.nvvm.div.full(float %182, float %184), !dbg !26 + %187 = select i1 %185, float 0.000000e+00, float %186, !dbg !27 + %188 = fmul float %187, %183, !dbg !28 + %189 = fadd float %168, %188, !dbg !29 + %190 = fadd float %173, %179, !dbg !34 + %191 = fmul float %183, %183, !dbg !30 + %192 = fmul float %163, %191, !dbg !31 + %193 = fmul float %187, %192, !dbg !32 + %194 = fadd float %190, %193, !dbg !33 + %195 = bitcast float %189 to i32, !dbg !21 + %196 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %195, i32 2, i32 31), !dbg !21 + %197 = bitcast i32 %196 to float, !dbg !21 + %198 = bitcast float %194 to i32, !dbg !21 + %199 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %198, i32 2, i32 31), !dbg !21 + %200 = bitcast i32 %199 to float, !dbg !21 + %201 = bitcast float %184 to i32, !dbg !21 + %202 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %201, i32 2, i32 31), !dbg !21 + %203 = bitcast i32 %202 to float, !dbg !21 + %204 = fsub float %197, %189, !dbg !18 + %205 = fadd float %184, %203, !dbg !24 + %206 = fcmp oeq float %205, 0.000000e+00, !dbg !25 + %207 = tail call float @llvm.nvvm.div.full(float %203, float %205), !dbg !26 + %208 = select i1 %206, float 0.000000e+00, float %207, !dbg !27 + %209 = fmul float %208, %204, !dbg !28 + %210 = fadd float %189, %209, !dbg !29 + %211 = fadd float %194, %200, !dbg !34 + %212 = fmul float %204, %204, !dbg !30 + %213 = fmul float %184, %212, !dbg !31 + %214 = fmul float %208, %213, !dbg !32 + %215 = fadd float %211, %214, !dbg !33 + %216 = bitcast float %210 to i32, !dbg !21 + %217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 1, i32 31), !dbg !21 + %218 = bitcast i32 %217 to float, !dbg !21 + %219 = bitcast float %215 to i32, !dbg !21 + %220 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %219, i32 1, i32 31), !dbg !21 + %221 = bitcast i32 %220 to float, !dbg !21 + %222 = bitcast float %205 to i32, !dbg !21 + %223 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %222, i32 1, i32 31), !dbg !21 + %224 = bitcast i32 %223 to float, !dbg !21 + %225 = fsub float %218, %210, !dbg !18 + %226 = fadd float %205, %224, !dbg !24 + %227 = fcmp oeq float %226, 0.000000e+00, !dbg !25 + %228 = tail call float @llvm.nvvm.div.full(float %224, float %226), !dbg !26 + %229 = select i1 %227, float 0.000000e+00, float %228, !dbg !27 + %230 = fmul float %229, %225, !dbg !28 + %231 = fadd float %210, %230, !dbg !29 + %232 = fadd float %215, %221, !dbg !34 + %233 = fmul float %225, %225, !dbg !30 + %234 = fmul float %205, %233, !dbg !31 + %235 = fmul float %229, %234, !dbg !32 + %236 = fadd float %232, %235, !dbg !33 + %237 = icmp eq i32 %12, 0, !dbg !21 + %238 = getelementptr float, ptr addrspace(3) @global_smem, i32 %13, !dbg !21 + %239 = bitcast float %231 to <1 x i32>, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %238, <1 x i32> %239, i1 %237) #6, !dbg !21 + %240 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %13, !dbg !21 + %241 = bitcast float %236 to <1 x i32>, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %240, <1 x i32> %241, i1 %237) #6, !dbg !21 + %242 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %13, !dbg !21 + %243 = bitcast float %226 to <1 x i32>, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %242, <1 x i32> %243, i1 %237) #6, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %244 = icmp samesign ult i32 %11, 16, !dbg !21 + %245 = getelementptr float, ptr addrspace(3) @global_smem, i32 %11, !dbg !21 + %246 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %245, i1 %244) #6, !dbg !21 + %247 = bitcast i32 %246 to float, !dbg !21 + %248 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %11, !dbg !21 + %249 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %248, i1 %244) #6, !dbg !21 + %250 = bitcast i32 %249 to float, !dbg !21 + %251 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %11, !dbg !21 + %252 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %251, i1 %244) #6, !dbg !21 + %253 = bitcast i32 %252 to float, !dbg !21 + %254 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %246, i32 8, i32 31), !dbg !21 + %255 = bitcast i32 %254 to float, !dbg !21 + %256 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %249, i32 8, i32 31), !dbg !21 + %257 = bitcast i32 %256 to float, !dbg !21 + %258 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %252, i32 8, i32 31), !dbg !21 + %259 = bitcast i32 %258 to float, !dbg !21 + %260 = fsub float %255, %247, !dbg !18 + %261 = fadd float %253, %259, !dbg !24 + %262 = fcmp oeq float %261, 0.000000e+00, !dbg !25 + %263 = tail call float @llvm.nvvm.div.full(float %259, float %261), !dbg !26 + %264 = select i1 %262, float 0.000000e+00, float %263, !dbg !27 + %265 = fmul float %260, %264, !dbg !28 + %266 = fadd float %265, %247, !dbg !29 + %267 = fadd float %250, %257, !dbg !34 + %268 = fmul float %260, %260, !dbg !30 + %269 = fmul float %268, %253, !dbg !31 + %270 = fmul float %269, %264, !dbg !32 + %271 = fadd float %267, %270, !dbg !33 + %272 = bitcast float %266 to i32, !dbg !21 + %273 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 4, i32 31), !dbg !21 + %274 = bitcast i32 %273 to float, !dbg !21 + %275 = bitcast float %271 to i32, !dbg !21 + %276 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %275, i32 4, i32 31), !dbg !21 + %277 = bitcast i32 %276 to float, !dbg !21 + %278 = bitcast float %261 to i32, !dbg !21 + %279 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %278, i32 4, i32 31), !dbg !21 + %280 = bitcast i32 %279 to float, !dbg !21 + %281 = fsub float %274, %266, !dbg !18 + %282 = fadd float %261, %280, !dbg !24 + %283 = fcmp oeq float %282, 0.000000e+00, !dbg !25 + %284 = tail call float @llvm.nvvm.div.full(float %280, float %282), !dbg !26 + %285 = select i1 %283, float 0.000000e+00, float %284, !dbg !27 + %286 = fmul float %281, %285, !dbg !28 + %287 = fadd float %266, %286, !dbg !29 + %288 = fadd float %271, %277, !dbg !34 + %289 = fmul float %281, %281, !dbg !30 + %290 = fmul float %261, %289, !dbg !31 + %291 = fmul float %285, %290, !dbg !32 + %292 = fadd float %288, %291, !dbg !33 + %293 = bitcast float %287 to i32, !dbg !21 + %294 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %293, i32 2, i32 31), !dbg !21 + %295 = bitcast i32 %294 to float, !dbg !21 + %296 = bitcast float %292 to i32, !dbg !21 + %297 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %296, i32 2, i32 31), !dbg !21 + %298 = bitcast i32 %297 to float, !dbg !21 + %299 = bitcast float %282 to i32, !dbg !21 + %300 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %299, i32 2, i32 31), !dbg !21 + %301 = bitcast i32 %300 to float, !dbg !21 + %302 = fsub float %295, %287, !dbg !18 + %303 = fadd float %282, %301, !dbg !24 + %304 = fcmp oeq float %303, 0.000000e+00, !dbg !25 + %305 = tail call float @llvm.nvvm.div.full(float %301, float %303), !dbg !26 + %306 = select i1 %304, float 0.000000e+00, float %305, !dbg !27 + %307 = fmul float %302, %306, !dbg !28 + %308 = fadd float %287, %307, !dbg !29 + %309 = fadd float %292, %298, !dbg !34 + %310 = fmul float %302, %302, !dbg !30 + %311 = fmul float %282, %310, !dbg !31 + %312 = fmul float %306, %311, !dbg !32 + %313 = fadd float %309, %312, !dbg !33 + %314 = bitcast float %308 to i32, !dbg !21 + %315 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %314, i32 1, i32 31), !dbg !21 + %316 = bitcast i32 %315 to float, !dbg !21 + %317 = bitcast float %313 to i32, !dbg !21 + %318 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %317, i32 1, i32 31), !dbg !21 + %319 = bitcast i32 %318 to float, !dbg !21 + %320 = bitcast float %303 to i32, !dbg !21 + %321 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %320, i32 1, i32 31), !dbg !21 + %322 = bitcast i32 %321 to float, !dbg !21 + %323 = fsub float %316, %308, !dbg !18 + %324 = fadd float %303, %322, !dbg !24 + %325 = fcmp oeq float %324, 0.000000e+00, !dbg !25 + %326 = tail call float @llvm.nvvm.div.full(float %322, float %324), !dbg !26 + %327 = select i1 %325, float 0.000000e+00, float %326, !dbg !27 + %328 = fmul float %323, %327, !dbg !28 + %329 = fadd float %308, %328, !dbg !29 + %330 = fadd float %313, %319, !dbg !34 + %331 = fmul float %323, %323, !dbg !30 + %332 = fmul float %303, %331, !dbg !31 + %333 = fmul float %327, %332, !dbg !32 + %334 = fadd float %330, %333, !dbg !33 + %335 = and i32 %10, 15, !dbg !21 + %336 = icmp eq i32 %335, 0, !dbg !21 + %337 = and i1 %244, %336, !dbg !21 + %338 = bitcast float %329 to <1 x i32>, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %245, <1 x i32> %338, i1 %337) #6, !dbg !21 + %339 = bitcast float %334 to <1 x i32>, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %248, <1 x i32> %339, i1 %337) #6, !dbg !21 + %340 = bitcast float %324 to <1 x i32>, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %251, <1 x i32> %340, i1 %337) #6, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %341 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !21 + %342 = load float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !21 + %343 = zext nneg i32 %15 to i64, !dbg !35 + %344 = getelementptr bfloat, ptr addrspace(1) %1, i64 %343, !dbg !35 + %345 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !36 + %346 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %344, i64 %345, i1 true) #6, !dbg !36 + %347 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !37 + %348 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %19, i64 %347, i1 %9) #6, !dbg !37 + %349 = getelementptr bfloat, ptr addrspace(1) %2, i64 %343, !dbg !38 + %350 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !39 + %351 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %349, i64 %350, i1 true) #6, !dbg !39 + %352 = tail call float @llvm.nvvm.div.full(float %342, float 4.096000e+03), !dbg !40 + %353 = fadd float %352, 0x3EB0C6F7A0000000, !dbg !41 + %354 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42 + %355 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42 + %356 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42 + %357 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42 + %358 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42 + %359 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42 + %360 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42 + %361 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42 + %.not.i19 = icmp eq i32 %361, 0, !dbg !42 + br i1 %.not.i19, label %364, label %362, !dbg !42 + +362: ; preds = %__nv_rsqrtf.exit + %363 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %353), !dbg !42 + br label %__nv_rsqrtf.exit21, !dbg !42 + +364: ; preds = %__nv_rsqrtf.exit + %365 = tail call float @llvm.nvvm.rsqrt.approx.f(float %353), !dbg !42 + br label %__nv_rsqrtf.exit21, !dbg !42 + +__nv_rsqrtf.exit21: ; preds = %362, %364 + %.0.i20 = phi float [ %363, %362 ], [ %365, %364 ], !dbg !42 + %366 = extractvalue { i32, i32, i32, i32 } %348, 3, !dbg !37 + %367 = bitcast i32 %366 to <2 x bfloat>, !dbg !37 + %368 = extractvalue { i32, i32, i32, i32 } %348, 2, !dbg !37 + %369 = bitcast i32 %368 to <2 x bfloat>, !dbg !37 + %370 = extractvalue { i32, i32, i32, i32 } %348, 1, !dbg !37 + %371 = bitcast i32 %370 to <2 x bfloat>, !dbg !37 + %372 = extractvalue { i32, i32, i32, i32 } %348, 0, !dbg !37 + %373 = bitcast i32 %372 to <2 x bfloat>, !dbg !37 + %374 = extractvalue { i32, i32, i32, i32 } %346, 3, !dbg !36 + %375 = bitcast i32 %374 to <2 x bfloat>, !dbg !36 + %376 = extractvalue { i32, i32, i32, i32 } %346, 2, !dbg !36 + %377 = bitcast i32 %376 to <2 x bfloat>, !dbg !36 + %378 = extractvalue { i32, i32, i32, i32 } %346, 1, !dbg !36 + %379 = bitcast i32 %378 to <2 x bfloat>, !dbg !36 + %380 = extractvalue { i32, i32, i32, i32 } %346, 0, !dbg !36 + %381 = bitcast i32 %380 to <2 x bfloat>, !dbg !36 + %382 = extractvalue { i32, i32, i32, i32 } %351, 3, !dbg !39 + %383 = bitcast i32 %382 to <2 x bfloat>, !dbg !39 + %384 = extractvalue { i32, i32, i32, i32 } %351, 2, !dbg !39 + %385 = bitcast i32 %384 to <2 x bfloat>, !dbg !39 + %386 = extractvalue { i32, i32, i32, i32 } %351, 1, !dbg !39 + %387 = bitcast i32 %386 to <2 x bfloat>, !dbg !39 + %388 = extractvalue { i32, i32, i32, i32 } %351, 0, !dbg !39 + %389 = bitcast i32 %388 to <2 x bfloat>, !dbg !39 + %390 = getelementptr bfloat, ptr addrspace(1) %3, i64 %18, !dbg !43 + %391 = fpext <2 x bfloat> %373 to <2 x float>, !dbg !44 + %392 = insertelement <2 x float> poison, float %341, i64 0, !dbg !45 + %393 = shufflevector <2 x float> %392, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !45 + %394 = fsub <2 x float> %391, %393, !dbg !45 + %395 = fpext <2 x bfloat> %381 to <2 x float>, !dbg !46 + %396 = fadd <2 x float> %395, splat (float 1.000000e+00), !dbg !47 + %397 = fpext <2 x bfloat> %389 to <2 x float>, !dbg !48 + %398 = insertelement <2 x float> poison, float %.0.i20, i64 0, !dbg !49 + %399 = shufflevector <2 x float> %398, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !49 + %400 = fmul <2 x float> %394, %399, !dbg !49 + %401 = fmul <2 x float> %396, %400, !dbg !50 + %402 = fadd <2 x float> %401, %397, !dbg !51 + %403 = fptrunc <2 x float> %402 to <2 x bfloat>, !dbg !52 + %404 = fpext <2 x bfloat> %371 to <2 x float>, !dbg !44 + %405 = fsub <2 x float> %404, %393, !dbg !45 + %406 = fpext <2 x bfloat> %379 to <2 x float>, !dbg !46 + %407 = fadd <2 x float> %406, splat (float 1.000000e+00), !dbg !47 + %408 = fpext <2 x bfloat> %387 to <2 x float>, !dbg !48 + %409 = fmul <2 x float> %405, %399, !dbg !49 + %410 = fmul <2 x float> %407, %409, !dbg !50 + %411 = fadd <2 x float> %410, %408, !dbg !51 + %412 = fptrunc <2 x float> %411 to <2 x bfloat>, !dbg !52 + %413 = fpext <2 x bfloat> %369 to <2 x float>, !dbg !44 + %414 = fsub <2 x float> %413, %393, !dbg !45 + %415 = fpext <2 x bfloat> %377 to <2 x float>, !dbg !46 + %416 = fadd <2 x float> %415, splat (float 1.000000e+00), !dbg !47 + %417 = fpext <2 x bfloat> %385 to <2 x float>, !dbg !48 + %418 = fmul <2 x float> %414, %399, !dbg !49 + %419 = fmul <2 x float> %416, %418, !dbg !50 + %420 = fadd <2 x float> %419, %417, !dbg !51 + %421 = fptrunc <2 x float> %420 to <2 x bfloat>, !dbg !52 + %422 = fpext <2 x bfloat> %367 to <2 x float>, !dbg !44 + %423 = fsub <2 x float> %422, %393, !dbg !45 + %424 = fpext <2 x bfloat> %375 to <2 x float>, !dbg !46 + %425 = fadd <2 x float> %424, splat (float 1.000000e+00), !dbg !47 + %426 = fpext <2 x bfloat> %383 to <2 x float>, !dbg !48 + %427 = fmul <2 x float> %423, %399, !dbg !49 + %428 = fmul <2 x float> %425, %427, !dbg !50 + %429 = fadd <2 x float> %428, %426, !dbg !51 + %430 = fptrunc <2 x float> %429 to <2 x bfloat>, !dbg !52 + %431 = bitcast <2 x bfloat> %403 to i32, !dbg !52 + %432 = bitcast <2 x bfloat> %412 to i32, !dbg !52 + %433 = bitcast <2 x bfloat> %421 to i32, !dbg !52 + %434 = bitcast <2 x bfloat> %430 to i32, !dbg !52 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %431, i32 %432, i32 %433, i32 %434, ptr addrspace(1) %390, i1 %9) #6, !dbg !52 + ret void, !dbg !53 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused_add_mul_native_layer_norm_1", linkageName: "triton_red_fused_add_mul_native_layer_norm_1", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 25, column: 21, scope: !5) +!10 = !DILocation(line: 26, column: 37, scope: !5) +!11 = !DILocation(line: 38, column: 46, scope: !5) +!12 = !DILocation(line: 38, column: 41, scope: !5) +!13 = !DILocation(line: 38, column: 34, scope: !5) +!14 = !DILocation(line: 38, column: 51, scope: !5) +!15 = !DILocation(line: 38, column: 112, scope: !5) +!16 = !DILocation(line: 44, column: 62, scope: !5) +!17 = !DILocation(line: 46, column: 66, scope: !5) +!18 = !DILocation(line: 231, column: 21, scope: !19, inlinedAt: !21) +!19 = distinct !DILexicalBlockFile(scope: !5, file: !20, discriminator: 0) +!20 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime") +!21 = !DILocation(line: 243, column: 46, scope: !19, inlinedAt: !22) +!22 = !DILocation(line: 47, column: 79, scope: !23) +!23 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!24 = !DILocation(line: 232, column: 28, scope: !19, inlinedAt: !21) +!25 = !DILocation(line: 233, column: 39, scope: !19, inlinedAt: !21) +!26 = !DILocation(line: 233, column: 60, scope: !19, inlinedAt: !21) +!27 = !DILocation(line: 233, column: 49, scope: !19, inlinedAt: !21) +!28 = !DILocation(line: 235, column: 25, scope: !19, inlinedAt: !21) +!29 = !DILocation(line: 235, column: 17, scope: !19, inlinedAt: !21) +!30 = !DILocation(line: 236, column: 30, scope: !19, inlinedAt: !21) +!31 = !DILocation(line: 236, column: 38, scope: !19, inlinedAt: !21) +!32 = !DILocation(line: 236, column: 49, scope: !19, inlinedAt: !21) +!33 = !DILocation(line: 236, column: 22, scope: !19, inlinedAt: !21) +!34 = !DILocation(line: 236, column: 15, scope: !19, inlinedAt: !21) +!35 = !DILocation(line: 57, column: 34, scope: !5) +!36 = !DILocation(line: 57, column: 41, scope: !5) +!37 = !DILocation(line: 58, column: 52, scope: !5) +!38 = !DILocation(line: 59, column: 35, scope: !5) +!39 = !DILocation(line: 59, column: 42, scope: !5) +!40 = !DILocation(line: 65, column: 24, scope: !5) +!41 = !DILocation(line: 67, column: 24, scope: !5) +!42 = !DILocation(line: 68, column: 32, scope: !5) +!43 = !DILocation(line: 73, column: 29, scope: !5) +!44 = !DILocation(line: 58, column: 114, scope: !5) +!45 = !DILocation(line: 63, column: 24, scope: !5) +!46 = !DILocation(line: 57, column: 94, scope: !5) +!47 = !DILocation(line: 61, column: 23, scope: !5) +!48 = !DILocation(line: 59, column: 95, scope: !5) +!49 = !DILocation(line: 69, column: 24, scope: !5) +!50 = !DILocation(line: 71, column: 24, scope: !5) +!51 = !DILocation(line: 72, column: 24, scope: !5) +!52 = !DILocation(line: 73, column: 53, scope: !5) +!53 = !DILocation(line: 51, column: 4, scope: !5) diff --git a/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.ptx b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..e10915bf74d890b97640fea658418fe43ef03302 --- /dev/null +++ b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.ptx @@ -0,0 +1,1032 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused_add_mul_native_layer_norm_1 // -- Begin function triton_red_fused_add_mul_native_layer_norm_1 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused_add_mul_native_layer_norm_1 +.visible .entry triton_red_fused_add_mul_native_layer_norm_1( + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_1_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_1_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_1_param_3, + .param .u32 triton_red_fused_add_mul_native_layer_norm_1_param_4, + .param .u32 triton_red_fused_add_mul_native_layer_norm_1_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_1_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_1_param_7 +) +.reqntid 512 +{ + .reg .pred %p<23>; + .reg .b16 %rs<33>; + .reg .b32 %r<287>; + .reg .b64 %rd<15>; + .loc 1 18 0 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:18:0 + +// %bb.0: // %__nv_rsqrtf.exit + ld.param.b64 %rd9, [triton_red_fused_add_mul_native_layer_norm_1_param_0]; + ld.param.b64 %rd10, [triton_red_fused_add_mul_native_layer_norm_1_param_1]; +$L__tmp0: + .loc 1 23 28 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:23:28 + mov.u32 %r37, %ctaid.x; + .loc 1 25 21 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:25:21 + setp.lt.u32 %p1, %r37, 256; + ld.param.b64 %rd11, [triton_red_fused_add_mul_native_layer_norm_1_param_2]; + ld.param.b64 %rd12, [triton_red_fused_add_mul_native_layer_norm_1_param_3]; + .loc 1 26 37 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:26:37 + mov.u32 %r38, %tid.x; + and.b32 %r39, %r38, 511; + and.b32 %r40, %r38, 31; + shl.b32 %r41, %r38, 3; + and.b32 %r42, %r41, 4088; + .loc 1 38 46 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:46 + shl.b32 %r43, %r37, 12; + .loc 1 38 41 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:41 + or.b32 %r44, %r42, %r43; + .loc 1 38 34 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:34 + mul.wide.s32 %rd13, %r44, 2; + add.s64 %rd1, %rd9, %rd13; + .loc 1 38 51 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:51 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r5, 0; + // begin inline asm + mov.u32 %r1, %r5; + mov.u32 %r2, %r5; + mov.u32 %r3, %r5; + mov.u32 %r4, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2; + // end inline asm + mov.b32 {%rs1, %rs2}, %r1; + mov.b32 {%rs3, %rs4}, %r2; + mov.b32 {%rs5, %rs6}, %r3; + mov.b32 {%rs7, %rs8}, %r4; + .loc 1 38 112 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:112 + cvt.f32.bf16 %r45, %rs1; + cvt.f32.bf16 %r46, %rs2; + cvt.f32.bf16 %r47, %rs3; + cvt.f32.bf16 %r48, %rs4; + cvt.f32.bf16 %r49, %rs5; + cvt.f32.bf16 %r50, %rs6; + cvt.f32.bf16 %r51, %rs7; + cvt.f32.bf16 %r52, %rs8; + .loc 1 44 62 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:44:62 + selp.f32 %r53, %r45, 0f00000000, %p1; + selp.f32 %r54, %r46, 0f00000000, %p1; + selp.f32 %r55, %r47, 0f00000000, %p1; + selp.f32 %r56, %r48, 0f00000000, %p1; + selp.f32 %r57, %r49, 0f00000000, %p1; + selp.f32 %r58, %r50, 0f00000000, %p1; + selp.f32 %r59, %r51, 0f00000000, %p1; + selp.f32 %r60, %r52, 0f00000000, %p1; + .loc 1 46 66 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:46:66 + selp.f32 %r61, 0f3F800000, 0f00000000, %p1; +$L__tmp1: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r62, %r54, %r53; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r63, 0f40000000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p6, %r63, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r64, %r61, %r63; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r65, 0f00000000, %r64, %p6; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r66, %r65, %r62, %r53; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r67, %r62, %r62; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r68, %r61, %r67; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r69, %r65, %r68, 0f00000000; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r70, %r55, %r66; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r71, 0f40400000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p7, %r71, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r72, %r61, %r71; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r73, 0f00000000, %r72, %p7; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r74, %r73, %r70, %r66; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r75, %r70, %r70; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r76, %r63, %r75; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r77, %r73, %r76, %r69; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r78, %r56, %r74; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r79, 0f40800000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p8, %r79, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r80, %r61, %r79; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r81, 0f00000000, %r80, %p8; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r82, %r81, %r78, %r74; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r83, %r78, %r78; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r84, %r71, %r83; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r85, %r81, %r84, %r77; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r86, %r57, %r82; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r87, 0f40A00000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p9, %r87, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r88, %r61, %r87; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r89, 0f00000000, %r88, %p9; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r90, %r89, %r86, %r82; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r91, %r86, %r86; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r92, %r79, %r91; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r93, %r89, %r92, %r85; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r94, %r58, %r90; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r95, 0f40C00000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p10, %r95, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r96, %r61, %r95; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r97, 0f00000000, %r96, %p10; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r98, %r97, %r94, %r90; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r99, %r94, %r94; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r100, %r87, %r99; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r101, %r97, %r100, %r93; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r102, %r59, %r98; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r103, 0f40E00000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p11, %r103, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r104, %r61, %r103; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r105, 0f00000000, %r104, %p11; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r106, %r105, %r102, %r98; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r107, %r102, %r102; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r108, %r95, %r107; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r109, %r105, %r108, %r101; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r110, %r60, %r106; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r111, 0f41000000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p12, %r111, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r112, %r61, %r111; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r113, 0f00000000, %r112, %p12; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r114, %r113, %r110, %r106; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r115, %r110, %r110; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r116, %r103, %r115; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r117, %r113, %r116, %r109; +$L__tmp2: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] + shfl.sync.bfly.b32 %r118, %r114, 16, 31, -1; + shfl.sync.bfly.b32 %r119, %r117, 16, 31, -1; + shfl.sync.bfly.b32 %r120, %r111, 16, 31, -1; +$L__tmp3: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r121, %r118, %r114; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r122, %r111, %r120; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p13, %r122, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r123, %r120, %r122; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r124, 0f00000000, %r123, %p13; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r125, %r124, %r121, %r114; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r126, %r117, %r119; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r127, %r121, %r121; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r128, %r111, %r127; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r129, %r124, %r128, %r126; +$L__tmp4: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] + shfl.sync.bfly.b32 %r130, %r125, 8, 31, -1; + shfl.sync.bfly.b32 %r131, %r129, 8, 31, -1; + shfl.sync.bfly.b32 %r132, %r122, 8, 31, -1; +$L__tmp5: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r133, %r130, %r125; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r134, %r122, %r132; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p14, %r134, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r135, %r132, %r134; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r136, 0f00000000, %r135, %p14; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r137, %r136, %r133, %r125; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r138, %r129, %r131; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r139, %r133, %r133; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r140, %r122, %r139; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r141, %r136, %r140, %r138; +$L__tmp6: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] + shfl.sync.bfly.b32 %r142, %r137, 4, 31, -1; + shfl.sync.bfly.b32 %r143, %r141, 4, 31, -1; + shfl.sync.bfly.b32 %r144, %r134, 4, 31, -1; +$L__tmp7: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r145, %r142, %r137; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r146, %r134, %r144; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p15, %r146, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r147, %r144, %r146; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r148, 0f00000000, %r147, %p15; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r149, %r148, %r145, %r137; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r150, %r141, %r143; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r151, %r145, %r145; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r152, %r134, %r151; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r153, %r148, %r152, %r150; +$L__tmp8: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] + shfl.sync.bfly.b32 %r154, %r149, 2, 31, -1; + shfl.sync.bfly.b32 %r155, %r153, 2, 31, -1; + shfl.sync.bfly.b32 %r156, %r146, 2, 31, -1; +$L__tmp9: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r157, %r154, %r149; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r158, %r146, %r156; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p16, %r158, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r159, %r156, %r158; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r160, 0f00000000, %r159, %p16; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r161, %r160, %r157, %r149; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r162, %r153, %r155; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r163, %r157, %r157; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r164, %r146, %r163; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r165, %r160, %r164, %r162; +$L__tmp10: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] + shfl.sync.bfly.b32 %r166, %r161, 1, 31, -1; + shfl.sync.bfly.b32 %r167, %r165, 1, 31, -1; + shfl.sync.bfly.b32 %r168, %r158, 1, 31, -1; +$L__tmp11: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r169, %r166, %r161; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r11, %r158, %r168; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p17, %r11, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r170, %r168, %r11; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r171, 0f00000000, %r170, %p17; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r7, %r171, %r169, %r161; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r172, %r165, %r167; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r173, %r169, %r169; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r174, %r158, %r173; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r9, %r171, %r174, %r172; +$L__tmp12: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] + setp.eq.b32 %p2, %r40, 0; + shr.u32 %r175, %r38, 3; + and.b32 %r176, %r175, 60; + mov.b32 %r177, global_smem; + add.s32 %r6, %r177, %r176; + // begin inline asm + @%p2 st.shared.b32 [ %r6 + 0 ], %r7; + // end inline asm + add.s32 %r8, %r6, 64; + // begin inline asm + @%p2 st.shared.b32 [ %r8 + 0 ], %r9; + // end inline asm + add.s32 %r10, %r6, 128; + // begin inline asm + @%p2 st.shared.b32 [ %r10 + 0 ], %r11; + // end inline asm + bar.sync 0; + setp.lt.u32 %p3, %r39, 16; + shl.b32 %r178, %r39, 2; + add.s32 %r13, %r177, %r178; + // begin inline asm + @%p3 ld.shared.b32 %r12, [ %r13 + 0 ]; + // end inline asm + add.s32 %r15, %r13, 64; + // begin inline asm + @%p3 ld.shared.b32 %r14, [ %r15 + 0 ]; + // end inline asm + add.s32 %r17, %r13, 128; + // begin inline asm + @%p3 ld.shared.b32 %r16, [ %r17 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r179, %r12, 8, 31, -1; + shfl.sync.bfly.b32 %r180, %r14, 8, 31, -1; + shfl.sync.bfly.b32 %r181, %r16, 8, 31, -1; +$L__tmp13: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r182, %r179, %r12; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r183, %r16, %r181; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p18, %r183, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r184, %r181, %r183; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r185, 0f00000000, %r184, %p18; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r186, %r182, %r185, %r12; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r187, %r14, %r180; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r188, %r182, %r182; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r189, %r188, %r16; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r190, %r189, %r185, %r187; +$L__tmp14: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] + shfl.sync.bfly.b32 %r191, %r186, 4, 31, -1; + shfl.sync.bfly.b32 %r192, %r190, 4, 31, -1; + shfl.sync.bfly.b32 %r193, %r183, 4, 31, -1; +$L__tmp15: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r194, %r191, %r186; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r195, %r183, %r193; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p19, %r195, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r196, %r193, %r195; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r197, 0f00000000, %r196, %p19; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r198, %r194, %r197, %r186; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r199, %r190, %r192; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r200, %r194, %r194; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r201, %r183, %r200; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r202, %r197, %r201, %r199; +$L__tmp16: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] + shfl.sync.bfly.b32 %r203, %r198, 2, 31, -1; + shfl.sync.bfly.b32 %r204, %r202, 2, 31, -1; + shfl.sync.bfly.b32 %r205, %r195, 2, 31, -1; +$L__tmp17: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r206, %r203, %r198; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r207, %r195, %r205; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p20, %r207, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r208, %r205, %r207; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r209, 0f00000000, %r208, %p20; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r210, %r206, %r209, %r198; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r211, %r202, %r204; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r212, %r206, %r206; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r213, %r195, %r212; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r214, %r209, %r213, %r211; +$L__tmp18: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] + shfl.sync.bfly.b32 %r215, %r210, 1, 31, -1; + shfl.sync.bfly.b32 %r216, %r214, 1, 31, -1; + shfl.sync.bfly.b32 %r217, %r207, 1, 31, -1; +$L__tmp19: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + sub.f32 %r218, %r215, %r210; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r20, %r207, %r217; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + setp.eq.f32 %p21, %r20, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + div.full.f32 %r219, %r217, %r20; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + selp.f32 %r220, 0f00000000, %r219, %p21; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r18, %r218, %r220, %r210; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + add.f32 %r221, %r214, %r216; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r222, %r218, %r218; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + mul.f32 %r223, %r207, %r222; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ] + fma.rn.f32 %r19, %r220, %r223, %r221; +$L__tmp20: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] + and.b32 %r224, %r38, 15; + setp.eq.b32 %p22, %r224, 0; + and.pred %p4, %p3, %p22; + // begin inline asm + @%p4 st.shared.b32 [ %r13 + 0 ], %r18; + // end inline asm + // begin inline asm + @%p4 st.shared.b32 [ %r15 + 0 ], %r19; + // end inline asm + // begin inline asm + @%p4 st.shared.b32 [ %r17 + 0 ], %r20; + // end inline asm + bar.sync 0; + ld.shared.b32 %r225, [global_smem]; + ld.shared.b32 %r226, [global_smem+64]; +$L__tmp21: + .loc 1 57 34 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:34 + mul.wide.u32 %rd14, %r42, 2; + add.s64 %rd3, %rd10, %rd14; + .loc 1 57 41 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:41 + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0; + // end inline asm + mov.pred %p5, -1; + // begin inline asm + mov.u32 %r21, %r5; + mov.u32 %r22, %r5; + mov.u32 %r23, %r5; + mov.u32 %r24, %r5; + @%p5 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r21, %r22, %r23, %r24 }, [ %rd3 + 0 ], %rd4; + // end inline asm + .loc 1 58 52 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:52 + // begin inline asm + mov.u64 %rd5, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd5, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r25, %r5; + mov.u32 %r26, %r5; + mov.u32 %r27, %r5; + mov.u32 %r28, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r25, %r26, %r27, %r28 }, [ %rd1 + 0 ], %rd5; + // end inline asm + .loc 1 59 35 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:35 + add.s64 %rd6, %rd11, %rd14; + .loc 1 59 42 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:42 + // begin inline asm + mov.u64 %rd7, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd7, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r29, %r5; + mov.u32 %r30, %r5; + mov.u32 %r31, %r5; + mov.u32 %r32, %r5; + @%p5 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r29, %r30, %r31, %r32 }, [ %rd6 + 0 ], %rd7; + // end inline asm + mov.b32 %r227, 0f45800000; + .loc 1 65 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:65:24 + div.full.f32 %r228, %r226, %r227; + .loc 1 67 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:67:24 + add.f32 %r229, %r228, 0f358637BD; + .loc 1 68 32 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:68:32 + rsqrt.approx.ftz.f32 %r230, %r229; + .loc 1 73 29 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:73:29 + add.s64 %rd8, %rd12, %rd13; + .loc 1 58 114 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:114 + mov.b32 {%rs9, %rs10}, %r25; + cvt.f32.bf16 %r231, %rs10; + cvt.f32.bf16 %r232, %rs9; + .loc 1 63 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:63:24 + sub.f32 %r233, %r232, %r225; + sub.f32 %r234, %r231, %r225; + .loc 1 57 94 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:94 + mov.b32 {%rs11, %rs12}, %r21; + cvt.f32.bf16 %r235, %rs11; + cvt.f32.bf16 %r236, %rs12; + .loc 1 61 23 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:61:23 + add.f32 %r237, %r236, 0f3F800000; + add.f32 %r238, %r235, 0f3F800000; + .loc 1 59 95 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:95 + mov.b32 {%rs13, %rs14}, %r29; + cvt.f32.bf16 %r239, %rs14; + cvt.f32.bf16 %r240, %rs13; + .loc 1 69 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:69:24 + mul.f32 %r241, %r234, %r230; + mul.f32 %r242, %r233, %r230; + .loc 1 72 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:72:24 + fma.rn.f32 %r243, %r238, %r242, %r240; + fma.rn.f32 %r244, %r237, %r241, %r239; + .loc 1 73 53 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:73:53 + cvt.rn.bf16x2.f32 %r33, %r244, %r243; + .loc 1 58 114 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:114 + mov.b32 {%rs15, %rs16}, %r26; + cvt.f32.bf16 %r245, %rs16; + cvt.f32.bf16 %r246, %rs15; + .loc 1 63 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:63:24 + sub.f32 %r247, %r246, %r225; + sub.f32 %r248, %r245, %r225; + .loc 1 57 94 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:94 + mov.b32 {%rs17, %rs18}, %r22; + cvt.f32.bf16 %r249, %rs17; + cvt.f32.bf16 %r250, %rs18; + .loc 1 61 23 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:61:23 + add.f32 %r251, %r250, 0f3F800000; + add.f32 %r252, %r249, 0f3F800000; + .loc 1 59 95 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:95 + mov.b32 {%rs19, %rs20}, %r30; + cvt.f32.bf16 %r253, %rs20; + cvt.f32.bf16 %r254, %rs19; + .loc 1 69 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:69:24 + mul.f32 %r255, %r248, %r230; + mul.f32 %r256, %r247, %r230; + .loc 1 72 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:72:24 + fma.rn.f32 %r257, %r252, %r256, %r254; + fma.rn.f32 %r258, %r251, %r255, %r253; + .loc 1 73 53 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:73:53 + cvt.rn.bf16x2.f32 %r34, %r258, %r257; + .loc 1 58 114 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:114 + mov.b32 {%rs21, %rs22}, %r27; + cvt.f32.bf16 %r259, %rs22; + cvt.f32.bf16 %r260, %rs21; + .loc 1 63 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:63:24 + sub.f32 %r261, %r260, %r225; + sub.f32 %r262, %r259, %r225; + .loc 1 57 94 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:94 + mov.b32 {%rs23, %rs24}, %r23; + cvt.f32.bf16 %r263, %rs23; + cvt.f32.bf16 %r264, %rs24; + .loc 1 61 23 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:61:23 + add.f32 %r265, %r264, 0f3F800000; + add.f32 %r266, %r263, 0f3F800000; + .loc 1 59 95 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:95 + mov.b32 {%rs25, %rs26}, %r31; + cvt.f32.bf16 %r267, %rs26; + cvt.f32.bf16 %r268, %rs25; + .loc 1 69 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:69:24 + mul.f32 %r269, %r262, %r230; + mul.f32 %r270, %r261, %r230; + .loc 1 72 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:72:24 + fma.rn.f32 %r271, %r266, %r270, %r268; + fma.rn.f32 %r272, %r265, %r269, %r267; + .loc 1 73 53 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:73:53 + cvt.rn.bf16x2.f32 %r35, %r272, %r271; + .loc 1 58 114 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:114 + mov.b32 {%rs27, %rs28}, %r28; + cvt.f32.bf16 %r273, %rs28; + cvt.f32.bf16 %r274, %rs27; + .loc 1 63 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:63:24 + sub.f32 %r275, %r274, %r225; + sub.f32 %r276, %r273, %r225; + .loc 1 57 94 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:94 + mov.b32 {%rs29, %rs30}, %r24; + cvt.f32.bf16 %r277, %rs29; + cvt.f32.bf16 %r278, %rs30; + .loc 1 61 23 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:61:23 + add.f32 %r279, %r278, 0f3F800000; + add.f32 %r280, %r277, 0f3F800000; + .loc 1 59 95 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:95 + mov.b32 {%rs31, %rs32}, %r32; + cvt.f32.bf16 %r281, %rs32; + cvt.f32.bf16 %r282, %rs31; + .loc 1 69 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:69:24 + mul.f32 %r283, %r276, %r230; + mul.f32 %r284, %r275, %r230; + .loc 1 72 24 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:72:24 + fma.rn.f32 %r285, %r280, %r284, %r282; + fma.rn.f32 %r286, %r279, %r283, %r281; + .loc 1 73 53 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:73:53 + cvt.rn.bf16x2.f32 %r36, %r286, %r285; + // begin inline asm + @%p1 st.global.v4.b32 [ %rd8 + 0 ], { %r33, %r34, %r35, %r36 }; + // end inline asm + .loc 1 51 4 // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:51:4 + ret; +$L__tmp22: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 343 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x150 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 97 +.b8 118 +.b8 111 +.b8 97 +.b8 122 +.b8 54 +.b8 101 +.b8 55 +.b8 107 +.b8 98 +.b8 107 +.b8 53 +.b8 119 +.b8 113 +.b8 50 +.b8 110 +.b8 55 +.b8 118 +.b8 122 +.b8 54 +.b8 114 +.b8 120 +.b8 104 +.b8 99 +.b8 114 +.b8 119 +.b8 100 +.b8 117 +.b8 50 +.b8 116 +.b8 114 +.b8 97 +.b8 122 +.b8 101 +.b8 120 +.b8 117 +.b8 98 +.b8 100 +.b8 113 +.b8 53 +.b8 113 +.b8 119 +.b8 121 +.b8 118 +.b8 50 +.b8 97 +.b8 106 +.b8 109 +.b8 98 +.b8 107 +.b8 122 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 97 +.b8 118 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x2f DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 100 +.b8 100 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 110 +.b8 97 +.b8 116 +.b8 105 +.b8 118 +.b8 101 +.b8 95 +.b8 108 +.b8 97 +.b8 121 +.b8 101 +.b8 114 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x113:0x47 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x128:0x31 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp21 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 47 // DW_AT_call_line +.b8 79 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x140:0x18 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp20 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 243 // DW_AT_call_line +.b8 46 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.source b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.source new file mode 100644 index 0000000000000000000000000000000000000000..c27c9fc7765e7194273ba6079eb492e76752a0cd --- /dev/null +++ b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.source @@ -0,0 +1,420 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":18:0) +#loc72 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":216:0) +#loc85 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":133:0) +#loc89 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":242:0) +#loc91 = loc(unknown) +#loc94 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":230:0) +#loc109 = loc("in_ptr0"(#loc)) +#loc110 = loc("in_ptr1"(#loc)) +#loc111 = loc("in_ptr2"(#loc)) +#loc112 = loc("out_ptr2"(#loc)) +#loc113 = loc("xnumel"(#loc)) +#loc114 = loc("r0_numel"(#loc)) +#loc171 = loc("value"(#loc72)) +#loc172 = loc("mean"(#loc72)) +#loc173 = loc("m2"(#loc72)) +#loc174 = loc("weight"(#loc72)) +#loc175 = loc("first_iteration"(#loc72)) +#loc185 = loc("input"(#loc85)) +#loc186 = loc("mean"(#loc89)) +#loc187 = loc("m2"(#loc89)) +#loc188 = loc("weight"(#loc89)) +#loc189 = loc("mean_1"(#loc94)) +#loc190 = loc("m2_1"(#loc94)) +#loc191 = loc("weight_1"(#loc94)) +#loc192 = loc("mean_2"(#loc94)) +#loc193 = loc("m2_2"(#loc94)) +#loc194 = loc("weight_2"(#loc94)) +#loc201 = loc("new_mean"(#loc171)) +module { + tt.func public @triton_red_fused_add_mul_native_layer_norm_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 256 : i32 loc(#loc115) + %r0_numel_1 = arith.constant 4096 : i32 loc(#loc116) + %xoffset = tt.get_program_id x : i32 loc(#loc117) + %xoffset_2 = arith.constant 1 : i32 loc(#loc118) + %xoffset_3 = arith.constant 1 : i32 loc(#loc118) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc118) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc119) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc120) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc121) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc121) + %xmask = arith.constant dense<256> : tensor<1x1xi32> loc(#loc122) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc122) + %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32> loc(#loc123) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32> -> tensor<1x4096xi32> loc(#loc124) + %tmp3_mean = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc125) + %tmp3_m2 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc126) + %tmp3_weight = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc127) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c4096_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %tmp3_weight_10:3 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%tmp3_mean_13 = %tmp3_mean, %tmp3_m2_14 = %tmp3_m2, %tmp3_weight_15 = %tmp3_weight) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4096xi32> loc(#loc129) + %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x4096xi32> loc(#loc129) + %r0_mask = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc130) + %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x4096xi32> loc(#loc130) + %tmp0 = arith.constant 4096 : i32 loc(#loc131) + %tmp0_18 = arith.constant 4096 : i32 loc(#loc131) + %tmp0_19 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc131) + %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc131) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc132) + %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x4096xi32> loc(#loc132) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc133) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc133) + %tmp0_25 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc134) + %tmp0_26 = arith.andi %r0_mask_17, %tmp0_25 : tensor<1x4096xi1> loc(#loc134) + %tmp0_27 = arith.constant 0.000000e+00 : f32 loc(#loc135) + %tmp0_28 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc135) + %tmp0_29 = arith.truncf %tmp0_28 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc135) + %tmp0_30 = tt.load %tmp0_24, %tmp0_26, %tmp0_29 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc135) + %tmp0_31 = arith.extf %tmp0_30 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc136) + %c0_i32_32 = arith.constant 0 : i32 loc(#loc23) + %9 = arith.cmpi eq, %r0_offset, %c0_i32_32 : i32 loc(#loc23) + %10:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_u1__(%tmp0_31, %tmp3_mean_13, %tmp3_m2_14, %tmp3_weight_15, %9) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>, i1) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) loc(#loc24) + %tmp3_mean_33 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc137) + %tmp3_mean_34 = arith.andi %r0_mask_17, %tmp3_mean_33 : tensor<1x4096xi1> loc(#loc137) + %tmp3_mean_35 = arith.select %tmp3_mean_34, %10#0, %tmp3_mean_13 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc138) + %tmp3_m2_36 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc139) + %tmp3_m2_37 = arith.andi %r0_mask_17, %tmp3_m2_36 : tensor<1x4096xi1> loc(#loc139) + %tmp3_m2_38 = arith.select %tmp3_m2_37, %10#1, %tmp3_m2_14 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc140) + %tmp3_weight_39 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc141) + %tmp3_weight_40 = arith.andi %r0_mask_17, %tmp3_weight_39 : tensor<1x4096xi1> loc(#loc141) + %tmp3_weight_41 = arith.select %tmp3_weight_40, %10#2, %tmp3_weight_15 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc142) + scf.yield %tmp3_mean_35, %tmp3_m2_38, %tmp3_weight_41 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc31) + } loc(#loc207) + %4:3 = tt.call @"torch._inductor.runtime.triton_helpers.welford__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S__(3,)cconstexpr_1_"(%tmp3_weight_10#0, %tmp3_weight_10#1, %tmp3_weight_10#2) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc32) + %tmp3 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc143) + %tmp7 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc144) + %tmp8 = tt.expand_dims %4#2 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc145) + %c0_i32_11 = arith.constant 0 : i32 loc(#loc36) + %c4096_i32_12 = arith.constant 4096 : i32 loc(#loc36) + %5 = arith.bitcast %c0_i32_11 : i32 to i32 loc(#loc36) + %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc36) + %7 = arith.bitcast %c4096_i32_12 : i32 to i32 loc(#loc36) + %8 = ub.poison : i32 loc(#loc36) + scf.for %r0_offset = %5 to %6 step %7 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4096xi32> loc(#loc146) + %r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x4096xi32> loc(#loc146) + %r0_mask = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc147) + %r0_mask_14 = arith.cmpi slt, %r0_index_13, %r0_mask : tensor<1x4096xi32> loc(#loc147) + %tmp9 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc148) + %tmp9_15 = tt.addptr %tmp9, %r0_index_13 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc148) + %tmp9_16 = arith.constant 0.000000e+00 : f32 loc(#loc149) + %tmp9_17 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc149) + %tmp9_18 = arith.truncf %tmp9_17 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc149) + %tmp9_19 = tt.load %tmp9_15, %r0_mask_14, %tmp9_18 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc149) + %tmp9_20 = arith.extf %tmp9_19 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc150) + %tmp12 = arith.constant 4096 : i32 loc(#loc151) + %tmp12_21 = arith.constant 4096 : i32 loc(#loc151) + %tmp12_22 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc151) + %tmp12_23 = arith.muli %tmp12_22, %xindex_7 : tensor<1x1xi32> loc(#loc151) + %tmp12_24 = tt.broadcast %tmp12_23 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc152) + %tmp12_25 = arith.addi %r0_index_13, %tmp12_24 : tensor<1x4096xi32> loc(#loc152) + %tmp12_26 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc153) + %tmp12_27 = tt.addptr %tmp12_26, %tmp12_25 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc153) + %tmp12_28 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc154) + %tmp12_29 = arith.andi %r0_mask_14, %tmp12_28 : tensor<1x4096xi1> loc(#loc154) + %tmp12_30 = arith.constant 0.000000e+00 : f32 loc(#loc155) + %tmp12_31 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc155) + %tmp12_32 = arith.truncf %tmp12_31 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc155) + %tmp12_33 = tt.load %tmp12_27, %tmp12_29, %tmp12_32 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr> loc(#loc155) + %tmp12_34 = arith.extf %tmp12_33 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc156) + %tmp23 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc157) + %tmp23_35 = tt.addptr %tmp23, %r0_index_13 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc157) + %tmp23_36 = arith.constant 0.000000e+00 : f32 loc(#loc158) + %tmp23_37 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc158) + %tmp23_38 = arith.truncf %tmp23_37 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc158) + %tmp23_39 = tt.load %tmp23_35, %r0_mask_14, %tmp23_38 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc158) + %tmp23_40 = arith.extf %tmp23_39 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc159) + %tmp10 = arith.constant 1.000000e+00 : f32 loc(#loc160) + %tmp11 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc161) + %tmp11_41 = arith.addf %tmp9_20, %tmp11 : tensor<1x4096xf32> loc(#loc161) + %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc162) + %tmp14_42 = arith.subf %tmp12_34, %tmp14 : tensor<1x4096xf32> loc(#loc162) + %tmp15 = arith.constant 4.096000e+03 : f32 loc(#loc163) + %tmp16 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc164) + %tmp16_43 = arith.divf %tmp7, %tmp16 : tensor<1x1xf32> loc(#loc164) + %tmp17 = arith.constant 9.99999997E-7 : f32 loc(#loc165) + %tmp18 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc166) + %tmp18_44 = arith.addf %tmp16_43, %tmp18 : tensor<1x1xf32> loc(#loc166) + %tmp19 = tt.extern_elementwise %tmp18_44 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc167) + %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc168) + %tmp20_45 = arith.mulf %tmp14_42, %tmp20 : tensor<1x4096xf32> loc(#loc168) + %tmp22 = arith.mulf %tmp11_41, %tmp20_45 : tensor<1x4096xf32> loc(#loc169) + %tmp24 = arith.addf %tmp22, %tmp23_40 : tensor<1x4096xf32> loc(#loc170) + %c4096_i32_46 = arith.constant 4096 : i32 loc(#loc62) + %c4096_i32_47 = arith.constant 4096 : i32 loc(#loc62) + %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc62) + %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc62) + %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc63) + %11 = arith.addi %r0_index_13, %10 : tensor<1x4096xi32> loc(#loc63) + %12 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc64) + %13 = tt.addptr %12, %11 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc64) + %14 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc65) + %15 = arith.andi %r0_mask_14, %14 : tensor<1x4096xi1> loc(#loc65) + %16 = arith.truncf %tmp24 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc66) + tt.store %13, %16, %15 : tensor<1x4096x!tt.ptr> loc(#loc66) + } loc(#loc36) + tt.return loc(#loc67) + } loc(#loc) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() -> tensor<1x4096xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc69) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc69) + tt.return %cst_0 : tensor<1x4096xf32> loc(#loc70) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x4096xf32> loc(#loc71) + tt.return %0 : tensor<1x4096xf32> loc(#loc71) + } loc(#loc68) + tt.func private @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_u1__(%new_mean: tensor<1x4096xf32> loc("new_mean"(#loc171)), %mean: tensor<1x4096xf32> loc("mean"(#loc72)), %m2: tensor<1x4096xf32> loc("m2"(#loc72)), %weight: tensor<1x4096xf32> loc("weight"(#loc72)), %first_iteration: i1 loc("first_iteration"(#loc72))) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) attributes {noinline = false} { + %0:3 = scf.if %first_iteration -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) { + %new_weight = arith.constant 1.000000e+00 : f32 loc(#loc176) + %new_weight_0 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc202) + %new_m2 = tt.call @triton.language.standard.zeros_like__fp32S1_4096S__(%m2) : (tensor<1x4096xf32>) -> tensor<1x4096xf32> loc(#loc203) + scf.yield %new_m2, %new_mean, %new_weight_0 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc203) + } else { + %delta = arith.subf %new_mean, %mean : tensor<1x4096xf32> loc(#loc178) + %new_weight = arith.constant 1 : i32 loc(#loc179) + %new_weight_0 = arith.constant 1.000000e+00 : f32 loc(#loc179) + %new_weight_1 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc179) + %new_weight_2 = arith.addf %weight, %new_weight_1 : tensor<1x4096xf32> loc(#loc204) + %new_mean_3 = arith.divf %delta, %new_weight_2 : tensor<1x4096xf32> loc(#loc180) + %new_mean_4 = arith.addf %mean, %new_mean_3 : tensor<1x4096xf32> loc(#loc205) + %new_m2 = arith.subf %new_mean, %new_mean_4 : tensor<1x4096xf32> loc(#loc182) + %new_m2_5 = arith.mulf %delta, %new_m2 : tensor<1x4096xf32> loc(#loc183) + %new_m2_6 = arith.addf %m2, %new_m2_5 : tensor<1x4096xf32> loc(#loc206) + scf.yield %new_m2_6, %new_mean_4, %new_weight_2 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc184) + } loc(#loc73) + tt.return %0#1, %0#0, %0#2 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc83) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x4096xf32> loc(#loc84) + %2 = ub.poison : tensor<1x4096xf32> loc(#loc84) + %3 = ub.poison : tensor<1x4096xf32> loc(#loc84) + tt.return %1, %2, %3 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc84) + } loc(#loc72) + tt.func private @triton.language.standard.zeros_like__fp32S1_4096S__(%input: tensor<1x4096xf32> loc("input"(#loc85))) -> tensor<1x4096xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc86) + tt.return %0 : tensor<1x4096xf32> loc(#loc87) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x4096xf32> loc(#loc88) + tt.return %1 : tensor<1x4096xf32> loc(#loc88) + } loc(#loc85) + tt.func private @"torch._inductor.runtime.triton_helpers.welford__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S__(3,)cconstexpr_1_"(%mean: tensor<1x4096xf32> loc("mean"(#loc89)), %m2: tensor<1x4096xf32> loc("m2"(#loc89)), %weight: tensor<1x4096xf32> loc("weight"(#loc89))) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} { + %0:3 = "tt.reduce"(%mean, %m2, %weight) <{axis = 1 : i32}> ({ + ^bb0(%arg3: f32 loc(unknown), %arg4: f32 loc(unknown), %arg5: f32 loc(unknown), %arg6: f32 loc(unknown), %arg7: f32 loc(unknown), %arg8: f32 loc(unknown)): + %4:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (f32, f32, f32, f32, f32, f32) -> (f32, f32, f32) loc(#loc90) + tt.reduce.return %4#0, %4#1, %4#2 : f32, f32, f32 loc(#loc90) + }) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc90) + tt.return %0#0, %0#1, %0#2 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc92) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc93) + %2 = ub.poison : tensor<1xf32> loc(#loc93) + %3 = ub.poison : tensor<1xf32> loc(#loc93) + tt.return %1, %2, %3 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc93) + } loc(#loc89) + tt.func private @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%mean_1: f32 loc("mean_1"(#loc94)), %m2_1: f32 loc("m2_1"(#loc94)), %weight_1: f32 loc("weight_1"(#loc94)), %mean_2: f32 loc("mean_2"(#loc94)), %m2_2: f32 loc("m2_2"(#loc94)), %weight_2: f32 loc("weight_2"(#loc94))) -> (f32, f32, f32) attributes {noinline = false} { + %delta = arith.subf %mean_2, %mean_1 : f32 loc(#loc195) + %new_weight = arith.addf %weight_1, %weight_2 : f32 loc(#loc196) + %w2_over_w = arith.constant 0.000000e+00 : f32 loc(#loc197) + %w2_over_w_0 = arith.cmpf oeq, %new_weight, %w2_over_w : f32 loc(#loc197) + %w2_over_w_1 = arith.divf %weight_2, %new_weight : f32 loc(#loc198) + %w2_over_w_2 = arith.constant 0.000000e+00 : f32 loc(#loc199) + %w2_over_w_3 = arith.constant 0.000000e+00 : f32 loc(#loc199) + %w2_over_w_4 = arith.select %w2_over_w_0, %w2_over_w_3, %w2_over_w_1 : f32 loc(#loc199) + %0 = arith.mulf %delta, %w2_over_w_4 : f32 loc(#loc100) + %1 = arith.addf %mean_1, %0 : f32 loc(#loc101) + %2 = arith.addf %m2_1, %m2_2 : f32 loc(#loc102) + %3 = arith.mulf %delta, %delta : f32 loc(#loc103) + %4 = arith.mulf %3, %weight_1 : f32 loc(#loc104) + %5 = arith.mulf %4, %w2_over_w_4 : f32 loc(#loc105) + %6 = arith.addf %2, %5 : f32 loc(#loc106) + tt.return %1, %6, %new_weight : f32, f32, f32 loc(#loc107) + ^bb1: // no predecessors + %7 = ub.poison : f32 loc(#loc108) + %8 = ub.poison : f32 loc(#loc108) + %9 = ub.poison : f32 loc(#loc108) + tt.return %7, %8, %9 : f32, f32, f32 loc(#loc108) + } loc(#loc94) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":25:21) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":29:45) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":30:43) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":31:47) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:46) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:34) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:61) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:51) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:112) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":42:62) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":42:51) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":44:39) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":44:62) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":45:37) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":45:58) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":46:41) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":46:66) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":46:8) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":47:79) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":48:16) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":49:16) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":50:16) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":51:43) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":52:31) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":53:29) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:34) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:41) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:94) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:47) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:42) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:35) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:62) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:52) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:114) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:35) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:42) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:95) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":60:16) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":61:23) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":63:24) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":64:16) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":65:24) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":66:16) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":67:24) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":68:32) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":69:24) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":71:24) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":72:24) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:41) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:36) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:29) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:63) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:53) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":51:4) +#loc68 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":120:0) +#loc69 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:31) +#loc70 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:11) +#loc71 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:4) +#loc73 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7) +#loc74 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":218:46) +#loc75 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31) +#loc76 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24) +#loc77 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30) +#loc78 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34) +#loc79 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26) +#loc80 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39) +#loc81 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31) +#loc82 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22) +#loc83 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:11) +#loc84 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:4) +#loc86 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:30) +#loc87 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:11) +#loc88 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:4) +#loc90 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc92 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:11) +#loc93 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:4) +#loc95 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc96 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc97 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc98 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc99 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc100 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc101 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc102 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc103 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc104 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc105 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc106 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc107 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:11) +#loc108 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:4) +#loc115 = loc("xnumel"(#loc1)) +#loc116 = loc("r0_numel"(#loc2)) +#loc117 = loc("xoffset"(#loc3)) +#loc118 = loc("xoffset"(#loc4)) +#loc119 = loc("xindex"(#loc5)) +#loc120 = loc("xindex"(#loc6)) +#loc121 = loc("xindex"(#loc7)) +#loc122 = loc("xmask"(#loc8)) +#loc123 = loc("r0_base"(#loc9)) +#loc124 = loc("r0_base"(#loc10)) +#loc125 = loc("tmp3_mean"(#loc11)) +#loc126 = loc("tmp3_m2"(#loc12)) +#loc127 = loc("tmp3_weight"(#loc13)) +#loc128 = loc("tmp3_mean"(#loc14)) +#loc129 = loc("r0_index"(#loc15)) +#loc130 = loc("r0_mask"(#loc16)) +#loc131 = loc("tmp0"(#loc17)) +#loc132 = loc("tmp0"(#loc18)) +#loc133 = loc("tmp0"(#loc19)) +#loc134 = loc("tmp0"(#loc20)) +#loc135 = loc("tmp0"(#loc21)) +#loc136 = loc("tmp0"(#loc22)) +#loc137 = loc("tmp3_mean"(#loc25)) +#loc138 = loc("tmp3_mean"(#loc26)) +#loc139 = loc("tmp3_m2"(#loc27)) +#loc140 = loc("tmp3_m2"(#loc28)) +#loc141 = loc("tmp3_weight"(#loc29)) +#loc142 = loc("tmp3_weight"(#loc30)) +#loc143 = loc("tmp3"(#loc33)) +#loc144 = loc("tmp7"(#loc34)) +#loc145 = loc("tmp8"(#loc35)) +#loc146 = loc("r0_index"(#loc37)) +#loc147 = loc("r0_mask"(#loc38)) +#loc148 = loc("tmp9"(#loc39)) +#loc149 = loc("tmp9"(#loc40)) +#loc150 = loc("tmp9"(#loc41)) +#loc151 = loc("tmp12"(#loc42)) +#loc152 = loc("tmp12"(#loc43)) +#loc153 = loc("tmp12"(#loc44)) +#loc154 = loc("tmp12"(#loc45)) +#loc155 = loc("tmp12"(#loc46)) +#loc156 = loc("tmp12"(#loc47)) +#loc157 = loc("tmp23"(#loc48)) +#loc158 = loc("tmp23"(#loc49)) +#loc159 = loc("tmp23"(#loc50)) +#loc160 = loc("tmp10"(#loc51)) +#loc161 = loc("tmp11"(#loc52)) +#loc162 = loc("tmp14"(#loc53)) +#loc163 = loc("tmp15"(#loc54)) +#loc164 = loc("tmp16"(#loc55)) +#loc165 = loc("tmp17"(#loc56)) +#loc166 = loc("tmp18"(#loc57)) +#loc167 = loc("tmp19"(#loc58)) +#loc168 = loc("tmp20"(#loc59)) +#loc169 = loc("tmp22"(#loc60)) +#loc170 = loc("tmp24"(#loc61)) +#loc176 = loc("new_weight"(#loc74)) +#loc177 = loc("new_m2"(#loc75)) +#loc178 = loc("delta"(#loc76)) +#loc179 = loc("new_weight"(#loc77)) +#loc180 = loc("new_mean"(#loc78)) +#loc181 = loc("new_mean"(#loc79)) +#loc182 = loc("new_m2"(#loc80)) +#loc183 = loc("new_m2"(#loc81)) +#loc184 = loc("new_m2"(#loc82)) +#loc195 = loc("delta"(#loc95)) +#loc196 = loc("new_weight"(#loc96)) +#loc197 = loc("w2_over_w"(#loc97)) +#loc198 = loc("w2_over_w"(#loc98)) +#loc199 = loc("w2_over_w"(#loc99)) +#loc200 = loc("tmp3_m2"(#loc128)) +#loc202 = loc("new_weight"(#loc176)) +#loc203 = loc("new_m2"(#loc177)) +#loc204 = loc("new_weight"(#loc179)) +#loc205 = loc("new_mean"(#loc181)) +#loc206 = loc("new_m2"(#loc184)) +#loc207 = loc("tmp3_weight"(#loc200)) diff --git a/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.ttgir b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..c13382babf543129791f4fd5f0e7fcc3ced3b72b --- /dev/null +++ b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.ttgir @@ -0,0 +1,179 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":18:0) +#loc1 = loc(unknown) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":47:79) +#loc49 = loc("in_ptr0"(#loc)) +#loc50 = loc("in_ptr1"(#loc)) +#loc51 = loc("in_ptr2"(#loc)) +#loc52 = loc("out_ptr2"(#loc)) +#loc53 = loc("xnumel"(#loc)) +#loc54 = loc("r0_numel"(#loc)) +#loc68 = loc(callsite(#loc1 at #loc15)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_add_mul_native_layer_norm_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4096> : tensor<1x4096xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xbf16, #blocked> loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c256_i32 = arith.constant 256 : i32 loc(#loc1) + %cst_1 = arith.constant 0.000000e+00 : f32 loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<4.096000e+03> : tensor<1x1xf32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc55) + %xmask = arith.cmpi slt, %xoffset, %c256_i32 : i32 loc(#loc56) + %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc57) + %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4096xi32, #blocked> loc(#loc57) + %r0_mask = arith.cmpi slt, %r0_base_6, %cst : tensor<1x4096xi32, #blocked> loc(#loc58) + %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc59) + %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x4096xi32, #blocked> loc(#loc92) + %tmp0_8 = arith.addi %r0_base_6, %tmp0_7 : tensor<1x4096xi32, #blocked> loc(#loc60) + %tmp0_9 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc61) + %tmp0_10 = tt.addptr %tmp0_9, %tmp0_8 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc61) + %tmp0_11 = tt.splat %xmask : i1 -> tensor<1x4096xi1, #blocked> loc(#loc93) + %tmp0_12 = arith.andi %r0_mask, %tmp0_11 : tensor<1x4096xi1, #blocked> loc(#loc62) + %tmp0_13 = tt.load %tmp0_10, %tmp0_12, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr, #blocked> loc(#loc63) + %tmp0_14 = arith.extf %tmp0_13 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc64) + %tmp3_mean = arith.select %tmp0_12, %tmp0_14, %cst_2 : tensor<1x4096xi1, #blocked>, tensor<1x4096xf32, #blocked> loc(#loc65) + %tmp3_weight = arith.select %tmp0_12, %cst_5, %cst_2 : tensor<1x4096xi1, #blocked>, tensor<1x4096xf32, #blocked> loc(#loc66) + %0:3 = "tt.reduce"(%tmp3_mean, %cst_2, %tmp3_weight) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc15)), %arg7: f32 loc(callsite(#loc1 at #loc15)), %arg8: f32 loc(callsite(#loc1 at #loc15)), %arg9: f32 loc(callsite(#loc1 at #loc15)), %arg10: f32 loc(callsite(#loc1 at #loc15)), %arg11: f32 loc(callsite(#loc1 at #loc15))): + %delta = arith.subf %arg9, %arg6 : f32 loc(#loc94) + %new_weight = arith.addf %arg8, %arg11 : f32 loc(#loc95) + %w2_over_w = arith.cmpf oeq, %new_weight, %cst_1 : f32 loc(#loc96) + %w2_over_w_24 = arith.divf %arg11, %new_weight : f32 loc(#loc97) + %w2_over_w_25 = arith.select %w2_over_w, %cst_1, %w2_over_w_24 : f32 loc(#loc98) + %4 = arith.mulf %delta, %w2_over_w_25 : f32 loc(#loc99) + %5 = arith.addf %arg6, %4 : f32 loc(#loc100) + %6 = arith.addf %arg7, %arg10 : f32 loc(#loc101) + %7 = arith.mulf %delta, %delta : f32 loc(#loc102) + %8 = arith.mulf %7, %arg8 : f32 loc(#loc103) + %9 = arith.mulf %8, %w2_over_w_25 : f32 loc(#loc104) + %10 = arith.addf %6, %9 : f32 loc(#loc105) + tt.reduce.return %5, %10, %new_weight : f32, f32, f32 loc(#loc67) + }) : (tensor<1x4096xf32, #blocked>, tensor<1x4096xf32, #blocked>, tensor<1x4096xf32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc67) + %tmp3 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc74) + %tmp7 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc75) + %tmp9 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc76) + %tmp9_15 = tt.addptr %tmp9, %r0_base_6 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc76) + %tmp9_16 = tt.load %tmp9_15, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr, #blocked> loc(#loc77) + %tmp9_17 = arith.extf %tmp9_16 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc78) + %tmp12 = tt.load %tmp0_10, %tmp0_12, %cst_0 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr, #blocked> loc(#loc79) + %tmp12_18 = arith.extf %tmp12 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc80) + %tmp23 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc81) + %tmp23_19 = tt.addptr %tmp23, %r0_base_6 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc81) + %tmp23_20 = tt.load %tmp23_19, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr, #blocked> loc(#loc82) + %tmp23_21 = arith.extf %tmp23_20 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc83) + %tmp11 = arith.addf %tmp9_17, %cst_5 : tensor<1x4096xf32, #blocked> loc(#loc84) + %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32, #blocked> -> tensor<1x4096xf32, #blocked> loc(#loc85) + %tmp14_22 = arith.subf %tmp12_18, %tmp14 : tensor<1x4096xf32, #blocked> loc(#loc85) + %tmp16 = arith.divf %tmp7, %cst_4 : tensor<1x1xf32, #blocked> loc(#loc86) + %tmp18 = arith.addf %tmp16, %cst_3 : tensor<1x1xf32, #blocked> loc(#loc87) + %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked>) -> tensor<1x1xf32, #blocked> loc(#loc88) + %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32, #blocked> -> tensor<1x4096xf32, #blocked> loc(#loc89) + %tmp20_23 = arith.mulf %tmp14_22, %tmp20 : tensor<1x4096xf32, #blocked> loc(#loc89) + %tmp22 = arith.mulf %tmp11, %tmp20_23 : tensor<1x4096xf32, #blocked> loc(#loc90) + %tmp24 = arith.addf %tmp22, %tmp23_21 : tensor<1x4096xf32, #blocked> loc(#loc91) + %1 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc46) + %2 = tt.addptr %1, %tmp0_8 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc46) + %3 = arith.truncf %tmp24 : tensor<1x4096xf32, #blocked> to tensor<1x4096xbf16, #blocked> loc(#loc47) + tt.store %2, %3, %tmp0_12 : tensor<1x4096x!tt.ptr, #blocked> loc(#loc47) + tt.return loc(#loc48) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":25:21) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":26:37) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":34:29) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:46) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:41) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:34) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:61) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:51) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:112) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":44:62) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":46:66) +#loc14 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc16 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc17 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc18 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc19 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc21 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc23 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":48:16) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":49:16) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:34) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:41) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:94) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:52) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:114) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:35) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:42) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:95) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":61:23) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":63:24) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":65:24) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":67:24) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":68:32) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":69:24) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":71:24) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":72:24) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:29) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:53) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":51:4) +#loc55 = loc("xoffset"(#loc2)) +#loc56 = loc("xmask"(#loc3)) +#loc57 = loc("r0_base"(#loc4)) +#loc58 = loc("r0_mask"(#loc5)) +#loc59 = loc("tmp0"(#loc6)) +#loc60 = loc("tmp0"(#loc7)) +#loc61 = loc("tmp0"(#loc8)) +#loc62 = loc("tmp0"(#loc9)) +#loc63 = loc("tmp0"(#loc10)) +#loc64 = loc("tmp0"(#loc11)) +#loc65 = loc("tmp3_mean"(#loc12)) +#loc66 = loc("tmp3_weight"(#loc13)) +#loc67 = loc(callsite(#loc14 at #loc15)) +#loc69 = loc("delta"(#loc16)) +#loc70 = loc("new_weight"(#loc17)) +#loc71 = loc("w2_over_w"(#loc18)) +#loc72 = loc("w2_over_w"(#loc19)) +#loc73 = loc("w2_over_w"(#loc20)) +#loc74 = loc("tmp3"(#loc28)) +#loc75 = loc("tmp7"(#loc29)) +#loc76 = loc("tmp9"(#loc30)) +#loc77 = loc("tmp9"(#loc31)) +#loc78 = loc("tmp9"(#loc32)) +#loc79 = loc("tmp12"(#loc33)) +#loc80 = loc("tmp12"(#loc34)) +#loc81 = loc("tmp23"(#loc35)) +#loc82 = loc("tmp23"(#loc36)) +#loc83 = loc("tmp23"(#loc37)) +#loc84 = loc("tmp11"(#loc38)) +#loc85 = loc("tmp14"(#loc39)) +#loc86 = loc("tmp16"(#loc40)) +#loc87 = loc("tmp18"(#loc41)) +#loc88 = loc("tmp19"(#loc42)) +#loc89 = loc("tmp20"(#loc43)) +#loc90 = loc("tmp22"(#loc44)) +#loc91 = loc("tmp24"(#loc45)) +#loc92 = loc(fused[#loc60, #loc59]) +#loc93 = loc(fused[#loc62, #loc56]) +#loc94 = loc(callsite(#loc69 at #loc67)) +#loc95 = loc(callsite(#loc70 at #loc67)) +#loc96 = loc(callsite(#loc71 at #loc67)) +#loc97 = loc(callsite(#loc72 at #loc67)) +#loc98 = loc(callsite(#loc73 at #loc67)) +#loc99 = loc(callsite(#loc21 at #loc67)) +#loc100 = loc(callsite(#loc22 at #loc67)) +#loc101 = loc(callsite(#loc23 at #loc67)) +#loc102 = loc(callsite(#loc24 at #loc67)) +#loc103 = loc(callsite(#loc25 at #loc67)) +#loc104 = loc(callsite(#loc26 at #loc67)) +#loc105 = loc(callsite(#loc27 at #loc67)) diff --git a/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.ttir b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..6a8f45fd734360fe0014f0077bcc158fdd7be25a --- /dev/null +++ b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.ttir @@ -0,0 +1,180 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":18:0) +#loc1 = loc(unknown) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":47:79) +#loc50 = loc("in_ptr0"(#loc)) +#loc51 = loc("in_ptr1"(#loc)) +#loc52 = loc("in_ptr2"(#loc)) +#loc53 = loc("out_ptr2"(#loc)) +#loc54 = loc("xnumel"(#loc)) +#loc55 = loc("r0_numel"(#loc)) +#loc57 = loc(callsite(#loc1 at #loc3)) +module { + tt.func public @triton_red_fused_add_mul_native_layer_norm_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %xmask = arith.constant 256 : i32 loc(#loc56) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc57) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x4096xbf16> loc(#loc1) + %cst_2 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc1) + %cst_3 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc1) + %cst_4 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc1) + %cst_5 = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc58) + %xmask_6 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc56) + %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32> loc(#loc59) + %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32> -> tensor<1x4096xi32> loc(#loc60) + %r0_mask = arith.cmpi slt, %r0_base_7, %cst_5 : tensor<1x4096xi32> loc(#loc61) + %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc62) + %tmp0_8 = tt.splat %tmp0 : i32 -> tensor<1x4096xi32> loc(#loc94) + %tmp0_9 = arith.addi %r0_base_7, %tmp0_8 : tensor<1x4096xi32> loc(#loc63) + %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc64) + %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc64) + %tmp0_12 = tt.splat %xmask_6 : i1 -> tensor<1x4096xi1> loc(#loc95) + %tmp0_13 = arith.andi %r0_mask, %tmp0_12 : tensor<1x4096xi1> loc(#loc65) + %tmp0_14 = tt.load %tmp0_11, %tmp0_13, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc66) + %tmp0_15 = arith.extf %tmp0_14 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc67) + %tmp3_mean = arith.select %tmp0_13, %tmp0_15, %cst_0 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc68) + %tmp3_weight = arith.select %tmp0_13, %cst_4, %cst_0 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc69) + %0:3 = "tt.reduce"(%tmp3_mean, %cst_0, %tmp3_weight) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc3)), %arg7: f32 loc(callsite(#loc1 at #loc3)), %arg8: f32 loc(callsite(#loc1 at #loc3)), %arg9: f32 loc(callsite(#loc1 at #loc3)), %arg10: f32 loc(callsite(#loc1 at #loc3)), %arg11: f32 loc(callsite(#loc1 at #loc3))): + %delta = arith.subf %arg9, %arg6 : f32 loc(#loc96) + %new_weight = arith.addf %arg8, %arg11 : f32 loc(#loc97) + %w2_over_w = arith.cmpf oeq, %new_weight, %cst : f32 loc(#loc98) + %w2_over_w_25 = arith.divf %arg11, %new_weight : f32 loc(#loc99) + %w2_over_w_26 = arith.select %w2_over_w, %cst, %w2_over_w_25 : f32 loc(#loc100) + %4 = arith.mulf %delta, %w2_over_w_26 : f32 loc(#loc101) + %5 = arith.addf %arg6, %4 : f32 loc(#loc102) + %6 = arith.addf %arg7, %arg10 : f32 loc(#loc103) + %7 = arith.mulf %delta, %delta : f32 loc(#loc104) + %8 = arith.mulf %7, %arg8 : f32 loc(#loc105) + %9 = arith.mulf %8, %w2_over_w_26 : f32 loc(#loc106) + %10 = arith.addf %6, %9 : f32 loc(#loc107) + tt.reduce.return %5, %10, %new_weight : f32, f32, f32 loc(#loc70) + }) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc70) + %tmp3 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc76) + %tmp7 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc77) + %tmp9 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc78) + %tmp9_16 = tt.addptr %tmp9, %r0_base_7 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc78) + %tmp9_17 = tt.load %tmp9_16, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc79) + %tmp9_18 = arith.extf %tmp9_17 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc80) + %tmp12 = tt.load %tmp0_11, %tmp0_13, %cst_1 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr> loc(#loc81) + %tmp12_19 = arith.extf %tmp12 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc82) + %tmp23 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc83) + %tmp23_20 = tt.addptr %tmp23, %r0_base_7 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc83) + %tmp23_21 = tt.load %tmp23_20, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc84) + %tmp23_22 = arith.extf %tmp23_21 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc85) + %tmp11 = arith.addf %tmp9_18, %cst_4 : tensor<1x4096xf32> loc(#loc86) + %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc87) + %tmp14_23 = arith.subf %tmp12_19, %tmp14 : tensor<1x4096xf32> loc(#loc87) + %tmp16 = arith.divf %tmp7, %cst_3 : tensor<1x1xf32> loc(#loc88) + %tmp18 = arith.addf %tmp16, %cst_2 : tensor<1x1xf32> loc(#loc89) + %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc90) + %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc91) + %tmp20_24 = arith.mulf %tmp14_23, %tmp20 : tensor<1x4096xf32> loc(#loc91) + %tmp22 = arith.mulf %tmp11, %tmp20_24 : tensor<1x4096xf32> loc(#loc92) + %tmp24 = arith.addf %tmp22, %tmp23_22 : tensor<1x4096xf32> loc(#loc93) + %1 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc47) + %2 = tt.addptr %1, %tmp0_9 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc47) + %3 = arith.truncf %tmp24 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc48) + tt.store %2, %3, %tmp0_13 : tensor<1x4096x!tt.ptr> loc(#loc48) + tt.return loc(#loc49) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":25:21) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":23:28) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":26:27) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":34:29) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:41) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:34) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:61) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:51) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:112) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":44:62) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":46:66) +#loc16 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc17 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc18 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc19 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc21 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc23 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc28 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":48:16) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":49:16) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:34) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:41) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:94) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:52) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:114) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:35) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:42) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:95) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":61:23) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":63:24) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":65:24) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":67:24) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":68:32) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":69:24) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":71:24) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":72:24) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:29) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:53) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":51:4) +#loc56 = loc("xmask"(#loc2)) +#loc58 = loc("xoffset"(#loc4)) +#loc59 = loc("r0_base"(#loc5)) +#loc60 = loc("r0_base"(#loc6)) +#loc61 = loc("r0_mask"(#loc7)) +#loc62 = loc("tmp0"(#loc8)) +#loc63 = loc("tmp0"(#loc9)) +#loc64 = loc("tmp0"(#loc10)) +#loc65 = loc("tmp0"(#loc11)) +#loc66 = loc("tmp0"(#loc12)) +#loc67 = loc("tmp0"(#loc13)) +#loc68 = loc("tmp3_mean"(#loc14)) +#loc69 = loc("tmp3_weight"(#loc15)) +#loc70 = loc(callsite(#loc16 at #loc3)) +#loc71 = loc("delta"(#loc17)) +#loc72 = loc("new_weight"(#loc18)) +#loc73 = loc("w2_over_w"(#loc19)) +#loc74 = loc("w2_over_w"(#loc20)) +#loc75 = loc("w2_over_w"(#loc21)) +#loc76 = loc("tmp3"(#loc29)) +#loc77 = loc("tmp7"(#loc30)) +#loc78 = loc("tmp9"(#loc31)) +#loc79 = loc("tmp9"(#loc32)) +#loc80 = loc("tmp9"(#loc33)) +#loc81 = loc("tmp12"(#loc34)) +#loc82 = loc("tmp12"(#loc35)) +#loc83 = loc("tmp23"(#loc36)) +#loc84 = loc("tmp23"(#loc37)) +#loc85 = loc("tmp23"(#loc38)) +#loc86 = loc("tmp11"(#loc39)) +#loc87 = loc("tmp14"(#loc40)) +#loc88 = loc("tmp16"(#loc41)) +#loc89 = loc("tmp18"(#loc42)) +#loc90 = loc("tmp19"(#loc43)) +#loc91 = loc("tmp20"(#loc44)) +#loc92 = loc("tmp22"(#loc45)) +#loc93 = loc("tmp24"(#loc46)) +#loc94 = loc(fused[#loc63, #loc62]) +#loc95 = loc(fused[#loc65, #loc56]) +#loc96 = loc(callsite(#loc71 at #loc70)) +#loc97 = loc(callsite(#loc72 at #loc70)) +#loc98 = loc(callsite(#loc73 at #loc70)) +#loc99 = loc(callsite(#loc74 at #loc70)) +#loc100 = loc(callsite(#loc75 at #loc70)) +#loc101 = loc(callsite(#loc22 at #loc70)) +#loc102 = loc(callsite(#loc23 at #loc70)) +#loc103 = loc(callsite(#loc24 at #loc70)) +#loc104 = loc(callsite(#loc25 at #loc70)) +#loc105 = loc(callsite(#loc26 at #loc70)) +#loc106 = loc(callsite(#loc27 at #loc70)) +#loc107 = loc(callsite(#loc28 at #loc70)) diff --git a/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/__grp__triton_red_fused_add_mul_native_layer_norm_0.json b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/__grp__triton_red_fused_add_mul_native_layer_norm_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f268f3763c21c386f0eb939b260d95dd168739d0 --- /dev/null +++ b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/__grp__triton_red_fused_add_mul_native_layer_norm_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_add_mul_native_layer_norm_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.source", "triton_red_fused_add_mul_native_layer_norm_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.ttir", "triton_red_fused_add_mul_native_layer_norm_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.ttgir", "triton_red_fused_add_mul_native_layer_norm_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.llir", "triton_red_fused_add_mul_native_layer_norm_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.ptx", "triton_red_fused_add_mul_native_layer_norm_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.cubin", "triton_red_fused_add_mul_native_layer_norm_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.json"}} \ No newline at end of file diff --git a/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.cubin b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..af3203563a16f6782f82d431acd4d7a46ad7e8a4 Binary files /dev/null and b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.cubin differ diff --git a/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.json b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.json new file mode 100644 index 0000000000000000000000000000000000000000..455cfa8ed72cf59c4db3a3e504679a7e88a65112 --- /dev/null +++ b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.json @@ -0,0 +1 @@ +{"hash": "1ce4869555d6e3fa5041ee363bf7d41f87484b63ba45978158c5d6328925821c", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 192, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_add_mul_native_layer_norm_0"} \ No newline at end of file diff --git a/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.llir b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..badd6098f71f776f3006457a8edf489897186498 --- /dev/null +++ b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.llir @@ -0,0 +1,620 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_add_mul_native_layer_norm_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 { +__nv_rsqrtf.exit: + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %12 = icmp samesign ult i32 %11, 2048, !dbg !9 + %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %14 = shl nuw nsw i32 %13, 2, !dbg !10 + %15 = and i32 %14, 2044, !dbg !10 + %16 = shl i32 %11, 12, !dbg !11 + %17 = zext nneg i32 %15 to i64, !dbg !12 + %18 = sext i32 %16 to i64, !dbg !12 + %19 = or disjoint i64 %17, %18, !dbg !13 + %20 = getelementptr bfloat, ptr addrspace(1) %0, i64 %19, !dbg !14 + %21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !15 + %22 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %20, i64 %21, i1 %12) #6, !dbg !15 + %23 = getelementptr bfloat, ptr addrspace(1) %1, i64 %17, !dbg !16 + %24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17 + %25 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %23, i64 %24, i1 true) #6, !dbg !17 + %26 = getelementptr bfloat, ptr addrspace(1) %2, i64 %19, !dbg !18 + %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !19 + %28 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %26, i64 %27, i1 %12) #6, !dbg !19 + %29 = extractvalue { i32, i32 } %25, 1, !dbg !17 + %30 = bitcast i32 %29 to <2 x bfloat>, !dbg !17 + %31 = extractvalue { i32, i32 } %28, 1, !dbg !19 + %32 = bitcast i32 %31 to <2 x bfloat>, !dbg !19 + %33 = extractvalue { i32, i32 } %22, 1, !dbg !15 + %34 = bitcast i32 %33 to <2 x bfloat>, !dbg !15 + %35 = extractvalue { i32, i32 } %25, 0, !dbg !17 + %36 = bitcast i32 %35 to <2 x bfloat>, !dbg !17 + %37 = extractvalue { i32, i32 } %28, 0, !dbg !19 + %38 = bitcast i32 %37 to <2 x bfloat>, !dbg !19 + %39 = extractvalue { i32, i32 } %22, 0, !dbg !15 + %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !15 + %41 = getelementptr bfloat, ptr addrspace(1) %5, i64 %19, !dbg !20 + %42 = fpext <2 x bfloat> %36 to <2 x float>, !dbg !21 + %43 = fpext <2 x bfloat> %38 to <2 x float>, !dbg !22 + %44 = fmul <2 x float> %42, %43, !dbg !23 + %45 = fpext <2 x bfloat> %40 to <2 x float>, !dbg !24 + %46 = fadd <2 x float> %44, %45, !dbg !25 + %47 = extractelement <2 x float> %46, i64 0, !dbg !26 + %48 = select i1 %12, float %47, float 0.000000e+00, !dbg !26 + %49 = extractelement <2 x float> %46, i64 1, !dbg !26 + %50 = select i1 %12, float %49, float 0.000000e+00, !dbg !26 + %51 = fptrunc <2 x float> %46 to <2 x bfloat>, !dbg !27 + %52 = fpext <2 x bfloat> %30 to <2 x float>, !dbg !21 + %53 = fpext <2 x bfloat> %32 to <2 x float>, !dbg !22 + %54 = fmul <2 x float> %52, %53, !dbg !23 + %55 = fpext <2 x bfloat> %34 to <2 x float>, !dbg !24 + %56 = fadd <2 x float> %54, %55, !dbg !25 + %57 = extractelement <2 x float> %56, i64 0, !dbg !26 + %58 = select i1 %12, float %57, float 0.000000e+00, !dbg !26 + %59 = extractelement <2 x float> %56, i64 1, !dbg !26 + %60 = select i1 %12, float %59, float 0.000000e+00, !dbg !26 + %61 = fptrunc <2 x float> %56 to <2 x bfloat>, !dbg !27 + %62 = bitcast <2 x bfloat> %51 to i32, !dbg !27 + %63 = bitcast <2 x bfloat> %61 to i32, !dbg !27 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %62, i32 %63, ptr addrspace(1) %41, i1 %12) #6, !dbg !27 + %64 = or disjoint i64 %17, 2048, !dbg !28 + %65 = or disjoint i64 %64, %18, !dbg !13 + %66 = getelementptr bfloat, ptr addrspace(1) %0, i64 %65, !dbg !14 + %67 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !15 + %68 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %66, i64 %67, i1 %12) #6, !dbg !15 + %69 = extractvalue { i32, i32 } %68, 0, !dbg !15 + %70 = bitcast i32 %69 to <2 x bfloat>, !dbg !15 + %71 = extractvalue { i32, i32 } %68, 1, !dbg !15 + %72 = bitcast i32 %71 to <2 x bfloat>, !dbg !15 + %73 = getelementptr bfloat, ptr addrspace(1) %1, i64 %64, !dbg !16 + %74 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17 + %75 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %73, i64 %74, i1 true) #6, !dbg !17 + %76 = extractvalue { i32, i32 } %75, 0, !dbg !17 + %77 = bitcast i32 %76 to <2 x bfloat>, !dbg !17 + %78 = extractvalue { i32, i32 } %75, 1, !dbg !17 + %79 = bitcast i32 %78 to <2 x bfloat>, !dbg !17 + %80 = getelementptr bfloat, ptr addrspace(1) %2, i64 %65, !dbg !18 + %81 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !19 + %82 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %80, i64 %81, i1 %12) #6, !dbg !19 + %83 = extractvalue { i32, i32 } %82, 0, !dbg !19 + %84 = bitcast i32 %83 to <2 x bfloat>, !dbg !19 + %85 = extractvalue { i32, i32 } %82, 1, !dbg !19 + %86 = bitcast i32 %85 to <2 x bfloat>, !dbg !19 + %87 = select i1 %12, float 2.000000e+00, float 1.000000e+00, !dbg !29 + %88 = select i1 %12, float 2.000000e+00, float 0.000000e+00, !dbg !29 + %89 = select i1 %12, float 2.000000e+00, float 0.000000e+00, !dbg !29 + %90 = select i1 %12, float 2.000000e+00, float 0.000000e+00, !dbg !29 + %91 = select i1 %12, float 2.000000e+00, float 0.000000e+00, !dbg !29 + %92 = getelementptr bfloat, ptr addrspace(1) %5, i64 %65, !dbg !20 + %93 = fpext <2 x bfloat> %70 to <2 x float>, !dbg !24 + %94 = fpext <2 x bfloat> %77 to <2 x float>, !dbg !21 + %95 = fpext <2 x bfloat> %84 to <2 x float>, !dbg !22 + %96 = fmul <2 x float> %94, %95, !dbg !23 + %97 = fadd <2 x float> %96, %93, !dbg !25 + %98 = extractelement <2 x float> %97, i64 0, !dbg !30 + %99 = fsub float %98, %48, !dbg !35 + %100 = tail call float @llvm.nvvm.div.full(float %99, float %87), !dbg !36 + %101 = fadd float %48, %100, !dbg !37 + %102 = fsub float %98, %101, !dbg !30 + %103 = fmul float %99, %102, !dbg !38 + %104 = fadd float %103, 0.000000e+00, !dbg !39 + %105 = extractelement <2 x float> %97, i64 1, !dbg !30 + %106 = fsub float %105, %50, !dbg !35 + %107 = tail call float @llvm.nvvm.div.full(float %106, float %87), !dbg !36 + %108 = fadd float %50, %107, !dbg !37 + %109 = fsub float %105, %108, !dbg !30 + %110 = fmul float %106, %109, !dbg !38 + %111 = fadd float %110, 0.000000e+00, !dbg !39 + %112 = select i1 %12, float %101, float 0.000000e+00, !dbg !26 + %113 = select i1 %12, float %108, float 0.000000e+00, !dbg !26 + %114 = fptrunc <2 x float> %97 to <2 x bfloat>, !dbg !27 + %115 = fpext <2 x bfloat> %72 to <2 x float>, !dbg !24 + %116 = fpext <2 x bfloat> %79 to <2 x float>, !dbg !21 + %117 = fpext <2 x bfloat> %86 to <2 x float>, !dbg !22 + %118 = fmul <2 x float> %116, %117, !dbg !23 + %119 = fadd <2 x float> %118, %115, !dbg !25 + %120 = extractelement <2 x float> %119, i64 0, !dbg !30 + %121 = fsub float %120, %58, !dbg !35 + %122 = tail call float @llvm.nvvm.div.full(float %121, float %87), !dbg !36 + %123 = fadd float %58, %122, !dbg !37 + %124 = fsub float %120, %123, !dbg !30 + %125 = fmul float %121, %124, !dbg !38 + %126 = fadd float %125, 0.000000e+00, !dbg !39 + %127 = extractelement <2 x float> %119, i64 1, !dbg !30 + %128 = fsub float %127, %60, !dbg !35 + %129 = tail call float @llvm.nvvm.div.full(float %128, float %87), !dbg !36 + %130 = fadd float %60, %129, !dbg !37 + %131 = fsub float %127, %130, !dbg !30 + %132 = fmul float %128, %131, !dbg !38 + %133 = fadd float %132, 0.000000e+00, !dbg !39 + %134 = select i1 %12, float %123, float 0.000000e+00, !dbg !26 + %135 = select i1 %12, float %130, float 0.000000e+00, !dbg !26 + %136 = select i1 %12, float %126, float 0.000000e+00, !dbg !40 + %137 = select i1 %12, float %133, float 0.000000e+00, !dbg !40 + %138 = fptrunc <2 x float> %119 to <2 x bfloat>, !dbg !27 + %139 = bitcast <2 x bfloat> %114 to i32, !dbg !27 + %140 = bitcast <2 x bfloat> %138 to i32, !dbg !27 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %139, i32 %140, ptr addrspace(1) %92, i1 %12) #6, !dbg !27 + %141 = and i32 %13, 511, !dbg !10 + %142 = and i32 %13, 31, !dbg !10 + %143 = lshr i32 %141, 5, !dbg !10 + %144 = fsub float %113, %112, !dbg !41 + %145 = select i1 %12, float 4.000000e+00, float 0.000000e+00, !dbg !44 + %146 = fcmp oeq float %145, 0.000000e+00, !dbg !45 + %147 = tail call float @llvm.nvvm.div.full(float %89, float %145), !dbg !46 + %148 = select i1 %146, float 0.000000e+00, float %147, !dbg !47 + %149 = fmul float %144, %148, !dbg !48 + %150 = fadd float %112, %149, !dbg !49 + %151 = fadd float %104, %111, !dbg !50 + %152 = select i1 %12, float %151, float 0.000000e+00, !dbg !50 + %153 = fmul float %144, %144, !dbg !51 + %154 = fmul float %153, %88, !dbg !52 + %155 = fmul float %154, %148, !dbg !53 + %156 = fadd float %152, %155, !dbg !54 + %157 = fsub float %134, %150, !dbg !41 + %158 = select i1 %12, float 6.000000e+00, float 0.000000e+00, !dbg !44 + %159 = fcmp oeq float %158, 0.000000e+00, !dbg !45 + %160 = tail call float @llvm.nvvm.div.full(float %90, float %158), !dbg !46 + %161 = select i1 %159, float 0.000000e+00, float %160, !dbg !47 + %162 = fmul float %161, %157, !dbg !48 + %163 = fadd float %150, %162, !dbg !49 + %164 = fadd float %136, %156, !dbg !50 + %165 = fmul float %157, %157, !dbg !51 + %166 = fmul float %145, %165, !dbg !52 + %167 = fmul float %161, %166, !dbg !53 + %168 = fadd float %164, %167, !dbg !54 + %169 = fsub float %135, %163, !dbg !41 + %170 = select i1 %12, float 8.000000e+00, float 0.000000e+00, !dbg !44 + %171 = fcmp oeq float %170, 0.000000e+00, !dbg !45 + %172 = tail call float @llvm.nvvm.div.full(float %91, float %170), !dbg !46 + %173 = select i1 %171, float 0.000000e+00, float %172, !dbg !47 + %174 = fmul float %173, %169, !dbg !48 + %175 = fadd float %163, %174, !dbg !49 + %176 = fadd float %137, %168, !dbg !50 + %177 = fmul float %169, %169, !dbg !51 + %178 = fmul float %158, %177, !dbg !52 + %179 = fmul float %173, %178, !dbg !53 + %180 = fadd float %176, %179, !dbg !54 + %181 = bitcast float %175 to i32, !dbg !42 + %182 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %181, i32 16, i32 31), !dbg !42 + %183 = bitcast i32 %182 to float, !dbg !42 + %184 = bitcast float %180 to i32, !dbg !42 + %185 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %184, i32 16, i32 31), !dbg !42 + %186 = bitcast i32 %185 to float, !dbg !42 + %187 = bitcast float %170 to i32, !dbg !42 + %188 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %187, i32 16, i32 31), !dbg !42 + %189 = bitcast i32 %188 to float, !dbg !42 + %190 = fsub float %183, %175, !dbg !41 + %191 = fadd float %170, %189, !dbg !44 + %192 = fcmp oeq float %191, 0.000000e+00, !dbg !45 + %193 = tail call float @llvm.nvvm.div.full(float %189, float %191), !dbg !46 + %194 = select i1 %192, float 0.000000e+00, float %193, !dbg !47 + %195 = fmul float %194, %190, !dbg !48 + %196 = fadd float %175, %195, !dbg !49 + %197 = fadd float %180, %186, !dbg !50 + %198 = fmul float %190, %190, !dbg !51 + %199 = fmul float %170, %198, !dbg !52 + %200 = fmul float %194, %199, !dbg !53 + %201 = fadd float %197, %200, !dbg !54 + %202 = bitcast float %196 to i32, !dbg !42 + %203 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %202, i32 8, i32 31), !dbg !42 + %204 = bitcast i32 %203 to float, !dbg !42 + %205 = bitcast float %201 to i32, !dbg !42 + %206 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %205, i32 8, i32 31), !dbg !42 + %207 = bitcast i32 %206 to float, !dbg !42 + %208 = bitcast float %191 to i32, !dbg !42 + %209 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %208, i32 8, i32 31), !dbg !42 + %210 = bitcast i32 %209 to float, !dbg !42 + %211 = fsub float %204, %196, !dbg !41 + %212 = fadd float %191, %210, !dbg !44 + %213 = fcmp oeq float %212, 0.000000e+00, !dbg !45 + %214 = tail call float @llvm.nvvm.div.full(float %210, float %212), !dbg !46 + %215 = select i1 %213, float 0.000000e+00, float %214, !dbg !47 + %216 = fmul float %211, %215, !dbg !48 + %217 = fadd float %196, %216, !dbg !49 + %218 = fadd float %201, %207, !dbg !50 + %219 = fmul float %211, %211, !dbg !51 + %220 = fmul float %191, %219, !dbg !52 + %221 = fmul float %215, %220, !dbg !53 + %222 = fadd float %218, %221, !dbg !54 + %223 = bitcast float %217 to i32, !dbg !42 + %224 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %223, i32 4, i32 31), !dbg !42 + %225 = bitcast i32 %224 to float, !dbg !42 + %226 = bitcast float %222 to i32, !dbg !42 + %227 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %226, i32 4, i32 31), !dbg !42 + %228 = bitcast i32 %227 to float, !dbg !42 + %229 = bitcast float %212 to i32, !dbg !42 + %230 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %229, i32 4, i32 31), !dbg !42 + %231 = bitcast i32 %230 to float, !dbg !42 + %232 = fsub float %225, %217, !dbg !41 + %233 = fadd float %212, %231, !dbg !44 + %234 = fcmp oeq float %233, 0.000000e+00, !dbg !45 + %235 = tail call float @llvm.nvvm.div.full(float %231, float %233), !dbg !46 + %236 = select i1 %234, float 0.000000e+00, float %235, !dbg !47 + %237 = fmul float %232, %236, !dbg !48 + %238 = fadd float %217, %237, !dbg !49 + %239 = fadd float %222, %228, !dbg !50 + %240 = fmul float %232, %232, !dbg !51 + %241 = fmul float %212, %240, !dbg !52 + %242 = fmul float %236, %241, !dbg !53 + %243 = fadd float %239, %242, !dbg !54 + %244 = bitcast float %238 to i32, !dbg !42 + %245 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %244, i32 2, i32 31), !dbg !42 + %246 = bitcast i32 %245 to float, !dbg !42 + %247 = bitcast float %243 to i32, !dbg !42 + %248 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %247, i32 2, i32 31), !dbg !42 + %249 = bitcast i32 %248 to float, !dbg !42 + %250 = bitcast float %233 to i32, !dbg !42 + %251 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %250, i32 2, i32 31), !dbg !42 + %252 = bitcast i32 %251 to float, !dbg !42 + %253 = fsub float %246, %238, !dbg !41 + %254 = fadd float %233, %252, !dbg !44 + %255 = fcmp oeq float %254, 0.000000e+00, !dbg !45 + %256 = tail call float @llvm.nvvm.div.full(float %252, float %254), !dbg !46 + %257 = select i1 %255, float 0.000000e+00, float %256, !dbg !47 + %258 = fmul float %253, %257, !dbg !48 + %259 = fadd float %238, %258, !dbg !49 + %260 = fadd float %243, %249, !dbg !50 + %261 = fmul float %253, %253, !dbg !51 + %262 = fmul float %233, %261, !dbg !52 + %263 = fmul float %257, %262, !dbg !53 + %264 = fadd float %260, %263, !dbg !54 + %265 = bitcast float %259 to i32, !dbg !42 + %266 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %265, i32 1, i32 31), !dbg !42 + %267 = bitcast i32 %266 to float, !dbg !42 + %268 = bitcast float %264 to i32, !dbg !42 + %269 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %268, i32 1, i32 31), !dbg !42 + %270 = bitcast i32 %269 to float, !dbg !42 + %271 = bitcast float %254 to i32, !dbg !42 + %272 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %271, i32 1, i32 31), !dbg !42 + %273 = bitcast i32 %272 to float, !dbg !42 + %274 = fsub float %267, %259, !dbg !41 + %275 = fadd float %254, %273, !dbg !44 + %276 = fcmp oeq float %275, 0.000000e+00, !dbg !45 + %277 = tail call float @llvm.nvvm.div.full(float %273, float %275), !dbg !46 + %278 = select i1 %276, float 0.000000e+00, float %277, !dbg !47 + %279 = fmul float %274, %278, !dbg !48 + %280 = fadd float %259, %279, !dbg !49 + %281 = fadd float %264, %270, !dbg !50 + %282 = fmul float %274, %274, !dbg !51 + %283 = fmul float %254, %282, !dbg !52 + %284 = fmul float %278, %283, !dbg !53 + %285 = fadd float %281, %284, !dbg !54 + %286 = icmp eq i32 %142, 0, !dbg !42 + %287 = getelementptr float, ptr addrspace(3) @global_smem, i32 %143, !dbg !42 + %288 = bitcast float %280 to <1 x i32>, !dbg !42 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %287, <1 x i32> %288, i1 %286) #6, !dbg !42 + %289 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %143, !dbg !42 + %290 = bitcast float %285 to <1 x i32>, !dbg !42 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %289, <1 x i32> %290, i1 %286) #6, !dbg !42 + %291 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %143, !dbg !42 + %292 = bitcast float %275 to <1 x i32>, !dbg !42 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %291, <1 x i32> %292, i1 %286) #6, !dbg !42 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !42 + %293 = icmp samesign ult i32 %141, 16, !dbg !42 + %294 = getelementptr float, ptr addrspace(3) @global_smem, i32 %141, !dbg !42 + %295 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %294, i1 %293) #6, !dbg !42 + %296 = bitcast i32 %295 to float, !dbg !42 + %297 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %141, !dbg !42 + %298 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %297, i1 %293) #6, !dbg !42 + %299 = bitcast i32 %298 to float, !dbg !42 + %300 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %141, !dbg !42 + %301 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %300, i1 %293) #6, !dbg !42 + %302 = bitcast i32 %301 to float, !dbg !42 + %303 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %295, i32 8, i32 31), !dbg !42 + %304 = bitcast i32 %303 to float, !dbg !42 + %305 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %298, i32 8, i32 31), !dbg !42 + %306 = bitcast i32 %305 to float, !dbg !42 + %307 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %301, i32 8, i32 31), !dbg !42 + %308 = bitcast i32 %307 to float, !dbg !42 + %309 = fsub float %304, %296, !dbg !41 + %310 = fadd float %302, %308, !dbg !44 + %311 = fcmp oeq float %310, 0.000000e+00, !dbg !45 + %312 = tail call float @llvm.nvvm.div.full(float %308, float %310), !dbg !46 + %313 = select i1 %311, float 0.000000e+00, float %312, !dbg !47 + %314 = fmul float %309, %313, !dbg !48 + %315 = fadd float %314, %296, !dbg !49 + %316 = fadd float %299, %306, !dbg !50 + %317 = fmul float %309, %309, !dbg !51 + %318 = fmul float %317, %302, !dbg !52 + %319 = fmul float %318, %313, !dbg !53 + %320 = fadd float %316, %319, !dbg !54 + %321 = bitcast float %315 to i32, !dbg !42 + %322 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %321, i32 4, i32 31), !dbg !42 + %323 = bitcast i32 %322 to float, !dbg !42 + %324 = bitcast float %320 to i32, !dbg !42 + %325 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %324, i32 4, i32 31), !dbg !42 + %326 = bitcast i32 %325 to float, !dbg !42 + %327 = bitcast float %310 to i32, !dbg !42 + %328 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %327, i32 4, i32 31), !dbg !42 + %329 = bitcast i32 %328 to float, !dbg !42 + %330 = fsub float %323, %315, !dbg !41 + %331 = fadd float %310, %329, !dbg !44 + %332 = fcmp oeq float %331, 0.000000e+00, !dbg !45 + %333 = tail call float @llvm.nvvm.div.full(float %329, float %331), !dbg !46 + %334 = select i1 %332, float 0.000000e+00, float %333, !dbg !47 + %335 = fmul float %330, %334, !dbg !48 + %336 = fadd float %315, %335, !dbg !49 + %337 = fadd float %320, %326, !dbg !50 + %338 = fmul float %330, %330, !dbg !51 + %339 = fmul float %310, %338, !dbg !52 + %340 = fmul float %334, %339, !dbg !53 + %341 = fadd float %337, %340, !dbg !54 + %342 = bitcast float %336 to i32, !dbg !42 + %343 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %342, i32 2, i32 31), !dbg !42 + %344 = bitcast i32 %343 to float, !dbg !42 + %345 = bitcast float %341 to i32, !dbg !42 + %346 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %345, i32 2, i32 31), !dbg !42 + %347 = bitcast i32 %346 to float, !dbg !42 + %348 = bitcast float %331 to i32, !dbg !42 + %349 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %348, i32 2, i32 31), !dbg !42 + %350 = bitcast i32 %349 to float, !dbg !42 + %351 = fsub float %344, %336, !dbg !41 + %352 = fadd float %331, %350, !dbg !44 + %353 = fcmp oeq float %352, 0.000000e+00, !dbg !45 + %354 = tail call float @llvm.nvvm.div.full(float %350, float %352), !dbg !46 + %355 = select i1 %353, float 0.000000e+00, float %354, !dbg !47 + %356 = fmul float %351, %355, !dbg !48 + %357 = fadd float %336, %356, !dbg !49 + %358 = fadd float %341, %347, !dbg !50 + %359 = fmul float %351, %351, !dbg !51 + %360 = fmul float %331, %359, !dbg !52 + %361 = fmul float %355, %360, !dbg !53 + %362 = fadd float %358, %361, !dbg !54 + %363 = bitcast float %357 to i32, !dbg !42 + %364 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %363, i32 1, i32 31), !dbg !42 + %365 = bitcast i32 %364 to float, !dbg !42 + %366 = bitcast float %362 to i32, !dbg !42 + %367 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %366, i32 1, i32 31), !dbg !42 + %368 = bitcast i32 %367 to float, !dbg !42 + %369 = bitcast float %352 to i32, !dbg !42 + %370 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %369, i32 1, i32 31), !dbg !42 + %371 = bitcast i32 %370 to float, !dbg !42 + %372 = fsub float %365, %357, !dbg !41 + %373 = fadd float %352, %371, !dbg !44 + %374 = fcmp oeq float %373, 0.000000e+00, !dbg !45 + %375 = tail call float @llvm.nvvm.div.full(float %371, float %373), !dbg !46 + %376 = select i1 %374, float 0.000000e+00, float %375, !dbg !47 + %377 = fmul float %372, %376, !dbg !48 + %378 = fadd float %357, %377, !dbg !49 + %379 = fadd float %362, %368, !dbg !50 + %380 = fmul float %372, %372, !dbg !51 + %381 = fmul float %352, %380, !dbg !52 + %382 = fmul float %376, %381, !dbg !53 + %383 = fadd float %379, %382, !dbg !54 + %384 = and i32 %13, 15, !dbg !42 + %385 = icmp eq i32 %384, 0, !dbg !42 + %386 = and i1 %293, %385, !dbg !42 + %387 = bitcast float %378 to <1 x i32>, !dbg !42 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %294, <1 x i32> %387, i1 %386) #6, !dbg !42 + %388 = bitcast float %383 to <1 x i32>, !dbg !42 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %297, <1 x i32> %388, i1 %386) #6, !dbg !42 + %389 = bitcast float %373 to <1 x i32>, !dbg !42 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %300, <1 x i32> %389, i1 %386) #6, !dbg !42 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !42 + %390 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !42 + %391 = load float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !42 + %392 = tail call float @llvm.nvvm.div.full(float %391, float 4.096000e+03), !dbg !55 + %393 = fadd float %392, 0x3EB0C6F7A0000000, !dbg !56 + %394 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !57 + %395 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !57 + %396 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !57 + %397 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !57 + %.not.i15 = icmp eq i32 %397, 0, !dbg !57 + br i1 %.not.i15, label %400, label %398, !dbg !57 + +398: ; preds = %__nv_rsqrtf.exit + %399 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %393), !dbg !57 + br label %__nv_rsqrtf.exit17, !dbg !57 + +400: ; preds = %__nv_rsqrtf.exit + %401 = tail call float @llvm.nvvm.rsqrt.approx.f(float %393), !dbg !57 + br label %__nv_rsqrtf.exit17, !dbg !57 + +__nv_rsqrtf.exit17: ; preds = %398, %400 + %.0.i16 = phi float [ %399, %398 ], [ %401, %400 ], !dbg !57 + %402 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !58 + %403 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %41, i64 %402, i1 %12) #6, !dbg !58 + %404 = extractvalue { i32, i32 } %403, 0, !dbg !58 + %405 = bitcast i32 %404 to <2 x bfloat>, !dbg !58 + %406 = extractvalue { i32, i32 } %403, 1, !dbg !58 + %407 = bitcast i32 %406 to <2 x bfloat>, !dbg !58 + %408 = getelementptr bfloat, ptr addrspace(1) %3, i64 %17, !dbg !59 + %409 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !60 + %410 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %408, i64 %409, i1 true) #6, !dbg !60 + %411 = extractvalue { i32, i32 } %410, 0, !dbg !60 + %412 = bitcast i32 %411 to <2 x bfloat>, !dbg !60 + %413 = extractvalue { i32, i32 } %410, 1, !dbg !60 + %414 = bitcast i32 %413 to <2 x bfloat>, !dbg !60 + %415 = getelementptr bfloat, ptr addrspace(1) %4, i64 %17, !dbg !61 + %416 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !62 + %417 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %415, i64 %416, i1 true) #6, !dbg !62 + %418 = extractvalue { i32, i32 } %417, 0, !dbg !62 + %419 = bitcast i32 %418 to <2 x bfloat>, !dbg !62 + %420 = extractvalue { i32, i32 } %417, 1, !dbg !62 + %421 = bitcast i32 %420 to <2 x bfloat>, !dbg !62 + %422 = getelementptr bfloat, ptr addrspace(1) %6, i64 %19, !dbg !63 + %423 = fpext <2 x bfloat> %405 to <2 x float>, !dbg !64 + %424 = fpext <2 x bfloat> %412 to <2 x float>, !dbg !65 + %425 = fpext <2 x bfloat> %419 to <2 x float>, !dbg !66 + %426 = insertelement <2 x float> poison, float %390, i64 0, !dbg !67 + %427 = shufflevector <2 x float> %426, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !67 + %428 = fsub <2 x float> %423, %427, !dbg !67 + %429 = insertelement <2 x float> poison, float %.0.i16, i64 0, !dbg !68 + %430 = shufflevector <2 x float> %429, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !68 + %431 = fmul <2 x float> %430, %428, !dbg !68 + %432 = fadd <2 x float> %424, splat (float 1.000000e+00), !dbg !69 + %433 = fmul <2 x float> %431, %432, !dbg !70 + %434 = fadd <2 x float> %433, %425, !dbg !71 + %435 = fptrunc <2 x float> %434 to <2 x bfloat>, !dbg !72 + %436 = fpext <2 x bfloat> %407 to <2 x float>, !dbg !64 + %437 = fpext <2 x bfloat> %414 to <2 x float>, !dbg !65 + %438 = fpext <2 x bfloat> %421 to <2 x float>, !dbg !66 + %439 = fsub <2 x float> %436, %427, !dbg !67 + %440 = fmul <2 x float> %430, %439, !dbg !68 + %441 = fadd <2 x float> %437, splat (float 1.000000e+00), !dbg !69 + %442 = fmul <2 x float> %440, %441, !dbg !70 + %443 = fadd <2 x float> %442, %438, !dbg !71 + %444 = fptrunc <2 x float> %443 to <2 x bfloat>, !dbg !72 + %445 = bitcast <2 x bfloat> %435 to i32, !dbg !72 + %446 = bitcast <2 x bfloat> %444 to i32, !dbg !72 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %445, i32 %446, ptr addrspace(1) %422, i1 %12) #6, !dbg !72 + %447 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !58 + %448 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %92, i64 %447, i1 %12) #6, !dbg !58 + %449 = extractvalue { i32, i32 } %448, 0, !dbg !58 + %450 = bitcast i32 %449 to <2 x bfloat>, !dbg !58 + %451 = extractvalue { i32, i32 } %448, 1, !dbg !58 + %452 = bitcast i32 %451 to <2 x bfloat>, !dbg !58 + %453 = getelementptr bfloat, ptr addrspace(1) %3, i64 %64, !dbg !59 + %454 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !60 + %455 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %453, i64 %454, i1 true) #6, !dbg !60 + %456 = extractvalue { i32, i32 } %455, 0, !dbg !60 + %457 = bitcast i32 %456 to <2 x bfloat>, !dbg !60 + %458 = extractvalue { i32, i32 } %455, 1, !dbg !60 + %459 = bitcast i32 %458 to <2 x bfloat>, !dbg !60 + %460 = getelementptr bfloat, ptr addrspace(1) %4, i64 %64, !dbg !61 + %461 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !62 + %462 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %460, i64 %461, i1 true) #6, !dbg !62 + %463 = extractvalue { i32, i32 } %462, 0, !dbg !62 + %464 = bitcast i32 %463 to <2 x bfloat>, !dbg !62 + %465 = extractvalue { i32, i32 } %462, 1, !dbg !62 + %466 = bitcast i32 %465 to <2 x bfloat>, !dbg !62 + %467 = getelementptr bfloat, ptr addrspace(1) %6, i64 %65, !dbg !63 + %468 = fpext <2 x bfloat> %450 to <2 x float>, !dbg !64 + %469 = fpext <2 x bfloat> %457 to <2 x float>, !dbg !65 + %470 = fpext <2 x bfloat> %464 to <2 x float>, !dbg !66 + %471 = fsub <2 x float> %468, %427, !dbg !67 + %472 = fmul <2 x float> %430, %471, !dbg !68 + %473 = fadd <2 x float> %469, splat (float 1.000000e+00), !dbg !69 + %474 = fmul <2 x float> %472, %473, !dbg !70 + %475 = fadd <2 x float> %474, %470, !dbg !71 + %476 = fptrunc <2 x float> %475 to <2 x bfloat>, !dbg !72 + %477 = fpext <2 x bfloat> %452 to <2 x float>, !dbg !64 + %478 = fpext <2 x bfloat> %459 to <2 x float>, !dbg !65 + %479 = fpext <2 x bfloat> %466 to <2 x float>, !dbg !66 + %480 = fsub <2 x float> %477, %427, !dbg !67 + %481 = fmul <2 x float> %430, %480, !dbg !68 + %482 = fadd <2 x float> %478, splat (float 1.000000e+00), !dbg !69 + %483 = fmul <2 x float> %481, %482, !dbg !70 + %484 = fadd <2 x float> %483, %479, !dbg !71 + %485 = fptrunc <2 x float> %484 to <2 x bfloat>, !dbg !72 + %486 = bitcast <2 x bfloat> %476 to i32, !dbg !72 + %487 = bitcast <2 x bfloat> %485 to i32, !dbg !72 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %486, i32 %487, ptr addrspace(1) %467, i1 %12) #6, !dbg !72 + ret void, !dbg !73 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused_add_mul_native_layer_norm_0", linkageName: "triton_red_fused_add_mul_native_layer_norm_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 25, column: 21, scope: !5) +!10 = !DILocation(line: 26, column: 37, scope: !5) +!11 = !DILocation(line: 38, column: 46, scope: !5) +!12 = !DILocation(line: 32, column: 43, scope: !5) +!13 = !DILocation(line: 38, column: 41, scope: !5) +!14 = !DILocation(line: 38, column: 34, scope: !5) +!15 = !DILocation(line: 38, column: 51, scope: !5) +!16 = !DILocation(line: 39, column: 34, scope: !5) +!17 = !DILocation(line: 39, column: 41, scope: !5) +!18 = !DILocation(line: 40, column: 34, scope: !5) +!19 = !DILocation(line: 40, column: 51, scope: !5) +!20 = !DILocation(line: 51, column: 29, scope: !5) +!21 = !DILocation(line: 39, column: 94, scope: !5) +!22 = !DILocation(line: 40, column: 113, scope: !5) +!23 = !DILocation(line: 41, column: 22, scope: !5) +!24 = !DILocation(line: 38, column: 113, scope: !5) +!25 = !DILocation(line: 42, column: 22, scope: !5) +!26 = !DILocation(line: 48, column: 62, scope: !5) +!27 = !DILocation(line: 51, column: 52, scope: !5) +!28 = !DILocation(line: 33, column: 31, scope: !5) +!29 = !DILocation(line: 50, column: 66, scope: !5) +!30 = !DILocation(line: 225, column: 39, scope: !31, inlinedAt: !33) +!31 = distinct !DILexicalBlockFile(scope: !5, file: !32, discriminator: 0) +!32 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime") +!33 = !DILocation(line: 46, column: 51, scope: !34) +!34 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!35 = !DILocation(line: 222, column: 24, scope: !31, inlinedAt: !33) +!36 = !DILocation(line: 224, column: 34, scope: !31, inlinedAt: !33) +!37 = !DILocation(line: 224, column: 26, scope: !31, inlinedAt: !33) +!38 = !DILocation(line: 225, column: 31, scope: !31, inlinedAt: !33) +!39 = !DILocation(line: 225, column: 22, scope: !31, inlinedAt: !33) +!40 = !DILocation(line: 49, column: 58, scope: !5) +!41 = !DILocation(line: 231, column: 21, scope: !31, inlinedAt: !42) +!42 = !DILocation(line: 243, column: 46, scope: !31, inlinedAt: !43) +!43 = !DILocation(line: 52, column: 80, scope: !34) +!44 = !DILocation(line: 232, column: 28, scope: !31, inlinedAt: !42) +!45 = !DILocation(line: 233, column: 39, scope: !31, inlinedAt: !42) +!46 = !DILocation(line: 233, column: 60, scope: !31, inlinedAt: !42) +!47 = !DILocation(line: 233, column: 49, scope: !31, inlinedAt: !42) +!48 = !DILocation(line: 235, column: 25, scope: !31, inlinedAt: !42) +!49 = !DILocation(line: 235, column: 17, scope: !31, inlinedAt: !42) +!50 = !DILocation(line: 236, column: 15, scope: !31, inlinedAt: !42) +!51 = !DILocation(line: 236, column: 30, scope: !31, inlinedAt: !42) +!52 = !DILocation(line: 236, column: 38, scope: !31, inlinedAt: !42) +!53 = !DILocation(line: 236, column: 49, scope: !31, inlinedAt: !42) +!54 = !DILocation(line: 236, column: 22, scope: !31, inlinedAt: !42) +!55 = !DILocation(line: 68, column: 25, scope: !5) +!56 = !DILocation(line: 70, column: 24, scope: !5) +!57 = !DILocation(line: 71, column: 32, scope: !5) +!58 = !DILocation(line: 62, column: 53, scope: !5) +!59 = !DILocation(line: 63, column: 35, scope: !5) +!60 = !DILocation(line: 63, column: 42, scope: !5) +!61 = !DILocation(line: 64, column: 35, scope: !5) +!62 = !DILocation(line: 64, column: 42, scope: !5) +!63 = !DILocation(line: 78, column: 29, scope: !5) +!64 = !DILocation(line: 62, column: 115, scope: !5) +!65 = !DILocation(line: 63, column: 95, scope: !5) +!66 = !DILocation(line: 64, column: 95, scope: !5) +!67 = !DILocation(line: 66, column: 24, scope: !5) +!68 = !DILocation(line: 72, column: 24, scope: !5) +!69 = !DILocation(line: 75, column: 24, scope: !5) +!70 = !DILocation(line: 76, column: 24, scope: !5) +!71 = !DILocation(line: 77, column: 24, scope: !5) +!72 = !DILocation(line: 78, column: 53, scope: !5) +!73 = !DILocation(line: 56, column: 4, scope: !5) diff --git a/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.ptx b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..cde18d5d05d4a49655a043c6c8748dac11e2c17a --- /dev/null +++ b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.ptx @@ -0,0 +1,1191 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused_add_mul_native_layer_norm_0 // -- Begin function triton_red_fused_add_mul_native_layer_norm_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused_add_mul_native_layer_norm_0 +.visible .entry triton_red_fused_add_mul_native_layer_norm_0( + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_6, + .param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_7, + .param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_8, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_9, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_10 +) +.reqntid 512 +{ + .reg .pred %p<19>; + .reg .b16 %rs<49>; + .reg .b32 %r<317>; + .reg .b64 %rd<39>; + .loc 1 18 0 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:18:0 + +// %bb.0: // %__nv_rsqrtf.exit + ld.param.b64 %rd27, [triton_red_fused_add_mul_native_layer_norm_0_param_0]; + ld.param.b64 %rd28, [triton_red_fused_add_mul_native_layer_norm_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:23:28 + mov.u32 %r49, %ctaid.x; + .loc 1 25 21 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:25:21 + setp.lt.u32 %p1, %r49, 2048; + ld.param.b64 %rd29, [triton_red_fused_add_mul_native_layer_norm_0_param_2]; + ld.param.b64 %rd30, [triton_red_fused_add_mul_native_layer_norm_0_param_3]; + .loc 1 26 37 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:26:37 + mov.u32 %r50, %tid.x; + shl.b32 %r51, %r50, 2; + ld.param.b64 %rd31, [triton_red_fused_add_mul_native_layer_norm_0_param_4]; + and.b32 %r52, %r51, 2044; + ld.param.b64 %rd32, [triton_red_fused_add_mul_native_layer_norm_0_param_5]; + .loc 1 38 46 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:46 + shl.b32 %r53, %r49, 12; + ld.param.b64 %rd33, [triton_red_fused_add_mul_native_layer_norm_0_param_6]; + .loc 1 32 43 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:32:43 + cvt.u64.u32 %rd34, %r52; + cvt.s64.s32 %rd35, %r53; + .loc 1 38 41 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:41 + or.b64 %rd36, %rd34, %rd35; + .loc 1 38 34 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:34 + shl.b64 %rd37, %rd36, 1; + add.s64 %rd1, %rd27, %rd37; + .loc 1 38 51 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:51 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r3, 0; + // begin inline asm + mov.u32 %r1, %r3; + mov.u32 %r2, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2; + // end inline asm + .loc 1 39 34 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:34 + mul.wide.u32 %rd38, %r52, 2; + add.s64 %rd3, %rd28, %rd38; + .loc 1 39 41 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:41 + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0; + // end inline asm + mov.pred %p2, -1; + // begin inline asm + mov.u32 %r4, %r3; + mov.u32 %r5, %r3; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r4, %r5 }, [ %rd3 + 0 ], %rd4; + // end inline asm + .loc 1 40 34 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:34 + add.s64 %rd5, %rd29, %rd37; + .loc 1 40 51 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:51 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r6, %r3; + mov.u32 %r7, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r6, %r7 }, [ %rd5 + 0 ], %rd6; + // end inline asm + .loc 1 51 29 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:51:29 + add.s64 %rd7, %rd32, %rd37; + .loc 1 39 94 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:94 + mov.b32 {%rs1, %rs2}, %r4; + cvt.f32.bf16 %r54, %rs1; + cvt.f32.bf16 %r55, %rs2; + .loc 1 40 113 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:113 + mov.b32 {%rs3, %rs4}, %r6; + cvt.f32.bf16 %r56, %rs3; + cvt.f32.bf16 %r57, %rs4; + .loc 1 38 113 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:113 + mov.b32 {%rs5, %rs6}, %r1; + cvt.f32.bf16 %r58, %rs5; + cvt.f32.bf16 %r59, %rs6; + .loc 1 42 22 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:42:22 + fma.rn.f32 %r60, %r55, %r57, %r59; + fma.rn.f32 %r61, %r54, %r56, %r58; + .loc 1 48 62 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:48:62 + selp.f32 %r62, %r61, 0f00000000, %p1; + selp.f32 %r63, %r60, 0f00000000, %p1; + .loc 1 51 52 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:51:52 + cvt.rn.bf16x2.f32 %r8, %r60, %r61; + .loc 1 39 94 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:94 + mov.b32 {%rs7, %rs8}, %r5; + cvt.f32.bf16 %r64, %rs7; + cvt.f32.bf16 %r65, %rs8; + .loc 1 40 113 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:113 + mov.b32 {%rs9, %rs10}, %r7; + cvt.f32.bf16 %r66, %rs9; + cvt.f32.bf16 %r67, %rs10; + .loc 1 38 113 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:113 + mov.b32 {%rs11, %rs12}, %r2; + cvt.f32.bf16 %r68, %rs11; + cvt.f32.bf16 %r69, %rs12; + .loc 1 42 22 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:42:22 + fma.rn.f32 %r70, %r65, %r67, %r69; + fma.rn.f32 %r71, %r64, %r66, %r68; + .loc 1 48 62 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:48:62 + selp.f32 %r72, %r71, 0f00000000, %p1; + selp.f32 %r73, %r70, 0f00000000, %p1; + .loc 1 51 52 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:51:52 + cvt.rn.bf16x2.f32 %r9, %r70, %r71; + // begin inline asm + @%p1 st.global.v2.b32 [ %rd7 + 0 ], { %r8, %r9 }; + // end inline asm + .loc 1 38 34 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:34 + add.s64 %rd8, %rd1, 4096; + .loc 1 38 51 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:51 + // begin inline asm + mov.u64 %rd9, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd9, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r10, %r3; + mov.u32 %r11, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r10, %r11 }, [ %rd8 + 0 ], %rd9; + // end inline asm + .loc 1 39 34 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:34 + add.s64 %rd10, %rd3, 4096; + .loc 1 39 41 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:41 + // begin inline asm + mov.u64 %rd11, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd11, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r12, %r3; + mov.u32 %r13, %r3; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r12, %r13 }, [ %rd10 + 0 ], %rd11; + // end inline asm + .loc 1 40 34 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:34 + add.s64 %rd12, %rd5, 4096; + .loc 1 40 51 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:51 + // begin inline asm + mov.u64 %rd13, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd13, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r14, %r3; + mov.u32 %r15, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r14, %r15 }, [ %rd12 + 0 ], %rd13; + // end inline asm + .loc 1 50 66 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:50:66 + selp.f32 %r74, 0f40000000, 0f3F800000, %p1; + selp.f32 %r75, 0f40000000, 0f00000000, %p1; + .loc 1 51 29 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:51:29 + add.s64 %rd14, %rd7, 4096; + .loc 1 38 113 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:113 + mov.b32 {%rs13, %rs14}, %r10; + cvt.f32.bf16 %r76, %rs13; + cvt.f32.bf16 %r77, %rs14; + .loc 1 39 94 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:94 + mov.b32 {%rs15, %rs16}, %r12; + cvt.f32.bf16 %r78, %rs15; + cvt.f32.bf16 %r79, %rs16; + .loc 1 40 113 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:113 + mov.b32 {%rs17, %rs18}, %r14; + cvt.f32.bf16 %r80, %rs17; + cvt.f32.bf16 %r81, %rs18; + .loc 1 42 22 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:42:22 + fma.rn.f32 %r82, %r79, %r81, %r77; + fma.rn.f32 %r83, %r78, %r80, %r76; +$L__tmp1: + .loc 2 222 24 // triton_helpers.py:222:24 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ] + sub.f32 %r84, %r83, %r62; + .loc 2 224 34 // triton_helpers.py:224:34 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ] + div.full.f32 %r85, %r84, %r74; + .loc 2 224 26 // triton_helpers.py:224:26 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ] + add.f32 %r86, %r62, %r85; + .loc 2 225 39 // triton_helpers.py:225:39 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ] + sub.f32 %r87, %r83, %r86; + .loc 2 225 22 // triton_helpers.py:225:22 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ] + fma.rn.f32 %r88, %r84, %r87, 0f00000000; + .loc 2 222 24 // triton_helpers.py:222:24 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ] + sub.f32 %r89, %r82, %r63; + .loc 2 224 34 // triton_helpers.py:224:34 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ] + div.full.f32 %r90, %r89, %r74; + .loc 2 224 26 // triton_helpers.py:224:26 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ] + add.f32 %r91, %r63, %r90; + .loc 2 225 39 // triton_helpers.py:225:39 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ] + sub.f32 %r92, %r82, %r91; + .loc 2 225 22 // triton_helpers.py:225:22 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ] + fma.rn.f32 %r93, %r89, %r92, 0f00000000; +$L__tmp2: + .loc 1 48 62 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:48:62 + selp.f32 %r94, %r86, 0f00000000, %p1; + selp.f32 %r95, %r91, 0f00000000, %p1; + .loc 1 51 52 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:51:52 + cvt.rn.bf16x2.f32 %r16, %r82, %r83; + .loc 1 38 113 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:113 + mov.b32 {%rs19, %rs20}, %r11; + cvt.f32.bf16 %r96, %rs19; + cvt.f32.bf16 %r97, %rs20; + .loc 1 39 94 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:94 + mov.b32 {%rs21, %rs22}, %r13; + cvt.f32.bf16 %r98, %rs21; + cvt.f32.bf16 %r99, %rs22; + .loc 1 40 113 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:113 + mov.b32 {%rs23, %rs24}, %r15; + cvt.f32.bf16 %r100, %rs23; + cvt.f32.bf16 %r101, %rs24; + .loc 1 42 22 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:42:22 + fma.rn.f32 %r102, %r99, %r101, %r97; + fma.rn.f32 %r103, %r98, %r100, %r96; +$L__tmp3: + .loc 2 222 24 // triton_helpers.py:222:24 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ] + sub.f32 %r104, %r103, %r72; + .loc 2 224 34 // triton_helpers.py:224:34 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ] + div.full.f32 %r105, %r104, %r74; + .loc 2 224 26 // triton_helpers.py:224:26 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ] + add.f32 %r106, %r72, %r105; + .loc 2 225 39 // triton_helpers.py:225:39 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ] + sub.f32 %r107, %r103, %r106; + .loc 2 225 22 // triton_helpers.py:225:22 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ] + fma.rn.f32 %r108, %r104, %r107, 0f00000000; + .loc 2 222 24 // triton_helpers.py:222:24 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ] + sub.f32 %r109, %r102, %r73; + .loc 2 224 34 // triton_helpers.py:224:34 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ] + div.full.f32 %r110, %r109, %r74; + .loc 2 224 26 // triton_helpers.py:224:26 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ] + add.f32 %r111, %r73, %r110; + .loc 2 225 39 // triton_helpers.py:225:39 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ] + sub.f32 %r112, %r102, %r111; + .loc 2 225 22 // triton_helpers.py:225:22 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ] + fma.rn.f32 %r113, %r109, %r112, 0f00000000; +$L__tmp4: + .loc 1 48 62 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:48:62 + selp.f32 %r114, %r106, 0f00000000, %p1; + selp.f32 %r115, %r111, 0f00000000, %p1; + .loc 1 49 58 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:49:58 + selp.f32 %r116, %r108, 0f00000000, %p1; + selp.f32 %r117, %r113, 0f00000000, %p1; + .loc 1 51 52 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:51:52 + cvt.rn.bf16x2.f32 %r17, %r102, %r103; + // begin inline asm + @%p1 st.global.v2.b32 [ %rd14 + 0 ], { %r16, %r17 }; + // end inline asm + .loc 1 26 37 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:26:37 + and.b32 %r118, %r50, 511; + and.b32 %r119, %r50, 31; +$L__tmp5: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r120, %r95, %r94; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r121, 0f40800000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p6, %r121, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r122, %r75, %r121; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r123, 0f00000000, %r122, %p6; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r124, %r120, %r123, %r94; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r125, %r88, %r93; + selp.f32 %r126, %r125, 0f00000000, %p1; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r127, %r120, %r120; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r128, %r127, %r75; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r129, %r128, %r123, %r126; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r130, %r114, %r124; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r131, 0f40C00000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p7, %r131, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r132, %r75, %r131; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r133, 0f00000000, %r132, %p7; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r134, %r133, %r130, %r124; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r135, %r116, %r129; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r136, %r130, %r130; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r137, %r121, %r136; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r138, %r133, %r137, %r135; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r139, %r115, %r134; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r140, 0f41000000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p8, %r140, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r141, %r75, %r140; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r142, 0f00000000, %r141, %p8; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r143, %r142, %r139, %r134; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r144, %r117, %r138; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r145, %r139, %r139; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r146, %r131, %r145; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r147, %r142, %r146, %r144; +$L__tmp6: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] + shfl.sync.bfly.b32 %r148, %r143, 16, 31, -1; + shfl.sync.bfly.b32 %r149, %r147, 16, 31, -1; + shfl.sync.bfly.b32 %r150, %r140, 16, 31, -1; +$L__tmp7: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r151, %r148, %r143; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r152, %r140, %r150; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p9, %r152, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r153, %r150, %r152; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r154, 0f00000000, %r153, %p9; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r155, %r154, %r151, %r143; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r156, %r147, %r149; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r157, %r151, %r151; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r158, %r140, %r157; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r159, %r154, %r158, %r156; +$L__tmp8: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] + shfl.sync.bfly.b32 %r160, %r155, 8, 31, -1; + shfl.sync.bfly.b32 %r161, %r159, 8, 31, -1; + shfl.sync.bfly.b32 %r162, %r152, 8, 31, -1; +$L__tmp9: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r163, %r160, %r155; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r164, %r152, %r162; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p10, %r164, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r165, %r162, %r164; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r166, 0f00000000, %r165, %p10; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r167, %r163, %r166, %r155; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r168, %r159, %r161; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r169, %r163, %r163; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r170, %r152, %r169; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r171, %r166, %r170, %r168; +$L__tmp10: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] + shfl.sync.bfly.b32 %r172, %r167, 4, 31, -1; + shfl.sync.bfly.b32 %r173, %r171, 4, 31, -1; + shfl.sync.bfly.b32 %r174, %r164, 4, 31, -1; +$L__tmp11: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r175, %r172, %r167; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r176, %r164, %r174; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p11, %r176, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r177, %r174, %r176; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r178, 0f00000000, %r177, %p11; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r179, %r175, %r178, %r167; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r180, %r171, %r173; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r181, %r175, %r175; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r182, %r164, %r181; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r183, %r178, %r182, %r180; +$L__tmp12: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] + shfl.sync.bfly.b32 %r184, %r179, 2, 31, -1; + shfl.sync.bfly.b32 %r185, %r183, 2, 31, -1; + shfl.sync.bfly.b32 %r186, %r176, 2, 31, -1; +$L__tmp13: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r187, %r184, %r179; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r188, %r176, %r186; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p12, %r188, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r189, %r186, %r188; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r190, 0f00000000, %r189, %p12; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r191, %r187, %r190, %r179; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r192, %r183, %r185; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r193, %r187, %r187; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r194, %r176, %r193; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r195, %r190, %r194, %r192; +$L__tmp14: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] + shfl.sync.bfly.b32 %r196, %r191, 1, 31, -1; + shfl.sync.bfly.b32 %r197, %r195, 1, 31, -1; + shfl.sync.bfly.b32 %r198, %r188, 1, 31, -1; +$L__tmp15: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r199, %r196, %r191; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r23, %r188, %r198; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p13, %r23, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r200, %r198, %r23; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r201, 0f00000000, %r200, %p13; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r19, %r199, %r201, %r191; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r202, %r195, %r197; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r203, %r199, %r199; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r204, %r188, %r203; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r21, %r201, %r204, %r202; +$L__tmp16: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] + setp.eq.b32 %p3, %r119, 0; + shr.u32 %r205, %r50, 3; + and.b32 %r206, %r205, 60; + mov.b32 %r207, global_smem; + add.s32 %r18, %r207, %r206; + // begin inline asm + @%p3 st.shared.b32 [ %r18 + 0 ], %r19; + // end inline asm + add.s32 %r20, %r18, 64; + // begin inline asm + @%p3 st.shared.b32 [ %r20 + 0 ], %r21; + // end inline asm + add.s32 %r22, %r18, 128; + // begin inline asm + @%p3 st.shared.b32 [ %r22 + 0 ], %r23; + // end inline asm + bar.sync 0; + setp.lt.u32 %p4, %r118, 16; + shl.b32 %r208, %r118, 2; + add.s32 %r25, %r207, %r208; + // begin inline asm + @%p4 ld.shared.b32 %r24, [ %r25 + 0 ]; + // end inline asm + add.s32 %r27, %r25, 64; + // begin inline asm + @%p4 ld.shared.b32 %r26, [ %r27 + 0 ]; + // end inline asm + add.s32 %r29, %r25, 128; + // begin inline asm + @%p4 ld.shared.b32 %r28, [ %r29 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r209, %r24, 8, 31, -1; + shfl.sync.bfly.b32 %r210, %r26, 8, 31, -1; + shfl.sync.bfly.b32 %r211, %r28, 8, 31, -1; +$L__tmp17: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r212, %r209, %r24; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r213, %r28, %r211; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p14, %r213, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r214, %r211, %r213; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r215, 0f00000000, %r214, %p14; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r216, %r212, %r215, %r24; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r217, %r26, %r210; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r218, %r212, %r212; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r219, %r218, %r28; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r220, %r219, %r215, %r217; +$L__tmp18: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] + shfl.sync.bfly.b32 %r221, %r216, 4, 31, -1; + shfl.sync.bfly.b32 %r222, %r220, 4, 31, -1; + shfl.sync.bfly.b32 %r223, %r213, 4, 31, -1; +$L__tmp19: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r224, %r221, %r216; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r225, %r213, %r223; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p15, %r225, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r226, %r223, %r225; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r227, 0f00000000, %r226, %p15; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r228, %r224, %r227, %r216; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r229, %r220, %r222; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r230, %r224, %r224; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r231, %r213, %r230; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r232, %r227, %r231, %r229; +$L__tmp20: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] + shfl.sync.bfly.b32 %r233, %r228, 2, 31, -1; + shfl.sync.bfly.b32 %r234, %r232, 2, 31, -1; + shfl.sync.bfly.b32 %r235, %r225, 2, 31, -1; +$L__tmp21: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r236, %r233, %r228; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r237, %r225, %r235; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p16, %r237, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r238, %r235, %r237; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r239, 0f00000000, %r238, %p16; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r240, %r236, %r239, %r228; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r241, %r232, %r234; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r242, %r236, %r236; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r243, %r225, %r242; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r244, %r239, %r243, %r241; +$L__tmp22: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] + shfl.sync.bfly.b32 %r245, %r240, 1, 31, -1; + shfl.sync.bfly.b32 %r246, %r244, 1, 31, -1; + shfl.sync.bfly.b32 %r247, %r237, 1, 31, -1; +$L__tmp23: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r248, %r245, %r240; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r32, %r237, %r247; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p17, %r32, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r249, %r247, %r32; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r250, 0f00000000, %r249, %p17; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r30, %r248, %r250, %r240; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r251, %r244, %r246; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r252, %r248, %r248; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r253, %r237, %r252; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r31, %r250, %r253, %r251; +$L__tmp24: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] + and.b32 %r254, %r50, 15; + setp.eq.b32 %p18, %r254, 0; + and.pred %p5, %p4, %p18; + // begin inline asm + @%p5 st.shared.b32 [ %r25 + 0 ], %r30; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r27 + 0 ], %r31; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r29 + 0 ], %r32; + // end inline asm + bar.sync 0; + ld.shared.b32 %r255, [global_smem]; + ld.shared.b32 %r256, [global_smem+64]; + mov.b32 %r257, 0f45800000; +$L__tmp25: + .loc 1 68 25 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:68:25 + div.full.f32 %r258, %r256, %r257; + .loc 1 70 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:70:24 + add.f32 %r259, %r258, 0f358637BD; + .loc 1 71 32 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:71:32 + rsqrt.approx.ftz.f32 %r260, %r259; + .loc 1 62 53 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:62:53 + // begin inline asm + mov.u64 %rd15, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd15, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r33, %r3; + mov.u32 %r34, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r33, %r34 }, [ %rd7 + 0 ], %rd15; + // end inline asm + .loc 1 63 35 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:35 + add.s64 %rd16, %rd30, %rd38; + .loc 1 63 42 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:42 + // begin inline asm + mov.u64 %rd17, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r35, %r3; + mov.u32 %r36, %r3; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r35, %r36 }, [ %rd16 + 0 ], %rd17; + // end inline asm + .loc 1 64 35 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:35 + add.s64 %rd18, %rd31, %rd38; + .loc 1 64 42 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:42 + // begin inline asm + mov.u64 %rd19, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd19, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r37, %r3; + mov.u32 %r38, %r3; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r37, %r38 }, [ %rd18 + 0 ], %rd19; + // end inline asm + .loc 1 78 29 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:78:29 + add.s64 %rd20, %rd33, %rd37; + .loc 1 62 115 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:62:115 + mov.b32 {%rs25, %rs26}, %r33; + cvt.f32.bf16 %r261, %rs26; + cvt.f32.bf16 %r262, %rs25; + .loc 1 63 95 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:95 + mov.b32 {%rs27, %rs28}, %r35; + cvt.f32.bf16 %r263, %rs27; + cvt.f32.bf16 %r264, %rs28; + .loc 1 64 95 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:95 + mov.b32 {%rs29, %rs30}, %r37; + cvt.f32.bf16 %r265, %rs30; + cvt.f32.bf16 %r266, %rs29; + .loc 1 66 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:66:24 + sub.f32 %r267, %r262, %r255; + sub.f32 %r268, %r261, %r255; + .loc 1 72 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:72:24 + mul.f32 %r269, %r260, %r268; + mul.f32 %r270, %r260, %r267; + .loc 1 75 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:75:24 + add.f32 %r271, %r264, 0f3F800000; + add.f32 %r272, %r263, 0f3F800000; + .loc 1 77 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:77:24 + fma.rn.f32 %r273, %r270, %r272, %r266; + fma.rn.f32 %r274, %r269, %r271, %r265; + .loc 1 78 53 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:78:53 + cvt.rn.bf16x2.f32 %r39, %r274, %r273; + .loc 1 62 115 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:62:115 + mov.b32 {%rs31, %rs32}, %r34; + cvt.f32.bf16 %r275, %rs32; + cvt.f32.bf16 %r276, %rs31; + .loc 1 63 95 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:95 + mov.b32 {%rs33, %rs34}, %r36; + cvt.f32.bf16 %r277, %rs33; + cvt.f32.bf16 %r278, %rs34; + .loc 1 64 95 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:95 + mov.b32 {%rs35, %rs36}, %r38; + cvt.f32.bf16 %r279, %rs36; + cvt.f32.bf16 %r280, %rs35; + .loc 1 66 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:66:24 + sub.f32 %r281, %r276, %r255; + sub.f32 %r282, %r275, %r255; + .loc 1 72 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:72:24 + mul.f32 %r283, %r260, %r282; + mul.f32 %r284, %r260, %r281; + .loc 1 75 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:75:24 + add.f32 %r285, %r278, 0f3F800000; + add.f32 %r286, %r277, 0f3F800000; + .loc 1 77 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:77:24 + fma.rn.f32 %r287, %r284, %r286, %r280; + fma.rn.f32 %r288, %r283, %r285, %r279; + .loc 1 78 53 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:78:53 + cvt.rn.bf16x2.f32 %r40, %r288, %r287; + // begin inline asm + @%p1 st.global.v2.b32 [ %rd20 + 0 ], { %r39, %r40 }; + // end inline asm + .loc 1 62 53 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:62:53 + // begin inline asm + mov.u64 %rd21, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd21, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r41, %r3; + mov.u32 %r42, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r41, %r42 }, [ %rd14 + 0 ], %rd21; + // end inline asm + .loc 1 63 35 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:35 + add.s64 %rd22, %rd16, 4096; + .loc 1 63 42 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:42 + // begin inline asm + mov.u64 %rd23, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd23, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r43, %r3; + mov.u32 %r44, %r3; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r43, %r44 }, [ %rd22 + 0 ], %rd23; + // end inline asm + .loc 1 64 35 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:35 + add.s64 %rd24, %rd18, 4096; + .loc 1 64 42 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:42 + // begin inline asm + mov.u64 %rd25, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd25, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r45, %r3; + mov.u32 %r46, %r3; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r45, %r46 }, [ %rd24 + 0 ], %rd25; + // end inline asm + .loc 1 78 29 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:78:29 + add.s64 %rd26, %rd20, 4096; + .loc 1 62 115 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:62:115 + mov.b32 {%rs37, %rs38}, %r41; + cvt.f32.bf16 %r289, %rs38; + cvt.f32.bf16 %r290, %rs37; + .loc 1 63 95 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:95 + mov.b32 {%rs39, %rs40}, %r43; + cvt.f32.bf16 %r291, %rs39; + cvt.f32.bf16 %r292, %rs40; + .loc 1 64 95 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:95 + mov.b32 {%rs41, %rs42}, %r45; + cvt.f32.bf16 %r293, %rs42; + cvt.f32.bf16 %r294, %rs41; + .loc 1 66 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:66:24 + sub.f32 %r295, %r290, %r255; + sub.f32 %r296, %r289, %r255; + .loc 1 72 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:72:24 + mul.f32 %r297, %r260, %r296; + mul.f32 %r298, %r260, %r295; + .loc 1 75 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:75:24 + add.f32 %r299, %r292, 0f3F800000; + add.f32 %r300, %r291, 0f3F800000; + .loc 1 77 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:77:24 + fma.rn.f32 %r301, %r298, %r300, %r294; + fma.rn.f32 %r302, %r297, %r299, %r293; + .loc 1 78 53 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:78:53 + cvt.rn.bf16x2.f32 %r47, %r302, %r301; + .loc 1 62 115 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:62:115 + mov.b32 {%rs43, %rs44}, %r42; + cvt.f32.bf16 %r303, %rs44; + cvt.f32.bf16 %r304, %rs43; + .loc 1 63 95 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:95 + mov.b32 {%rs45, %rs46}, %r44; + cvt.f32.bf16 %r305, %rs45; + cvt.f32.bf16 %r306, %rs46; + .loc 1 64 95 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:95 + mov.b32 {%rs47, %rs48}, %r46; + cvt.f32.bf16 %r307, %rs48; + cvt.f32.bf16 %r308, %rs47; + .loc 1 66 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:66:24 + sub.f32 %r309, %r304, %r255; + sub.f32 %r310, %r303, %r255; + .loc 1 72 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:72:24 + mul.f32 %r311, %r260, %r310; + mul.f32 %r312, %r260, %r309; + .loc 1 75 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:75:24 + add.f32 %r313, %r306, 0f3F800000; + add.f32 %r314, %r305, 0f3F800000; + .loc 1 77 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:77:24 + fma.rn.f32 %r315, %r312, %r314, %r308; + fma.rn.f32 %r316, %r311, %r313, %r307; + .loc 1 78 53 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:78:53 + cvt.rn.bf16x2.f32 %r48, %r316, %r315; + // begin inline asm + @%p1 st.global.v2.b32 [ %rd26 + 0 ], { %r47, %r48 }; + // end inline asm + .loc 1 56 4 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:56:4 + ret; +$L__tmp26: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 367 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x168 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 119 +.b8 51 +.b8 106 +.b8 98 +.b8 105 +.b8 121 +.b8 53 +.b8 122 +.b8 114 +.b8 107 +.b8 121 +.b8 109 +.b8 55 +.b8 118 +.b8 107 +.b8 110 +.b8 110 +.b8 51 +.b8 122 +.b8 105 +.b8 117 +.b8 107 +.b8 51 +.b8 113 +.b8 105 +.b8 109 +.b8 108 +.b8 98 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 50 +.b8 98 +.b8 98 +.b8 122 +.b8 51 +.b8 115 +.b8 117 +.b8 102 +.b8 54 +.b8 113 +.b8 120 +.b8 105 +.b8 106 +.b8 110 +.b8 98 +.b8 102 +.b8 99 +.b8 51 +.b8 121 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 119 +.b8 51 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x2f DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 100 +.b8 100 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 110 +.b8 97 +.b8 116 +.b8 105 +.b8 118 +.b8 101 +.b8 95 +.b8 108 +.b8 97 +.b8 121 +.b8 101 +.b8 114 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x113:0x5f DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x128:0x18 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 46 // DW_AT_call_line +.b8 51 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x140:0x31 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp5 // DW_AT_low_pc +.b64 $L__tmp25 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 52 // DW_AT_call_line +.b8 80 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x158:0x18 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp5 // DW_AT_low_pc +.b64 $L__tmp24 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 243 // DW_AT_call_line +.b8 46 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.source b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.source new file mode 100644 index 0000000000000000000000000000000000000000..d7a9ab9a44a28eeacd17f6ff4828312abc8408ac --- /dev/null +++ b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.source @@ -0,0 +1,486 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":18:0) +#loc88 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":216:0) +#loc101 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":133:0) +#loc105 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":242:0) +#loc107 = loc(unknown) +#loc110 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":230:0) +#loc125 = loc("in_ptr0"(#loc)) +#loc126 = loc("in_ptr1"(#loc)) +#loc127 = loc("in_ptr2"(#loc)) +#loc128 = loc("in_ptr3"(#loc)) +#loc129 = loc("in_ptr4"(#loc)) +#loc130 = loc("out_ptr0"(#loc)) +#loc131 = loc("out_ptr3"(#loc)) +#loc132 = loc("xnumel"(#loc)) +#loc133 = loc("r0_numel"(#loc)) +#loc201 = loc("value"(#loc88)) +#loc202 = loc("mean"(#loc88)) +#loc203 = loc("m2"(#loc88)) +#loc204 = loc("weight"(#loc88)) +#loc205 = loc("first_iteration"(#loc88)) +#loc215 = loc("input"(#loc101)) +#loc216 = loc("mean"(#loc105)) +#loc217 = loc("m2"(#loc105)) +#loc218 = loc("weight"(#loc105)) +#loc219 = loc("mean_1"(#loc110)) +#loc220 = loc("m2_1"(#loc110)) +#loc221 = loc("weight_1"(#loc110)) +#loc222 = loc("mean_2"(#loc110)) +#loc223 = loc("m2_2"(#loc110)) +#loc224 = loc("weight_2"(#loc110)) +#loc231 = loc("new_mean"(#loc201)) +module { + tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 2048 : i32 loc(#loc134) + %r0_numel_1 = arith.constant 4096 : i32 loc(#loc135) + %xoffset = tt.get_program_id x : i32 loc(#loc136) + %xoffset_2 = arith.constant 1 : i32 loc(#loc137) + %xoffset_3 = arith.constant 1 : i32 loc(#loc137) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc137) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc138) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc139) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc140) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc140) + %xmask = arith.constant dense<2048> : tensor<1x1xi32> loc(#loc141) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc141) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc142) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc143) + %tmp7_mean = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc144) + %tmp7_m2 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc145) + %tmp7_weight = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc146) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %tmp7_weight_10:3 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%tmp7_mean_13 = %tmp7_mean, %tmp7_m2_14 = %tmp7_m2, %tmp7_weight_15 = %tmp7_weight) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc148) + %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc148) + %r0_mask = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc149) + %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x2048xi32> loc(#loc149) + %tmp0 = arith.constant 4096 : i32 loc(#loc150) + %tmp0_18 = arith.constant 4096 : i32 loc(#loc150) + %tmp0_19 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc150) + %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc150) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc151) + %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x2048xi32> loc(#loc151) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc152) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc152) + %tmp0_25 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc153) + %tmp0_26 = arith.andi %r0_mask_17, %tmp0_25 : tensor<1x2048xi1> loc(#loc153) + %tmp0_27 = arith.constant 0.000000e+00 : f32 loc(#loc154) + %tmp0_28 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc154) + %tmp0_29 = arith.truncf %tmp0_28 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc154) + %tmp0_30 = tt.load %tmp0_24, %tmp0_26, %tmp0_29 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc154) + %tmp0_31 = arith.extf %tmp0_30 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc155) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc156) + %tmp1_32 = tt.addptr %tmp1, %r0_index_16 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc156) + %tmp1_33 = arith.constant 0.000000e+00 : f32 loc(#loc157) + %tmp1_34 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc157) + %tmp1_35 = arith.truncf %tmp1_34 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc157) + %tmp1_36 = tt.load %tmp1_32, %r0_mask_17, %tmp1_35 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc157) + %tmp1_37 = arith.extf %tmp1_36 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc158) + %tmp2 = arith.constant 4096 : i32 loc(#loc159) + %tmp2_38 = arith.constant 4096 : i32 loc(#loc159) + %tmp2_39 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc159) + %tmp2_40 = arith.muli %tmp2_39, %xindex_7 : tensor<1x1xi32> loc(#loc159) + %tmp2_41 = tt.broadcast %tmp2_40 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc160) + %tmp2_42 = arith.addi %r0_index_16, %tmp2_41 : tensor<1x2048xi32> loc(#loc160) + %tmp2_43 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc161) + %tmp2_44 = tt.addptr %tmp2_43, %tmp2_42 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc161) + %tmp2_45 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc162) + %tmp2_46 = arith.andi %r0_mask_17, %tmp2_45 : tensor<1x2048xi1> loc(#loc162) + %tmp2_47 = arith.constant 0.000000e+00 : f32 loc(#loc163) + %tmp2_48 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc163) + %tmp2_49 = arith.truncf %tmp2_48 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc163) + %tmp2_50 = tt.load %tmp2_44, %tmp2_46, %tmp2_49 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc163) + %tmp2_51 = arith.extf %tmp2_50 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc164) + %tmp3 = arith.mulf %tmp1_37, %tmp2_51 : tensor<1x2048xf32> loc(#loc165) + %tmp4 = arith.addf %tmp0_31, %tmp3 : tensor<1x2048xf32> loc(#loc166) + %c0_i32_52 = arith.constant 0 : i32 loc(#loc34) + %9 = arith.cmpi eq, %r0_offset, %c0_i32_52 : i32 loc(#loc34) + %10:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_u1__(%tmp4, %tmp7_mean_13, %tmp7_m2_14, %tmp7_weight_15, %9) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>, i1) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) loc(#loc35) + %tmp7_mean_53 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc167) + %tmp7_mean_54 = arith.andi %r0_mask_17, %tmp7_mean_53 : tensor<1x2048xi1> loc(#loc167) + %tmp7_mean_55 = arith.select %tmp7_mean_54, %10#0, %tmp7_mean_13 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc168) + %tmp7_m2_56 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc169) + %tmp7_m2_57 = arith.andi %r0_mask_17, %tmp7_m2_56 : tensor<1x2048xi1> loc(#loc169) + %tmp7_m2_58 = arith.select %tmp7_m2_57, %10#1, %tmp7_m2_14 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc170) + %tmp7_weight_59 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc171) + %tmp7_weight_60 = arith.andi %r0_mask_17, %tmp7_weight_59 : tensor<1x2048xi1> loc(#loc171) + %tmp7_weight_61 = arith.select %tmp7_weight_60, %10#2, %tmp7_weight_15 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc172) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc42) + %c4096_i32_62 = arith.constant 4096 : i32 loc(#loc42) + %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc42) + %11 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc42) + %12 = tt.broadcast %11 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc43) + %13 = arith.addi %r0_index_16, %12 : tensor<1x2048xi32> loc(#loc43) + %14 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc44) + %15 = tt.addptr %14, %13 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc44) + %16 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc45) + %17 = arith.andi %r0_mask_17, %16 : tensor<1x2048xi1> loc(#loc45) + %18 = arith.truncf %tmp4 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc46) + tt.store %15, %18, %17 : tensor<1x2048x!tt.ptr> loc(#loc46) + scf.yield %tmp7_mean_55, %tmp7_m2_58, %tmp7_weight_61 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc47) + } loc(#loc237) + %4:3 = tt.call @"torch._inductor.runtime.triton_helpers.welford__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_1_"(%tmp7_weight_10#0, %tmp7_weight_10#1, %tmp7_weight_10#2) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc48) + %tmp7 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc173) + %tmp11 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc174) + %tmp12 = tt.expand_dims %4#2 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc175) + %c0_i32_11 = arith.constant 0 : i32 loc(#loc52) + %c2048_i32_12 = arith.constant 2048 : i32 loc(#loc52) + %5 = arith.bitcast %c0_i32_11 : i32 to i32 loc(#loc52) + %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc52) + %7 = arith.bitcast %c2048_i32_12 : i32 to i32 loc(#loc52) + %8 = ub.poison : i32 loc(#loc52) + scf.for %r0_offset = %5 to %6 step %7 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc176) + %r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc176) + %r0_mask = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc177) + %r0_mask_14 = arith.cmpi slt, %r0_index_13, %r0_mask : tensor<1x2048xi32> loc(#loc177) + %tmp13 = arith.constant 4096 : i32 loc(#loc178) + %tmp13_15 = arith.constant 4096 : i32 loc(#loc178) + %tmp13_16 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc178) + %tmp13_17 = arith.muli %tmp13_16, %xindex_7 : tensor<1x1xi32> loc(#loc178) + %tmp13_18 = tt.broadcast %tmp13_17 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc179) + %tmp13_19 = arith.addi %r0_index_13, %tmp13_18 : tensor<1x2048xi32> loc(#loc179) + %tmp13_20 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc180) + %tmp13_21 = tt.addptr %tmp13_20, %tmp13_19 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc180) + %tmp13_22 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc181) + %tmp13_23 = arith.andi %r0_mask_14, %tmp13_22 : tensor<1x2048xi1> loc(#loc181) + %tmp13_24 = arith.constant 0.000000e+00 : f32 loc(#loc182) + %tmp13_25 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc182) + %tmp13_26 = arith.truncf %tmp13_25 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc182) + %tmp13_27 = tt.load %tmp13_21, %tmp13_23, %tmp13_26 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc182) + %tmp13_28 = arith.extf %tmp13_27 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc183) + %tmp23 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc184) + %tmp23_29 = tt.addptr %tmp23, %r0_index_13 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc184) + %tmp23_30 = arith.constant 0.000000e+00 : f32 loc(#loc185) + %tmp23_31 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc185) + %tmp23_32 = arith.truncf %tmp23_31 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc185) + %tmp23_33 = tt.load %tmp23_29, %r0_mask_14, %tmp23_32 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc185) + %tmp23_34 = arith.extf %tmp23_33 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc186) + %tmp27 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc187) + %tmp27_35 = tt.addptr %tmp27, %r0_index_13 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc187) + %tmp27_36 = arith.constant 0.000000e+00 : f32 loc(#loc188) + %tmp27_37 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc188) + %tmp27_38 = arith.truncf %tmp27_37 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc188) + %tmp27_39 = tt.load %tmp27_35, %r0_mask_14, %tmp27_38 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc188) + %tmp27_40 = arith.extf %tmp27_39 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc189) + %tmp15 = tt.broadcast %tmp7 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc190) + %tmp15_41 = arith.subf %tmp13_28, %tmp15 : tensor<1x2048xf32> loc(#loc190) + %tmp16 = arith.constant 4.096000e+03 : f32 loc(#loc191) + %tmp17 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc192) + %tmp17_42 = arith.divf %tmp11, %tmp17 : tensor<1x1xf32> loc(#loc192) + %tmp18 = arith.constant 9.99999997E-7 : f32 loc(#loc193) + %tmp19 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc194) + %tmp19_43 = arith.addf %tmp17_42, %tmp19 : tensor<1x1xf32> loc(#loc194) + %tmp20 = tt.extern_elementwise %tmp19_43 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc195) + %tmp21 = tt.broadcast %tmp20 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc196) + %tmp21_44 = arith.mulf %tmp15_41, %tmp21 : tensor<1x2048xf32> loc(#loc196) + %tmp24 = arith.constant 1.000000e+00 : f32 loc(#loc197) + %tmp25 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc198) + %tmp25_45 = arith.addf %tmp23_34, %tmp25 : tensor<1x2048xf32> loc(#loc198) + %tmp26 = arith.mulf %tmp21_44, %tmp25_45 : tensor<1x2048xf32> loc(#loc199) + %tmp28 = arith.addf %tmp26, %tmp27_40 : tensor<1x2048xf32> loc(#loc200) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc78) + %c4096_i32_46 = arith.constant 4096 : i32 loc(#loc78) + %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc78) + %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc78) + %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc79) + %11 = arith.addi %r0_index_13, %10 : tensor<1x2048xi32> loc(#loc79) + %12 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc80) + %13 = tt.addptr %12, %11 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc80) + %14 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc81) + %15 = arith.andi %r0_mask_14, %14 : tensor<1x2048xi1> loc(#loc81) + %16 = arith.truncf %tmp28 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc82) + tt.store %13, %16, %15 : tensor<1x2048x!tt.ptr> loc(#loc82) + } loc(#loc52) + tt.return loc(#loc83) + } loc(#loc) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() -> tensor<1x2048xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc85) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc85) + tt.return %cst_0 : tensor<1x2048xf32> loc(#loc86) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x2048xf32> loc(#loc87) + tt.return %0 : tensor<1x2048xf32> loc(#loc87) + } loc(#loc84) + tt.func private @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_u1__(%new_mean: tensor<1x2048xf32> loc("new_mean"(#loc201)), %mean: tensor<1x2048xf32> loc("mean"(#loc88)), %m2: tensor<1x2048xf32> loc("m2"(#loc88)), %weight: tensor<1x2048xf32> loc("weight"(#loc88)), %first_iteration: i1 loc("first_iteration"(#loc88))) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) attributes {noinline = false} { + %0:3 = scf.if %first_iteration -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) { + %new_weight = arith.constant 1.000000e+00 : f32 loc(#loc206) + %new_weight_0 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc232) + %new_m2 = tt.call @triton.language.standard.zeros_like__fp32S1_2048S__(%m2) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc233) + scf.yield %new_m2, %new_mean, %new_weight_0 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc233) + } else { + %delta = arith.subf %new_mean, %mean : tensor<1x2048xf32> loc(#loc208) + %new_weight = arith.constant 1 : i32 loc(#loc209) + %new_weight_0 = arith.constant 1.000000e+00 : f32 loc(#loc209) + %new_weight_1 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc209) + %new_weight_2 = arith.addf %weight, %new_weight_1 : tensor<1x2048xf32> loc(#loc234) + %new_mean_3 = arith.divf %delta, %new_weight_2 : tensor<1x2048xf32> loc(#loc210) + %new_mean_4 = arith.addf %mean, %new_mean_3 : tensor<1x2048xf32> loc(#loc235) + %new_m2 = arith.subf %new_mean, %new_mean_4 : tensor<1x2048xf32> loc(#loc212) + %new_m2_5 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32> loc(#loc213) + %new_m2_6 = arith.addf %m2, %new_m2_5 : tensor<1x2048xf32> loc(#loc236) + scf.yield %new_m2_6, %new_mean_4, %new_weight_2 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc214) + } loc(#loc89) + tt.return %0#1, %0#0, %0#2 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc99) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x2048xf32> loc(#loc100) + %2 = ub.poison : tensor<1x2048xf32> loc(#loc100) + %3 = ub.poison : tensor<1x2048xf32> loc(#loc100) + tt.return %1, %2, %3 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc100) + } loc(#loc88) + tt.func private @triton.language.standard.zeros_like__fp32S1_2048S__(%input: tensor<1x2048xf32> loc("input"(#loc101))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc102) + tt.return %0 : tensor<1x2048xf32> loc(#loc103) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x2048xf32> loc(#loc104) + tt.return %1 : tensor<1x2048xf32> loc(#loc104) + } loc(#loc101) + tt.func private @"torch._inductor.runtime.triton_helpers.welford__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_1_"(%mean: tensor<1x2048xf32> loc("mean"(#loc105)), %m2: tensor<1x2048xf32> loc("m2"(#loc105)), %weight: tensor<1x2048xf32> loc("weight"(#loc105))) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} { + %0:3 = "tt.reduce"(%mean, %m2, %weight) <{axis = 1 : i32}> ({ + ^bb0(%arg3: f32 loc(unknown), %arg4: f32 loc(unknown), %arg5: f32 loc(unknown), %arg6: f32 loc(unknown), %arg7: f32 loc(unknown), %arg8: f32 loc(unknown)): + %4:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (f32, f32, f32, f32, f32, f32) -> (f32, f32, f32) loc(#loc106) + tt.reduce.return %4#0, %4#1, %4#2 : f32, f32, f32 loc(#loc106) + }) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc106) + tt.return %0#0, %0#1, %0#2 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc108) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc109) + %2 = ub.poison : tensor<1xf32> loc(#loc109) + %3 = ub.poison : tensor<1xf32> loc(#loc109) + tt.return %1, %2, %3 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc109) + } loc(#loc105) + tt.func private @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%mean_1: f32 loc("mean_1"(#loc110)), %m2_1: f32 loc("m2_1"(#loc110)), %weight_1: f32 loc("weight_1"(#loc110)), %mean_2: f32 loc("mean_2"(#loc110)), %m2_2: f32 loc("m2_2"(#loc110)), %weight_2: f32 loc("weight_2"(#loc110))) -> (f32, f32, f32) attributes {noinline = false} { + %delta = arith.subf %mean_2, %mean_1 : f32 loc(#loc225) + %new_weight = arith.addf %weight_1, %weight_2 : f32 loc(#loc226) + %w2_over_w = arith.constant 0.000000e+00 : f32 loc(#loc227) + %w2_over_w_0 = arith.cmpf oeq, %new_weight, %w2_over_w : f32 loc(#loc227) + %w2_over_w_1 = arith.divf %weight_2, %new_weight : f32 loc(#loc228) + %w2_over_w_2 = arith.constant 0.000000e+00 : f32 loc(#loc229) + %w2_over_w_3 = arith.constant 0.000000e+00 : f32 loc(#loc229) + %w2_over_w_4 = arith.select %w2_over_w_0, %w2_over_w_3, %w2_over_w_1 : f32 loc(#loc229) + %0 = arith.mulf %delta, %w2_over_w_4 : f32 loc(#loc116) + %1 = arith.addf %mean_1, %0 : f32 loc(#loc117) + %2 = arith.addf %m2_1, %m2_2 : f32 loc(#loc118) + %3 = arith.mulf %delta, %delta : f32 loc(#loc119) + %4 = arith.mulf %3, %weight_1 : f32 loc(#loc120) + %5 = arith.mulf %4, %w2_over_w_4 : f32 loc(#loc121) + %6 = arith.addf %2, %5 : f32 loc(#loc122) + tt.return %1, %6, %new_weight : f32, f32, f32 loc(#loc123) + ^bb1: // no predecessors + %7 = ub.poison : f32 loc(#loc124) + %8 = ub.poison : f32 loc(#loc124) + %9 = ub.poison : f32 loc(#loc124) + tt.return %7, %8, %9 : f32, f32, f32 loc(#loc124) + } loc(#loc110) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":25:21) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":29:45) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":30:43) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":31:47) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:46) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:34) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:61) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:51) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:113) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:34) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:41) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:94) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:46) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:41) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:34) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:61) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:51) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:113) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":41:22) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":42:22) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":46:62) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":46:51) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":48:39) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":48:62) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":49:37) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":49:58) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":50:41) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":50:66) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:41) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:36) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:29) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:62) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:52) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:8) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":52:80) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":53:16) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":54:17) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":55:18) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":56:43) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":57:31) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":58:29) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:48) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:43) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:36) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:63) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:53) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:115) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:35) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:42) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:95) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:35) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:42) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:95) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":66:24) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":67:16) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":68:25) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":69:16) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":70:24) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":71:32) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":72:24) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":74:16) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":75:24) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":76:24) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":77:24) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:41) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:36) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:29) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:63) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:53) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":56:4) +#loc84 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":120:0) +#loc85 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:31) +#loc86 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:11) +#loc87 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:4) +#loc89 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7) +#loc90 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":218:46) +#loc91 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31) +#loc92 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24) +#loc93 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30) +#loc94 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34) +#loc95 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26) +#loc96 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39) +#loc97 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31) +#loc98 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22) +#loc99 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:11) +#loc100 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:4) +#loc102 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:30) +#loc103 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:11) +#loc104 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:4) +#loc106 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc108 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:11) +#loc109 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:4) +#loc111 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc112 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc113 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc114 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc115 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc116 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc117 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc118 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc119 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc120 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc121 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc122 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc123 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:11) +#loc124 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:4) +#loc134 = loc("xnumel"(#loc1)) +#loc135 = loc("r0_numel"(#loc2)) +#loc136 = loc("xoffset"(#loc3)) +#loc137 = loc("xoffset"(#loc4)) +#loc138 = loc("xindex"(#loc5)) +#loc139 = loc("xindex"(#loc6)) +#loc140 = loc("xindex"(#loc7)) +#loc141 = loc("xmask"(#loc8)) +#loc142 = loc("r0_base"(#loc9)) +#loc143 = loc("r0_base"(#loc10)) +#loc144 = loc("tmp7_mean"(#loc11)) +#loc145 = loc("tmp7_m2"(#loc12)) +#loc146 = loc("tmp7_weight"(#loc13)) +#loc147 = loc("tmp7_mean"(#loc14)) +#loc148 = loc("r0_index"(#loc15)) +#loc149 = loc("r0_mask"(#loc16)) +#loc150 = loc("tmp0"(#loc17)) +#loc151 = loc("tmp0"(#loc18)) +#loc152 = loc("tmp0"(#loc19)) +#loc153 = loc("tmp0"(#loc20)) +#loc154 = loc("tmp0"(#loc21)) +#loc155 = loc("tmp0"(#loc22)) +#loc156 = loc("tmp1"(#loc23)) +#loc157 = loc("tmp1"(#loc24)) +#loc158 = loc("tmp1"(#loc25)) +#loc159 = loc("tmp2"(#loc26)) +#loc160 = loc("tmp2"(#loc27)) +#loc161 = loc("tmp2"(#loc28)) +#loc162 = loc("tmp2"(#loc29)) +#loc163 = loc("tmp2"(#loc30)) +#loc164 = loc("tmp2"(#loc31)) +#loc165 = loc("tmp3"(#loc32)) +#loc166 = loc("tmp4"(#loc33)) +#loc167 = loc("tmp7_mean"(#loc36)) +#loc168 = loc("tmp7_mean"(#loc37)) +#loc169 = loc("tmp7_m2"(#loc38)) +#loc170 = loc("tmp7_m2"(#loc39)) +#loc171 = loc("tmp7_weight"(#loc40)) +#loc172 = loc("tmp7_weight"(#loc41)) +#loc173 = loc("tmp7"(#loc49)) +#loc174 = loc("tmp11"(#loc50)) +#loc175 = loc("tmp12"(#loc51)) +#loc176 = loc("r0_index"(#loc53)) +#loc177 = loc("r0_mask"(#loc54)) +#loc178 = loc("tmp13"(#loc55)) +#loc179 = loc("tmp13"(#loc56)) +#loc180 = loc("tmp13"(#loc57)) +#loc181 = loc("tmp13"(#loc58)) +#loc182 = loc("tmp13"(#loc59)) +#loc183 = loc("tmp13"(#loc60)) +#loc184 = loc("tmp23"(#loc61)) +#loc185 = loc("tmp23"(#loc62)) +#loc186 = loc("tmp23"(#loc63)) +#loc187 = loc("tmp27"(#loc64)) +#loc188 = loc("tmp27"(#loc65)) +#loc189 = loc("tmp27"(#loc66)) +#loc190 = loc("tmp15"(#loc67)) +#loc191 = loc("tmp16"(#loc68)) +#loc192 = loc("tmp17"(#loc69)) +#loc193 = loc("tmp18"(#loc70)) +#loc194 = loc("tmp19"(#loc71)) +#loc195 = loc("tmp20"(#loc72)) +#loc196 = loc("tmp21"(#loc73)) +#loc197 = loc("tmp24"(#loc74)) +#loc198 = loc("tmp25"(#loc75)) +#loc199 = loc("tmp26"(#loc76)) +#loc200 = loc("tmp28"(#loc77)) +#loc206 = loc("new_weight"(#loc90)) +#loc207 = loc("new_m2"(#loc91)) +#loc208 = loc("delta"(#loc92)) +#loc209 = loc("new_weight"(#loc93)) +#loc210 = loc("new_mean"(#loc94)) +#loc211 = loc("new_mean"(#loc95)) +#loc212 = loc("new_m2"(#loc96)) +#loc213 = loc("new_m2"(#loc97)) +#loc214 = loc("new_m2"(#loc98)) +#loc225 = loc("delta"(#loc111)) +#loc226 = loc("new_weight"(#loc112)) +#loc227 = loc("w2_over_w"(#loc113)) +#loc228 = loc("w2_over_w"(#loc114)) +#loc229 = loc("w2_over_w"(#loc115)) +#loc230 = loc("tmp7_m2"(#loc147)) +#loc232 = loc("new_weight"(#loc206)) +#loc233 = loc("new_m2"(#loc207)) +#loc234 = loc("new_weight"(#loc209)) +#loc235 = loc("new_mean"(#loc211)) +#loc236 = loc("new_m2"(#loc214)) +#loc237 = loc("tmp7_weight"(#loc230)) diff --git a/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.ttgir b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..86151ad8e109b79e9f243cc9b35ae3fc961afe98 --- /dev/null +++ b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.ttgir @@ -0,0 +1,295 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":18:0) +#loc1 = loc(unknown) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":52:80) +#loc80 = loc("in_ptr0"(#loc)) +#loc81 = loc("in_ptr1"(#loc)) +#loc82 = loc("in_ptr2"(#loc)) +#loc83 = loc("in_ptr3"(#loc)) +#loc84 = loc("in_ptr4"(#loc)) +#loc85 = loc("out_ptr0"(#loc)) +#loc86 = loc("out_ptr3"(#loc)) +#loc87 = loc("xnumel"(#loc)) +#loc88 = loc("r0_numel"(#loc)) +#loc122 = loc(callsite(#loc1 at #loc40)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4096> : tensor<1x2048xi32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked> loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %cst_1 = arith.constant 0.000000e+00 : f32 loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<4.096000e+03> : tensor<1x1xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc89) + %xmask = arith.cmpi slt, %xoffset, %c2048_i32 : i32 loc(#loc90) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc91) + %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc91) + %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc92) + %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc151) + %tmp0_8 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc94) + %tmp0_9 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc152) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc96) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc97) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc11) + %tmp7_weight:3 = scf.for %tmp7_weight_10 = %c0_i32 to %c4096_i32 step %c2048_i32 iter_args(%arg10 = %cst_2, %arg11 = %cst_2, %arg12 = %cst_2) -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) : i32 { + %r0_index = tt.splat %tmp7_weight_10 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc99) + %r0_index_11 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32, #blocked> loc(#loc99) + %r0_mask = arith.cmpi slt, %r0_index_11, %cst : tensor<1x2048xi32, #blocked> loc(#loc100) + %tmp0_12 = arith.addi %r0_index_11, %tmp0_7 : tensor<1x2048xi32, #blocked> loc(#loc93) + %tmp0_13 = tt.addptr %tmp0_8, %tmp0_12 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc94) + %tmp0_14 = arith.andi %r0_mask, %tmp0_9 : tensor<1x2048xi1, #blocked> loc(#loc95) + %tmp0_15 = tt.load %tmp0_13, %tmp0_14, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc101) + %tmp0_16 = arith.extf %tmp0_15 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc102) + %tmp1_17 = tt.addptr %tmp1, %r0_index_11 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc96) + %tmp1_18 = tt.load %tmp1_17, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc103) + %tmp1_19 = arith.extf %tmp1_18 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc104) + %tmp2_20 = tt.addptr %tmp2, %tmp0_12 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc97) + %tmp2_21 = tt.load %tmp2_20, %tmp0_14, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc105) + %tmp2_22 = arith.extf %tmp2_21 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc106) + %tmp3 = arith.mulf %tmp1_19, %tmp2_22 : tensor<1x2048xf32, #blocked> loc(#loc107) + %tmp4 = arith.addf %tmp0_16, %tmp3 : tensor<1x2048xf32, #blocked> loc(#loc108) + %3 = arith.cmpi eq, %tmp7_weight_10, %c0_i32 : i32 loc(#loc23) + %4:3 = scf.if %3 -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) { + scf.yield %cst_2, %tmp4, %cst_3 : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc176) + } else { + %delta = arith.subf %tmp4, %arg10 : tensor<1x2048xf32, #blocked> loc(#loc155) + %new_weight = arith.addf %arg12, %cst_3 : tensor<1x2048xf32, #blocked> loc(#loc177) + %new_mean = arith.divf %delta, %new_weight : tensor<1x2048xf32, #blocked> loc(#loc157) + %new_mean_24 = arith.addf %arg10, %new_mean : tensor<1x2048xf32, #blocked> loc(#loc178) + %new_m2 = arith.subf %tmp4, %new_mean_24 : tensor<1x2048xf32, #blocked> loc(#loc159) + %new_m2_25 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32, #blocked> loc(#loc160) + %new_m2_26 = arith.addf %arg11, %new_m2_25 : tensor<1x2048xf32, #blocked> loc(#loc179) + scf.yield %new_m2_26, %new_mean_24, %new_weight : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc162) + } loc(#loc109) + %tmp7_mean = arith.select %tmp0_14, %4#1, %arg10 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc118) + %tmp7_m2 = arith.select %tmp0_14, %4#0, %arg11 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc119) + %tmp7_weight_23 = arith.select %tmp0_14, %4#2, %arg12 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc120) + %5 = tt.addptr %0, %tmp0_12 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc11) + %6 = arith.truncf %tmp4 : tensor<1x2048xf32, #blocked> to tensor<1x2048xbf16, #blocked> loc(#loc37) + tt.store %5, %6, %tmp0_14 : tensor<1x2048x!tt.ptr, #blocked> loc(#loc37) + scf.yield %tmp7_mean, %tmp7_m2, %tmp7_weight_23 : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc38) + } loc(#loc175) + %1:3 = "tt.reduce"(%tmp7_weight#0, %tmp7_weight#1, %tmp7_weight#2) <{axis = 1 : i32}> ({ + ^bb0(%arg9: f32 loc(callsite(#loc1 at #loc40)), %arg10: f32 loc(callsite(#loc1 at #loc40)), %arg11: f32 loc(callsite(#loc1 at #loc40)), %arg12: f32 loc(callsite(#loc1 at #loc40)), %arg13: f32 loc(callsite(#loc1 at #loc40)), %arg14: f32 loc(callsite(#loc1 at #loc40))): + %delta = arith.subf %arg12, %arg9 : f32 loc(#loc163) + %new_weight = arith.addf %arg11, %arg14 : f32 loc(#loc164) + %w2_over_w = arith.cmpf oeq, %new_weight, %cst_1 : f32 loc(#loc165) + %w2_over_w_10 = arith.divf %arg14, %new_weight : f32 loc(#loc166) + %w2_over_w_11 = arith.select %w2_over_w, %cst_1, %w2_over_w_10 : f32 loc(#loc167) + %3 = arith.mulf %delta, %w2_over_w_11 : f32 loc(#loc168) + %4 = arith.addf %arg9, %3 : f32 loc(#loc169) + %5 = arith.addf %arg10, %arg13 : f32 loc(#loc170) + %6 = arith.mulf %delta, %delta : f32 loc(#loc171) + %7 = arith.mulf %6, %arg11 : f32 loc(#loc172) + %8 = arith.mulf %7, %w2_over_w_11 : f32 loc(#loc173) + %9 = arith.addf %5, %8 : f32 loc(#loc174) + tt.reduce.return %4, %9, %new_weight : f32, f32, f32 loc(#loc121) + }) : (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc121) + %tmp7 = tt.expand_dims %1#0 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc128) + %tmp11 = tt.expand_dims %1#1 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc129) + %tmp23 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc130) + %tmp27 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc131) + %tmp15 = tt.broadcast %tmp7 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc132) + %tmp17 = arith.divf %tmp11, %cst_5 : tensor<1x1xf32, #blocked> loc(#loc133) + %tmp19 = arith.addf %tmp17, %cst_4 : tensor<1x1xf32, #blocked> loc(#loc134) + %tmp20 = tt.extern_elementwise %tmp19 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked>) -> tensor<1x1xf32, #blocked> loc(#loc135) + %tmp21 = tt.broadcast %tmp20 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc136) + %2 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc62) + scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32, #blocked> loc(#loc137) + %r0_index_10 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32, #blocked> loc(#loc137) + %r0_mask = arith.cmpi slt, %r0_index_10, %cst : tensor<1x2048xi32, #blocked> loc(#loc138) + %tmp13 = arith.addi %r0_index_10, %tmp0_7 : tensor<1x2048xi32, #blocked> loc(#loc139) + %tmp13_11 = tt.addptr %0, %tmp13 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc140) + %tmp13_12 = arith.andi %r0_mask, %tmp0_9 : tensor<1x2048xi1, #blocked> loc(#loc141) + %tmp13_13 = tt.load %tmp13_11, %tmp13_12, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc142) + %tmp13_14 = arith.extf %tmp13_13 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc143) + %tmp23_15 = tt.addptr %tmp23, %r0_index_10 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc130) + %tmp23_16 = tt.load %tmp23_15, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc144) + %tmp23_17 = arith.extf %tmp23_16 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc145) + %tmp27_18 = tt.addptr %tmp27, %r0_index_10 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc131) + %tmp27_19 = tt.load %tmp27_18, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc146) + %tmp27_20 = arith.extf %tmp27_19 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc147) + %tmp15_21 = arith.subf %tmp13_14, %tmp15 : tensor<1x2048xf32, #blocked> loc(#loc132) + %tmp21_22 = arith.mulf %tmp15_21, %tmp21 : tensor<1x2048xf32, #blocked> loc(#loc136) + %tmp25 = arith.addf %tmp23_17, %cst_3 : tensor<1x2048xf32, #blocked> loc(#loc148) + %tmp26 = arith.mulf %tmp21_22, %tmp25 : tensor<1x2048xf32, #blocked> loc(#loc149) + %tmp28 = arith.addf %tmp26, %tmp27_20 : tensor<1x2048xf32, #blocked> loc(#loc150) + %3 = tt.addptr %2, %tmp13 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc62) + %4 = arith.truncf %tmp28 : tensor<1x2048xf32, #blocked> to tensor<1x2048xbf16, #blocked> loc(#loc78) + tt.store %3, %4, %tmp13_12 : tensor<1x2048x!tt.ptr, #blocked> loc(#loc78) + } loc(#loc63) + tt.return loc(#loc79) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":25:21) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":26:37) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:46) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:41) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:34) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:61) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:34) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:34) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:29) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":32:43) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":33:31) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":34:29) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:51) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:113) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:41) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:94) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:51) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:113) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":41:22) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":42:22) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":46:62) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":46:51) +#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31) +#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24) +#loc28 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30) +#loc29 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34) +#loc30 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26) +#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39) +#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":48:62) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":49:58) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":50:66) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:52) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:8) +#loc39 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc41 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc42 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc43 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc44 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc45 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc46 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc47 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc48 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc49 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc50 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc51 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc52 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":53:16) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":54:17) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:35) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:35) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":66:24) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":68:25) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":70:24) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":71:32) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":72:24) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:29) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":56:43) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":57:31) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":58:29) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:43) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:36) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:63) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:53) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:115) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:42) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:95) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:42) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:95) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":75:24) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":76:24) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":77:24) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:53) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":56:4) +#loc89 = loc("xoffset"(#loc2)) +#loc90 = loc("xmask"(#loc3)) +#loc91 = loc("r0_base"(#loc4)) +#loc92 = loc("tmp0"(#loc5)) +#loc93 = loc("tmp0"(#loc6)) +#loc94 = loc("tmp0"(#loc7)) +#loc95 = loc("tmp0"(#loc8)) +#loc96 = loc("tmp1"(#loc9)) +#loc97 = loc("tmp2"(#loc10)) +#loc98 = loc("tmp7_mean"(#loc12)) +#loc99 = loc("r0_index"(#loc13)) +#loc100 = loc("r0_mask"(#loc14)) +#loc101 = loc("tmp0"(#loc15)) +#loc102 = loc("tmp0"(#loc16)) +#loc103 = loc("tmp1"(#loc17)) +#loc104 = loc("tmp1"(#loc18)) +#loc105 = loc("tmp2"(#loc19)) +#loc106 = loc("tmp2"(#loc20)) +#loc107 = loc("tmp3"(#loc21)) +#loc108 = loc("tmp4"(#loc22)) +#loc109 = loc(callsite(#loc24 at #loc25)) +#loc110 = loc("new_m2"(#loc26)) +#loc111 = loc("delta"(#loc27)) +#loc112 = loc("new_weight"(#loc28)) +#loc113 = loc("new_mean"(#loc29)) +#loc114 = loc("new_mean"(#loc30)) +#loc115 = loc("new_m2"(#loc31)) +#loc116 = loc("new_m2"(#loc32)) +#loc117 = loc("new_m2"(#loc33)) +#loc118 = loc("tmp7_mean"(#loc34)) +#loc119 = loc("tmp7_m2"(#loc35)) +#loc120 = loc("tmp7_weight"(#loc36)) +#loc121 = loc(callsite(#loc39 at #loc40)) +#loc123 = loc("delta"(#loc41)) +#loc124 = loc("new_weight"(#loc42)) +#loc125 = loc("w2_over_w"(#loc43)) +#loc126 = loc("w2_over_w"(#loc44)) +#loc127 = loc("w2_over_w"(#loc45)) +#loc128 = loc("tmp7"(#loc53)) +#loc129 = loc("tmp11"(#loc54)) +#loc130 = loc("tmp23"(#loc55)) +#loc131 = loc("tmp27"(#loc56)) +#loc132 = loc("tmp15"(#loc57)) +#loc133 = loc("tmp17"(#loc58)) +#loc134 = loc("tmp19"(#loc59)) +#loc135 = loc("tmp20"(#loc60)) +#loc136 = loc("tmp21"(#loc61)) +#loc137 = loc("r0_index"(#loc64)) +#loc138 = loc("r0_mask"(#loc65)) +#loc139 = loc("tmp13"(#loc66)) +#loc140 = loc("tmp13"(#loc67)) +#loc141 = loc("tmp13"(#loc68)) +#loc142 = loc("tmp13"(#loc69)) +#loc143 = loc("tmp13"(#loc70)) +#loc144 = loc("tmp23"(#loc71)) +#loc145 = loc("tmp23"(#loc72)) +#loc146 = loc("tmp27"(#loc73)) +#loc147 = loc("tmp27"(#loc74)) +#loc148 = loc("tmp25"(#loc75)) +#loc149 = loc("tmp26"(#loc76)) +#loc150 = loc("tmp28"(#loc77)) +#loc151 = loc(fused[#loc93, #loc92]) +#loc152 = loc(fused[#loc95, #loc90]) +#loc153 = loc("tmp7_m2"(#loc98)) +#loc154 = loc("new_m2"(#loc110)) +#loc155 = loc(callsite(#loc111 at #loc25)) +#loc156 = loc("new_weight"(#loc112)) +#loc157 = loc(callsite(#loc113 at #loc25)) +#loc158 = loc("new_mean"(#loc114)) +#loc159 = loc(callsite(#loc115 at #loc25)) +#loc160 = loc(callsite(#loc116 at #loc25)) +#loc161 = loc("new_m2"(#loc117)) +#loc162 = loc(callsite(#loc117 at #loc25)) +#loc163 = loc(callsite(#loc123 at #loc121)) +#loc164 = loc(callsite(#loc124 at #loc121)) +#loc165 = loc(callsite(#loc125 at #loc121)) +#loc166 = loc(callsite(#loc126 at #loc121)) +#loc167 = loc(callsite(#loc127 at #loc121)) +#loc168 = loc(callsite(#loc46 at #loc121)) +#loc169 = loc(callsite(#loc47 at #loc121)) +#loc170 = loc(callsite(#loc48 at #loc121)) +#loc171 = loc(callsite(#loc49 at #loc121)) +#loc172 = loc(callsite(#loc50 at #loc121)) +#loc173 = loc(callsite(#loc51 at #loc121)) +#loc174 = loc(callsite(#loc52 at #loc121)) +#loc175 = loc("tmp7_weight"(#loc153)) +#loc176 = loc(callsite(#loc154 at #loc25)) +#loc177 = loc(callsite(#loc156 at #loc25)) +#loc178 = loc(callsite(#loc158 at #loc25)) +#loc179 = loc(callsite(#loc161 at #loc25)) diff --git a/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.ttir b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..75f9aeef0da4f8198219a446a62130a5cb3727ec --- /dev/null +++ b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.ttir @@ -0,0 +1,304 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":52:80) +#loc82 = loc("in_ptr0"(#loc)) +#loc83 = loc("in_ptr1"(#loc)) +#loc84 = loc("in_ptr2"(#loc)) +#loc85 = loc("in_ptr3"(#loc)) +#loc86 = loc("in_ptr4"(#loc)) +#loc87 = loc("out_ptr0"(#loc)) +#loc88 = loc("out_ptr3"(#loc)) +#loc89 = loc("xnumel"(#loc)) +#loc90 = loc("r0_numel"(#loc)) +#loc91 = loc(callsite(#loc1 at #loc2)) +module { + tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc91) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16> loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %cst_2 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc1) + %cst_3 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc1) + %cst_4 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc1) + %cst_5 = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc92) + %xmask = arith.cmpi slt, %xoffset, %c2048_i32 : i32 loc(#loc93) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc94) + %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc95) + %tmp7_weight:3 = scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32 iter_args(%tmp7_mean = %cst_0, %tmp7_m2 = %cst_0, %tmp7_weight_7 = %cst_0) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc97) + %r0_index_8 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32> loc(#loc97) + %r0_mask = arith.cmpi slt, %r0_index_8, %cst_5 : tensor<1x2048xi32> loc(#loc98) + %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc99) + %tmp0_9 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32> loc(#loc156) + %tmp0_10 = arith.addi %r0_index_8, %tmp0_9 : tensor<1x2048xi32> loc(#loc100) + %tmp0_11 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc101) + %tmp0_12 = tt.addptr %tmp0_11, %tmp0_10 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc101) + %tmp0_13 = tt.splat %xmask : i1 -> tensor<1x2048xi1> loc(#loc157) + %tmp0_14 = arith.andi %r0_mask, %tmp0_13 : tensor<1x2048xi1> loc(#loc102) + %tmp0_15 = tt.load %tmp0_12, %tmp0_14, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc103) + %tmp0_16 = arith.extf %tmp0_15 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc104) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc105) + %tmp1_17 = tt.addptr %tmp1, %r0_index_8 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc105) + %tmp1_18 = tt.load %tmp1_17, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc106) + %tmp1_19 = arith.extf %tmp1_18 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc107) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc108) + %tmp2_20 = tt.addptr %tmp2, %tmp0_10 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc108) + %tmp2_21 = tt.load %tmp2_20, %tmp0_14, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc109) + %tmp2_22 = arith.extf %tmp2_21 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc110) + %tmp3 = arith.mulf %tmp1_19, %tmp2_22 : tensor<1x2048xf32> loc(#loc111) + %tmp4 = arith.addf %tmp0_16, %tmp3 : tensor<1x2048xf32> loc(#loc112) + %1 = arith.cmpi eq, %r0_offset, %c0_i32 : i32 loc(#loc24) + %2:3 = scf.if %1 -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) { + scf.yield %cst_0, %tmp4, %cst_2 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc182) + } else { + %delta = arith.subf %tmp4, %tmp7_mean : tensor<1x2048xf32> loc(#loc159) + %new_weight = arith.addf %tmp7_weight_7, %cst_2 : tensor<1x2048xf32> loc(#loc183) + %new_mean = arith.divf %delta, %new_weight : tensor<1x2048xf32> loc(#loc161) + %new_mean_26 = arith.addf %tmp7_mean, %new_mean : tensor<1x2048xf32> loc(#loc184) + %new_m2 = arith.subf %tmp4, %new_mean_26 : tensor<1x2048xf32> loc(#loc163) + %new_m2_27 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32> loc(#loc164) + %new_m2_28 = arith.addf %tmp7_m2, %new_m2_27 : tensor<1x2048xf32> loc(#loc185) + scf.yield %new_m2_28, %new_mean_26, %new_weight : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc166) + } loc(#loc113) + %tmp7_mean_23 = arith.select %tmp0_14, %2#1, %tmp7_mean : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc122) + %tmp7_m2_24 = arith.select %tmp0_14, %2#0, %tmp7_m2 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc123) + %tmp7_weight_25 = arith.select %tmp0_14, %2#2, %tmp7_weight_7 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc124) + %3 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc38) + %4 = tt.addptr %3, %tmp0_10 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc38) + %5 = arith.truncf %tmp4 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc39) + tt.store %4, %5, %tmp0_14 : tensor<1x2048x!tt.ptr> loc(#loc39) + scf.yield %tmp7_mean_23, %tmp7_m2_24, %tmp7_weight_25 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc40) + } loc(#loc181) + %0:3 = "tt.reduce"(%tmp7_weight#0, %tmp7_weight#1, %tmp7_weight#2) <{axis = 1 : i32}> ({ + ^bb0(%arg9: f32 loc(callsite(#loc1 at #loc2)), %arg10: f32 loc(callsite(#loc1 at #loc2)), %arg11: f32 loc(callsite(#loc1 at #loc2)), %arg12: f32 loc(callsite(#loc1 at #loc2)), %arg13: f32 loc(callsite(#loc1 at #loc2)), %arg14: f32 loc(callsite(#loc1 at #loc2))): + %delta = arith.subf %arg12, %arg9 : f32 loc(#loc167) + %new_weight = arith.addf %arg11, %arg14 : f32 loc(#loc168) + %w2_over_w = arith.cmpf oeq, %new_weight, %cst : f32 loc(#loc169) + %w2_over_w_7 = arith.divf %arg14, %new_weight : f32 loc(#loc170) + %w2_over_w_8 = arith.select %w2_over_w, %cst, %w2_over_w_7 : f32 loc(#loc171) + %1 = arith.mulf %delta, %w2_over_w_8 : f32 loc(#loc172) + %2 = arith.addf %arg9, %1 : f32 loc(#loc173) + %3 = arith.addf %arg10, %arg13 : f32 loc(#loc174) + %4 = arith.mulf %delta, %delta : f32 loc(#loc175) + %5 = arith.mulf %4, %arg11 : f32 loc(#loc176) + %6 = arith.mulf %5, %w2_over_w_8 : f32 loc(#loc177) + %7 = arith.addf %3, %6 : f32 loc(#loc178) + tt.reduce.return %2, %7, %new_weight : f32, f32, f32 loc(#loc125) + }) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc125) + %tmp7 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc131) + %tmp11 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc132) + scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc133) + %r0_index_7 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32> loc(#loc133) + %r0_mask = arith.cmpi slt, %r0_index_7, %cst_5 : tensor<1x2048xi32> loc(#loc134) + %tmp13 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc135) + %tmp13_8 = tt.splat %tmp13 : i32 -> tensor<1x2048xi32> loc(#loc179) + %tmp13_9 = arith.addi %r0_index_7, %tmp13_8 : tensor<1x2048xi32> loc(#loc136) + %tmp13_10 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc137) + %tmp13_11 = tt.addptr %tmp13_10, %tmp13_9 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc137) + %tmp13_12 = tt.splat %xmask : i1 -> tensor<1x2048xi1> loc(#loc180) + %tmp13_13 = arith.andi %r0_mask, %tmp13_12 : tensor<1x2048xi1> loc(#loc138) + %tmp13_14 = tt.load %tmp13_11, %tmp13_13, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc139) + %tmp13_15 = arith.extf %tmp13_14 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc140) + %tmp23 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc141) + %tmp23_16 = tt.addptr %tmp23, %r0_index_7 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc141) + %tmp23_17 = tt.load %tmp23_16, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc142) + %tmp23_18 = arith.extf %tmp23_17 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc143) + %tmp27 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc144) + %tmp27_19 = tt.addptr %tmp27, %r0_index_7 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc144) + %tmp27_20 = tt.load %tmp27_19, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc145) + %tmp27_21 = arith.extf %tmp27_20 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc146) + %tmp15 = tt.broadcast %tmp7 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc147) + %tmp15_22 = arith.subf %tmp13_15, %tmp15 : tensor<1x2048xf32> loc(#loc147) + %tmp17 = arith.divf %tmp11, %cst_4 : tensor<1x1xf32> loc(#loc148) + %tmp19 = arith.addf %tmp17, %cst_3 : tensor<1x1xf32> loc(#loc149) + %tmp20 = tt.extern_elementwise %tmp19 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc150) + %tmp21 = tt.broadcast %tmp20 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc151) + %tmp21_23 = arith.mulf %tmp15_22, %tmp21 : tensor<1x2048xf32> loc(#loc151) + %tmp25 = arith.addf %tmp23_18, %cst_2 : tensor<1x2048xf32> loc(#loc152) + %tmp26 = arith.mulf %tmp21_23, %tmp25 : tensor<1x2048xf32> loc(#loc153) + %tmp28 = arith.addf %tmp26, %tmp27_21 : tensor<1x2048xf32> loc(#loc154) + %1 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc79) + %2 = tt.addptr %1, %tmp13_9 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc79) + %3 = arith.truncf %tmp28 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc80) + tt.store %2, %3, %tmp13_13 : tensor<1x2048x!tt.ptr> loc(#loc80) + } loc(#loc56) + tt.return loc(#loc81) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":25:21) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":26:27) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":32:43) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":33:31) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":34:29) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:46) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:41) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:34) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:61) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:51) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:113) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:34) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:41) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:94) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:34) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:51) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:113) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":41:22) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":42:22) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":46:62) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":46:51) +#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31) +#loc28 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24) +#loc29 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30) +#loc30 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34) +#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26) +#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":48:62) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":49:58) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":50:66) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:29) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:52) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:8) +#loc41 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc42 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc43 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc44 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc45 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc46 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc47 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc48 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc49 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc50 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc51 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc52 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc53 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":53:16) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":54:17) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":56:43) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":57:31) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":58:29) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:48) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:43) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:36) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:63) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:53) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:115) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:35) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:42) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:95) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:35) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:42) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:95) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":66:24) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":68:25) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":70:24) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":71:32) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":72:24) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":75:24) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":76:24) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":77:24) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:29) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:53) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":56:4) +#loc92 = loc("xoffset"(#loc3)) +#loc93 = loc("xmask"(#loc4)) +#loc94 = loc("r0_base"(#loc5)) +#loc95 = loc("r0_base"(#loc6)) +#loc96 = loc("tmp7_mean"(#loc7)) +#loc97 = loc("r0_index"(#loc8)) +#loc98 = loc("r0_mask"(#loc9)) +#loc99 = loc("tmp0"(#loc10)) +#loc100 = loc("tmp0"(#loc11)) +#loc101 = loc("tmp0"(#loc12)) +#loc102 = loc("tmp0"(#loc13)) +#loc103 = loc("tmp0"(#loc14)) +#loc104 = loc("tmp0"(#loc15)) +#loc105 = loc("tmp1"(#loc16)) +#loc106 = loc("tmp1"(#loc17)) +#loc107 = loc("tmp1"(#loc18)) +#loc108 = loc("tmp2"(#loc19)) +#loc109 = loc("tmp2"(#loc20)) +#loc110 = loc("tmp2"(#loc21)) +#loc111 = loc("tmp3"(#loc22)) +#loc112 = loc("tmp4"(#loc23)) +#loc113 = loc(callsite(#loc25 at #loc26)) +#loc114 = loc("new_m2"(#loc27)) +#loc115 = loc("delta"(#loc28)) +#loc116 = loc("new_weight"(#loc29)) +#loc117 = loc("new_mean"(#loc30)) +#loc118 = loc("new_mean"(#loc31)) +#loc119 = loc("new_m2"(#loc32)) +#loc120 = loc("new_m2"(#loc33)) +#loc121 = loc("new_m2"(#loc34)) +#loc122 = loc("tmp7_mean"(#loc35)) +#loc123 = loc("tmp7_m2"(#loc36)) +#loc124 = loc("tmp7_weight"(#loc37)) +#loc125 = loc(callsite(#loc41 at #loc2)) +#loc126 = loc("delta"(#loc42)) +#loc127 = loc("new_weight"(#loc43)) +#loc128 = loc("w2_over_w"(#loc44)) +#loc129 = loc("w2_over_w"(#loc45)) +#loc130 = loc("w2_over_w"(#loc46)) +#loc131 = loc("tmp7"(#loc54)) +#loc132 = loc("tmp11"(#loc55)) +#loc133 = loc("r0_index"(#loc57)) +#loc134 = loc("r0_mask"(#loc58)) +#loc135 = loc("tmp13"(#loc59)) +#loc136 = loc("tmp13"(#loc60)) +#loc137 = loc("tmp13"(#loc61)) +#loc138 = loc("tmp13"(#loc62)) +#loc139 = loc("tmp13"(#loc63)) +#loc140 = loc("tmp13"(#loc64)) +#loc141 = loc("tmp23"(#loc65)) +#loc142 = loc("tmp23"(#loc66)) +#loc143 = loc("tmp23"(#loc67)) +#loc144 = loc("tmp27"(#loc68)) +#loc145 = loc("tmp27"(#loc69)) +#loc146 = loc("tmp27"(#loc70)) +#loc147 = loc("tmp15"(#loc71)) +#loc148 = loc("tmp17"(#loc72)) +#loc149 = loc("tmp19"(#loc73)) +#loc150 = loc("tmp20"(#loc74)) +#loc151 = loc("tmp21"(#loc75)) +#loc152 = loc("tmp25"(#loc76)) +#loc153 = loc("tmp26"(#loc77)) +#loc154 = loc("tmp28"(#loc78)) +#loc155 = loc("tmp7_m2"(#loc96)) +#loc156 = loc(fused[#loc100, #loc99]) +#loc157 = loc(fused[#loc102, #loc93]) +#loc158 = loc("new_m2"(#loc114)) +#loc159 = loc(callsite(#loc115 at #loc26)) +#loc160 = loc("new_weight"(#loc116)) +#loc161 = loc(callsite(#loc117 at #loc26)) +#loc162 = loc("new_mean"(#loc118)) +#loc163 = loc(callsite(#loc119 at #loc26)) +#loc164 = loc(callsite(#loc120 at #loc26)) +#loc165 = loc("new_m2"(#loc121)) +#loc166 = loc(callsite(#loc121 at #loc26)) +#loc167 = loc(callsite(#loc126 at #loc125)) +#loc168 = loc(callsite(#loc127 at #loc125)) +#loc169 = loc(callsite(#loc128 at #loc125)) +#loc170 = loc(callsite(#loc129 at #loc125)) +#loc171 = loc(callsite(#loc130 at #loc125)) +#loc172 = loc(callsite(#loc47 at #loc125)) +#loc173 = loc(callsite(#loc48 at #loc125)) +#loc174 = loc(callsite(#loc49 at #loc125)) +#loc175 = loc(callsite(#loc50 at #loc125)) +#loc176 = loc(callsite(#loc51 at #loc125)) +#loc177 = loc(callsite(#loc52 at #loc125)) +#loc178 = loc(callsite(#loc53 at #loc125)) +#loc179 = loc(fused[#loc136, #loc135]) +#loc180 = loc(fused[#loc138, #loc93]) +#loc181 = loc("tmp7_weight"(#loc155)) +#loc182 = loc(callsite(#loc158 at #loc26)) +#loc183 = loc(callsite(#loc160 at #loc26)) +#loc184 = loc(callsite(#loc162 at #loc26)) +#loc185 = loc(callsite(#loc165 at #loc26)) diff --git a/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/__grp__triton_red_fused__fused_rms_norm_view_0.json b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/__grp__triton_red_fused__fused_rms_norm_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..91001eafcf9376649341a7595586a1e1ba90dd58 --- /dev/null +++ b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/__grp__triton_red_fused__fused_rms_norm_view_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__fused_rms_norm_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.source", "triton_red_fused__fused_rms_norm_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.ttir", "triton_red_fused__fused_rms_norm_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.ttgir", "triton_red_fused__fused_rms_norm_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.llir", "triton_red_fused__fused_rms_norm_view_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.ptx", "triton_red_fused__fused_rms_norm_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.cubin", "triton_red_fused__fused_rms_norm_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.json"}} \ No newline at end of file diff --git a/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.cubin b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..74d4e7c86cbf21f7af7b1dedb57d9062a49f139f Binary files /dev/null and b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.cubin differ diff --git a/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.json b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3a1b71fcd088655b6bbe99a00b830e53c565fd0e --- /dev/null +++ b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.json @@ -0,0 +1 @@ +{"hash": "212d5b26ed6e7952d6909b530fb9c25e23f448d78cecb7f927aef2cef01d405a", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 32, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm_view_0"} \ No newline at end of file diff --git a/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.llir b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..085e37c7d4367276f4f4c95ae5d98ddd15eaa068 --- /dev/null +++ b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.llir @@ -0,0 +1,136 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__fused_rms_norm_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = shl i32 %7, 3, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = and i32 %9, 224, !dbg !9 + %11 = lshr exact i32 %10, 5, !dbg !9 + %12 = and i32 %9, 7, !dbg !9 + %13 = or disjoint i32 %11, %8, !dbg !10 + %14 = or disjoint i32 %8, %12, !dbg !10 + %15 = shl nuw nsw i32 %9, 2, !dbg !11 + %16 = and i32 %15, 124, !dbg !11 + %17 = sdiv i32 %13, 32, !dbg !12 + %18 = mul i32 %17, 32, !dbg !13 + %.decomposed = sub i32 %13, %18, !dbg !13 + %19 = shl nsw i32 %.decomposed, 7, !dbg !14 + %20 = or disjoint i32 %19, %16, !dbg !15 + %21 = mul i32 %17, 12288, !dbg !16 + %22 = add i32 %20, %21, !dbg !17 + %23 = sext i32 %22 to i64, !dbg !18 + %24 = getelementptr bfloat, ptr addrspace(1) %0, i64 %23, !dbg !18 + %25 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %26 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %24, i64 %25, i1 true) #4, !dbg !19 + %27 = extractvalue { i32, i32 } %26, 0, !dbg !19 + %28 = bitcast i32 %27 to <2 x bfloat>, !dbg !19 + %29 = extractvalue { i32, i32 } %26, 1, !dbg !19 + %30 = bitcast i32 %29 to <2 x bfloat>, !dbg !19 + %31 = extractelement <2 x bfloat> %28, i64 0, !dbg !19 + %32 = extractelement <2 x bfloat> %28, i64 1, !dbg !19 + %33 = extractelement <2 x bfloat> %30, i64 0, !dbg !19 + %34 = extractelement <2 x bfloat> %30, i64 1, !dbg !19 + %35 = fpext bfloat %31 to float, !dbg !20 + %36 = fpext bfloat %32 to float, !dbg !20 + %37 = fpext bfloat %33 to float, !dbg !20 + %38 = fpext bfloat %34 to float, !dbg !20 + %39 = fmul float %35, %35, !dbg !21 + %40 = fmul float %36, %36, !dbg !21 + %41 = fmul float %37, %37, !dbg !21 + %42 = fmul float %38, %38, !dbg !21 + %43 = fadd float %39, %40, !dbg !22 + %44 = fadd float %41, %43, !dbg !22 + %45 = fadd float %42, %44, !dbg !22 + %46 = bitcast float %45 to i32, !dbg !25 + %47 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %46, i32 16, i32 31), !dbg !25 + %48 = bitcast i32 %47 to float, !dbg !25 + %49 = fadd float %45, %48, !dbg !22 + %50 = bitcast float %49 to i32, !dbg !25 + %51 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %50, i32 8, i32 31), !dbg !25 + %52 = bitcast i32 %51 to float, !dbg !25 + %53 = fadd float %49, %52, !dbg !22 + %54 = bitcast float %53 to i32, !dbg !25 + %55 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %54, i32 4, i32 31), !dbg !25 + %56 = bitcast i32 %55 to float, !dbg !25 + %57 = fadd float %53, %56, !dbg !22 + %58 = bitcast float %57 to i32, !dbg !25 + %59 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %58, i32 2, i32 31), !dbg !25 + %60 = bitcast i32 %59 to float, !dbg !25 + %61 = fadd float %57, %60, !dbg !22 + %62 = bitcast float %61 to i32, !dbg !25 + %63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %62, i32 1, i32 31), !dbg !25 + %64 = bitcast i32 %63 to float, !dbg !25 + %65 = fadd float %61, %64, !dbg !22 + %66 = lshr exact i32 %10, 3, !dbg !28 + %67 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %66, !dbg !28 + store float %65, ptr addrspace(3) %67, align 4, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %68 = shl nuw nsw i32 %12, 2, !dbg !28 + %69 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %68, !dbg !28 + %70 = load i32, ptr addrspace(3) %69, align 4, !dbg !28 + %71 = sext i32 %14 to i64, !dbg !29 + %72 = getelementptr float, ptr addrspace(1) %1, i64 %71, !dbg !29 + %73 = and i32 %9, 248, !dbg !30 + %74 = icmp eq i32 %73, 0, !dbg !30 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %70, ptr addrspace(1) %72, i1 %74) #4, !dbg !30 + ret void, !dbg !31 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm_view_0", linkageName: "triton_red_fused__fused_rms_norm_view_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 19, scope: !4) +!13 = !DILocation(line: 28, column: 19, scope: !4) +!14 = !DILocation(line: 38, column: 45, scope: !4) +!15 = !DILocation(line: 38, column: 41, scope: !4) +!16 = !DILocation(line: 38, column: 56, scope: !4) +!17 = !DILocation(line: 38, column: 50, scope: !4) +!18 = !DILocation(line: 38, column: 34, scope: !4) +!19 = !DILocation(line: 38, column: 61, scope: !4) +!20 = !DILocation(line: 38, column: 115, scope: !4) +!21 = !DILocation(line: 40, column: 22, scope: !4) +!22 = !DILocation(line: 263, column: 15, scope: !23, inlinedAt: !25) +!23 = distinct !DILexicalBlockFile(scope: !4, file: !24, discriminator: 0) +!24 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!25 = !DILocation(line: 293, column: 36, scope: !23, inlinedAt: !26) +!26 = !DILocation(line: 44, column: 25, scope: !27) +!27 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0) +!28 = !DILocation(line: 44, column: 28, scope: !4) +!29 = !DILocation(line: 45, column: 25, scope: !4) +!30 = !DILocation(line: 45, column: 36, scope: !4) +!31 = !DILocation(line: 45, column: 4, scope: !4) diff --git a/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.ptx b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..0f09fb949704c6303aee5f1f135349deac362169 --- /dev/null +++ b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.ptx @@ -0,0 +1,506 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused__fused_rms_norm_view_0 // -- Begin function triton_red_fused__fused_rms_norm_view_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__fused_rms_norm_view_0 +.visible .entry triton_red_fused__fused_rms_norm_view_0( + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_1, + .param .u32 triton_red_fused__fused_rms_norm_view_0_param_2, + .param .u32 triton_red_fused__fused_rms_norm_view_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_5 +) +.reqntid 256 +{ + .reg .pred %p<3>; + .reg .b16 %rs<5>; + .reg .b32 %r<48>; + .reg .b64 %rd<6>; + .loc 1 18 0 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:18:0 + +// %bb.0: + ld.param.b64 %rd4, [triton_red_fused__fused_rms_norm_view_0_param_0]; + ld.param.b64 %rd5, [triton_red_fused__fused_rms_norm_view_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:23:28 + mov.u32 %r5, %ctaid.x; + .loc 1 23 33 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:23:33 + shl.b32 %r6, %r5, 3; + .loc 1 24 44 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:44 + mov.u32 %r7, %tid.x; + and.b32 %r8, %r7, 224; + bfe.u32 %r9, %r7, 5, 3; + and.b32 %r10, %r7, 7; + .loc 1 24 23 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:23 + or.b32 %r11, %r9, %r6; + or.b32 %r12, %r6, %r10; + .loc 1 26 37 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:26:37 + shl.b32 %r13, %r7, 2; + and.b32 %r14, %r13, 124; + .loc 1 29 19 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:29:19 + bfe.s32 %r15, %r5, 28, 1; + shr.u32 %r16, %r15, 27; + add.s32 %r17, %r11, %r16; + shr.u32 %r18, %r17, 5; + .loc 1 28 19 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:28:19 + and.b32 %r19, %r17, 33554400; + sub.s32 %r20, %r11, %r19; + .loc 1 38 45 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:45 + shl.b32 %r21, %r20, 7; + .loc 1 38 41 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:41 + or.b32 %r22, %r21, %r14; + .loc 1 38 50 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:50 + mad.lo.s32 %r23, %r18, 12288, %r22; + .loc 1 38 34 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:34 + mad.wide.s32 %rd1, %r23, 2, %rd4; + .loc 1 38 61 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:61 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r3, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r1, %r3; + mov.u32 %r2, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2; + // end inline asm + mov.b32 {%rs1, %rs2}, %r1; + mov.b32 {%rs3, %rs4}, %r2; + .loc 1 38 115 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115 + cvt.f32.bf16 %r24, %rs1; + cvt.f32.bf16 %r25, %rs2; + cvt.f32.bf16 %r26, %rs3; + cvt.f32.bf16 %r27, %rs4; + .loc 1 40 22 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:40:22 + mul.f32 %r28, %r25, %r25; +$L__tmp1: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + fma.rn.f32 %r29, %r24, %r24, %r28; + fma.rn.f32 %r30, %r26, %r26, %r29; + fma.rn.f32 %r31, %r27, %r27, %r30; +$L__tmp2: + .loc 2 293 36 // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] + shfl.sync.bfly.b32 %r32, %r31, 16, 31, -1; +$L__tmp3: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + add.f32 %r33, %r31, %r32; +$L__tmp4: + .loc 2 293 36 // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] + shfl.sync.bfly.b32 %r34, %r33, 8, 31, -1; +$L__tmp5: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + add.f32 %r35, %r33, %r34; +$L__tmp6: + .loc 2 293 36 // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] + shfl.sync.bfly.b32 %r36, %r35, 4, 31, -1; +$L__tmp7: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + add.f32 %r37, %r35, %r36; +$L__tmp8: + .loc 2 293 36 // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] + shfl.sync.bfly.b32 %r38, %r37, 2, 31, -1; +$L__tmp9: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + add.f32 %r39, %r37, %r38; +$L__tmp10: + .loc 2 293 36 // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] + shfl.sync.bfly.b32 %r40, %r39, 1, 31, -1; +$L__tmp11: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + add.f32 %r41, %r39, %r40; +$L__tmp12: + .loc 1 44 28 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:28 + shr.u32 %r42, %r8, 3; + mov.b32 %r43, global_smem; + add.s32 %r44, %r43, %r42; + st.shared.b32 [%r44], %r41; + bar.sync 0; + shl.b32 %r45, %r10, 2; + add.s32 %r46, %r43, %r45; + ld.shared.b32 %r4, [%r46]; + .loc 1 45 25 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:25 + mad.wide.s32 %rd3, %r12, 4, %rd5; + .loc 1 45 36 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:36 + and.b32 %r47, %r7, 248; + setp.eq.b32 %p2, %r47, 0; + // begin inline asm + @%p2 st.global.b32 [ %rd3 + 0 ], { %r4 }; + // end inline asm + .loc 1 45 4 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:4 + ret; +$L__tmp13: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 339 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x14c DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 119 +.b8 118 +.b8 121 +.b8 116 +.b8 52 +.b8 50 +.b8 55 +.b8 51 +.b8 105 +.b8 117 +.b8 51 +.b8 51 +.b8 109 +.b8 112 +.b8 101 +.b8 101 +.b8 55 +.b8 104 +.b8 98 +.b8 101 +.b8 116 +.b8 53 +.b8 106 +.b8 53 +.b8 101 +.b8 113 +.b8 52 +.b8 52 +.b8 100 +.b8 54 +.b8 102 +.b8 115 +.b8 104 +.b8 103 +.b8 119 +.b8 107 +.b8 121 +.b8 120 +.b8 107 +.b8 110 +.b8 53 +.b8 50 +.b8 103 +.b8 103 +.b8 103 +.b8 107 +.b8 105 +.b8 113 +.b8 104 +.b8 106 +.b8 53 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 119 +.b8 118 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x2a DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 114 +.b8 109 +.b8 115 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x10e:0x48 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x123:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 44 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x13b:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.source b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.source new file mode 100644 index 0000000000000000000000000000000000000000..763d5ae6c4f5ff5e6eb3a9630f6cd5ec5350cbb9 --- /dev/null +++ b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.source @@ -0,0 +1,167 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0) +#loc35 = loc(unknown) +#loc38 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0) +#loc42 = loc("in_ptr0"(#loc)) +#loc43 = loc("out_ptr0"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc74 = loc("input"(#loc33)) +#loc75 = loc("a"(#loc38)) +#loc76 = loc("b"(#loc38)) +module { + tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 8192 : i32 loc(#loc46) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc47) + %xoffset = tt.get_program_id x : i32 loc(#loc48) + %xoffset_2 = arith.constant 8 : i32 loc(#loc49) + %xoffset_3 = arith.constant 8 : i32 loc(#loc49) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc49) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc50) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc51) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<8x1xi32> loc(#loc52) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<8x1xi32> loc(#loc52) + %xmask = arith.constant true loc(#loc53) + %xmask_8 = arith.constant dense : tensor<8x128xi1> loc(#loc53) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc54) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc55) + %x0 = arith.constant 32 : i32 loc(#loc56) + %x0_10 = arith.constant 32 : i32 loc(#loc56) + %x0_11 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc56) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<8x1xi32> loc(#loc56) + %x1 = arith.constant 32 : i32 loc(#loc57) + %x1_13 = arith.constant 32 : i32 loc(#loc57) + %x1_14 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc57) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<8x1xi32> loc(#loc57) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc58) + %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc58) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c128_i32 = arith.constant 128 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_19 = %_tmp4_16) -> (tensor<8x128xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc60) + %r0_index_20 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc60) + %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc61) + %r0_mask_21 = arith.cmpi slt, %r0_index_20, %r0_mask : tensor<1x128xi32> loc(#loc61) + %tmp0 = arith.constant 128 : i32 loc(#loc62) + %tmp0_22 = arith.constant 128 : i32 loc(#loc62) + %tmp0_23 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc62) + %tmp0_24 = arith.muli %tmp0_23, %x0_12 : tensor<8x1xi32> loc(#loc62) + %tmp0_25 = tt.broadcast %r0_index_20 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc63) + %tmp0_26 = tt.broadcast %tmp0_24 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc63) + %tmp0_27 = arith.addi %tmp0_25, %tmp0_26 : tensor<8x128xi32> loc(#loc63) + %tmp0_28 = arith.constant 12288 : i32 loc(#loc64) + %tmp0_29 = arith.constant 12288 : i32 loc(#loc64) + %tmp0_30 = arith.constant dense<12288> : tensor<8x1xi32> loc(#loc64) + %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<8x1xi32> loc(#loc64) + %tmp0_32 = tt.broadcast %tmp0_31 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc65) + %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<8x128xi32> loc(#loc65) + %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc66) + %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc66) + %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc67) + %tmp0_37 = tt.broadcast %r0_mask_21 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc67) + %tmp0_38 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc67) + %tmp0_39 = arith.truncf %tmp0_38 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc67) + %tmp0_40 = tt.load %tmp0_35, %tmp0_37, %tmp0_39 evictionPolicy = evict_first : tensor<8x128x!tt.ptr> loc(#loc67) + %tmp0_41 = arith.extf %tmp0_40 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc68) + %tmp2 = arith.mulf %tmp0_41, %tmp0_41 : tensor<8x128xf32> loc(#loc69) + %tmp5 = arith.addf %_tmp4_19, %tmp2 : tensor<8x128xf32> loc(#loc70) + %_tmp4_42 = tt.broadcast %r0_mask_21 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc71) + %_tmp4_43 = arith.select %_tmp4_42, %tmp5, %_tmp4_19 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc71) + scf.yield %_tmp4_43 : tensor<8x128xf32> loc(#loc27) + } loc(#loc59) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S8_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc72) + %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc73) + %4 = tt.splat %out_ptr0 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc30) + %5 = tt.addptr %4, %xindex_7 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc30) + tt.store %5, %tmp4_18 : tensor<8x1x!tt.ptr> loc(#loc31) + tt.return loc(#loc32) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S8_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x128xf32> loc("input"(#loc33))) -> tensor<8xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc34) + tt.reduce.return %2 : f32 loc(#loc34) + }) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc34) + tt.return %0 : tensor<8xf32> loc(#loc36) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8xf32> loc(#loc37) + tt.return %1 : tensor<8xf32> loc(#loc37) + } loc(#loc33) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc38)), %b: f32 loc("b"(#loc38))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc39) + tt.return %0 : f32 loc(#loc40) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc41) + tt.return %1 : f32 loc(#loc41) + } loc(#loc38) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":25:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":30:43) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:8) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11) +#loc37 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4) +#loc39 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc40 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11) +#loc41 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4) +#loc46 = loc("xnumel"(#loc1)) +#loc47 = loc("r0_numel"(#loc2)) +#loc48 = loc("xoffset"(#loc3)) +#loc49 = loc("xoffset"(#loc4)) +#loc50 = loc("xindex"(#loc5)) +#loc51 = loc("xindex"(#loc6)) +#loc52 = loc("xindex"(#loc7)) +#loc53 = loc("xmask"(#loc8)) +#loc54 = loc("r0_base"(#loc9)) +#loc55 = loc("r0_base"(#loc10)) +#loc56 = loc("x0"(#loc11)) +#loc57 = loc("x1"(#loc12)) +#loc58 = loc("_tmp4"(#loc13)) +#loc59 = loc("_tmp4"(#loc14)) +#loc60 = loc("r0_index"(#loc15)) +#loc61 = loc("r0_mask"(#loc16)) +#loc62 = loc("tmp0"(#loc17)) +#loc63 = loc("tmp0"(#loc18)) +#loc64 = loc("tmp0"(#loc19)) +#loc65 = loc("tmp0"(#loc20)) +#loc66 = loc("tmp0"(#loc21)) +#loc67 = loc("tmp0"(#loc22)) +#loc68 = loc("tmp0"(#loc23)) +#loc69 = loc("tmp2"(#loc24)) +#loc70 = loc("tmp5"(#loc25)) +#loc71 = loc("_tmp4"(#loc26)) +#loc72 = loc("tmp4"(#loc28)) +#loc73 = loc("tmp4"(#loc29)) diff --git a/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.ttgir b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..64fd00f3aff1339d633f51873800f3b13fdacf1f --- /dev/null +++ b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.ttgir @@ -0,0 +1,108 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 8], order = [0, 1]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0) +#loc1 = loc(unknown) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25) +#loc27 = loc("in_ptr0"(#loc)) +#loc28 = loc("out_ptr0"(#loc)) +#loc29 = loc("xnumel"(#loc)) +#loc30 = loc("r0_numel"(#loc)) +#loc49 = loc("tmp4"(#loc21)) +#loc52 = loc(callsite(#loc1 at #loc49)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<12288> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<32> : tensor<8x1xi32, #blocked> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<8x128xbf16, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<8x128xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc31) + %xoffset_5 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc32) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc33) + %xindex_6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc33) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc33) + %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xi32, #blocked1> loc(#loc33) + %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<8x1xi32, #blocked> loc(#loc34) + %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<8x1xi32, #blocked1> loc(#loc34) + %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<8x1xi32, #blocked> loc(#loc34) + %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<8x1xi32, #blocked1> loc(#loc34) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc35) + %r0_base_13 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc35) + %x0 = arith.remsi %xindex_11, %cst_2 : tensor<8x1xi32, #blocked> loc(#loc36) + %x1 = arith.divsi %xindex_11, %cst_2 : tensor<8x1xi32, #blocked> loc(#loc37) + %r0_mask = arith.cmpi slt, %r0_base_13, %cst : tensor<1x128xi32, #blocked> loc(#loc38) + %tmp0 = arith.muli %x0, %cst_0 : tensor<8x1xi32, #blocked> loc(#loc39) + %tmp0_14 = tt.broadcast %r0_base_13 : tensor<1x128xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc40) + %tmp0_15 = tt.broadcast %tmp0 : tensor<8x1xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc40) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<8x128xi32, #blocked> loc(#loc40) + %tmp0_17 = arith.muli %x1, %cst_1 : tensor<8x1xi32, #blocked> loc(#loc41) + %tmp0_18 = tt.broadcast %tmp0_17 : tensor<8x1xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc42) + %tmp0_19 = arith.addi %tmp0_16, %tmp0_18 : tensor<8x128xi32, #blocked> loc(#loc42) + %tmp0_20 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr, #blocked> loc(#loc43) + %tmp0_21 = tt.addptr %tmp0_20, %tmp0_19 : tensor<8x128x!tt.ptr, #blocked>, tensor<8x128xi32, #blocked> loc(#loc43) + %tmp0_22 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked> -> tensor<8x128xi1, #blocked> loc(#loc44) + %tmp0_23 = tt.load %tmp0_21, %tmp0_22, %cst_3 evictionPolicy = evict_first : tensor<8x128x!tt.ptr, #blocked> loc(#loc44) + %tmp0_24 = arith.extf %tmp0_23 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc45) + %tmp2 = arith.mulf %tmp0_24, %tmp0_24 : tensor<8x128xf32, #blocked> loc(#loc46) + %tmp5 = arith.addf %tmp2, %cst_4 : tensor<8x128xf32, #blocked> loc(#loc47) + %_tmp4 = arith.select %tmp0_22, %tmp5, %cst_4 : tensor<8x128xi1, #blocked>, tensor<8x128xf32, #blocked> loc(#loc48) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_27: f32 loc(callsite(#loc1 at #loc49)), %tmp4_28: f32 loc(callsite(#loc1 at #loc49))): + %tmp4_29 = arith.addf %tmp4_27, %tmp4_28 : f32 loc(#loc53) + tt.reduce.return %tmp4_29 : f32 loc(#loc51) + }) : (tensor<8x128xf32, #blocked>) -> tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc51) + %tmp4_25 = ttg.convert_layout %tmp4 : tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc50) + %tmp4_26 = tt.expand_dims %tmp4_25 {axis = 1 : i32} : tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xf32, #blocked1> loc(#loc50) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<8x1x!tt.ptr, #blocked1> loc(#loc24) + %1 = tt.addptr %0, %xindex_12 : tensor<8x1x!tt.ptr, #blocked1>, tensor<8x1xi32, #blocked1> loc(#loc24) + tt.store %1, %tmp4_26 : tensor<8x1x!tt.ptr, #blocked1> loc(#loc25) + tt.return loc(#loc26) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4) +#loc31 = loc("xoffset"(#loc2)) +#loc32 = loc("xoffset"(#loc3)) +#loc33 = loc("xindex"(#loc4)) +#loc34 = loc("xindex"(#loc5)) +#loc35 = loc("r0_base"(#loc6)) +#loc36 = loc("x0"(#loc7)) +#loc37 = loc("x1"(#loc8)) +#loc38 = loc("r0_mask"(#loc9)) +#loc39 = loc("tmp0"(#loc10)) +#loc40 = loc("tmp0"(#loc11)) +#loc41 = loc("tmp0"(#loc12)) +#loc42 = loc("tmp0"(#loc13)) +#loc43 = loc("tmp0"(#loc14)) +#loc44 = loc("tmp0"(#loc15)) +#loc45 = loc("tmp0"(#loc16)) +#loc46 = loc("tmp2"(#loc17)) +#loc47 = loc("tmp5"(#loc18)) +#loc48 = loc("_tmp4"(#loc19)) +#loc50 = loc("tmp4"(#loc23)) +#loc51 = loc(callsite(#loc20 at #loc49)) +#loc53 = loc(callsite(#loc22 at #loc51)) diff --git a/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.ttir b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..c61dade683e1a51730becab8945fdb2e97b38e3d --- /dev/null +++ b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.ttir @@ -0,0 +1,105 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0) +#loc2 = loc(unknown) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25) +#loc29 = loc("in_ptr0"(#loc)) +#loc30 = loc("out_ptr0"(#loc)) +#loc31 = loc("xnumel"(#loc)) +#loc32 = loc("r0_numel"(#loc)) +#loc53 = loc("tmp4"(#loc23)) +#loc56 = loc(callsite(#loc2 at #loc53)) +module { + tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %tmp0 = arith.constant dense<0.000000e+00> : tensor<8x128xbf16> loc(#loc33) + %cst = arith.constant dense<12288> : tensor<8x1xi32> loc(#loc2) + %cst_0 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc2) + %cst_1 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc2) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc2) + %cst_3 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc2) + %c8_i32 = arith.constant 8 : i32 loc(#loc2) + %xoffset = tt.get_program_id x : i32 loc(#loc34) + %xoffset_4 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc35) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc36) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc37) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<8x1xi32> loc(#loc38) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<8x1xi32> loc(#loc38) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc39) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc40) + %x0 = arith.remsi %xindex_7, %cst_3 : tensor<8x1xi32> loc(#loc41) + %x1 = arith.divsi %xindex_7, %cst_3 : tensor<8x1xi32> loc(#loc42) + %r0_mask = arith.cmpi slt, %r0_base_8, %cst_1 : tensor<1x128xi32> loc(#loc43) + %tmp0_9 = arith.muli %x0, %cst_0 : tensor<8x1xi32> loc(#loc44) + %tmp0_10 = tt.broadcast %r0_base_8 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc45) + %tmp0_11 = tt.broadcast %tmp0_9 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc45) + %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<8x128xi32> loc(#loc45) + %tmp0_13 = arith.muli %x1, %cst : tensor<8x1xi32> loc(#loc46) + %tmp0_14 = tt.broadcast %tmp0_13 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc47) + %tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<8x128xi32> loc(#loc47) + %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc48) + %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc48) + %tmp0_18 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc33) + %tmp0_19 = tt.load %tmp0_17, %tmp0_18, %tmp0 evictionPolicy = evict_first : tensor<8x128x!tt.ptr> loc(#loc33) + %tmp0_20 = arith.extf %tmp0_19 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc49) + %tmp2 = arith.mulf %tmp0_20, %tmp0_20 : tensor<8x128xf32> loc(#loc50) + %tmp5 = arith.addf %tmp2, %cst_2 : tensor<8x128xf32> loc(#loc51) + %_tmp4 = arith.select %tmp0_18, %tmp5, %cst_2 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc52) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_22: f32 loc(callsite(#loc2 at #loc53)), %tmp4_23: f32 loc(callsite(#loc2 at #loc53))): + %tmp4_24 = arith.addf %tmp4_22, %tmp4_23 : f32 loc(#loc57) + tt.reduce.return %tmp4_24 : f32 loc(#loc55) + }) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc55) + %tmp4_21 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc54) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc26) + %1 = tt.addptr %0, %xindex_7 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc26) + tt.store %1, %tmp4_21 : tensor<8x1x!tt.ptr> loc(#loc27) + tt.return loc(#loc28) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:27) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4) +#loc33 = loc("tmp0"(#loc1)) +#loc34 = loc("xoffset"(#loc3)) +#loc35 = loc("xoffset"(#loc4)) +#loc36 = loc("xindex"(#loc5)) +#loc37 = loc("xindex"(#loc6)) +#loc38 = loc("xindex"(#loc7)) +#loc39 = loc("r0_base"(#loc8)) +#loc40 = loc("r0_base"(#loc9)) +#loc41 = loc("x0"(#loc10)) +#loc42 = loc("x1"(#loc11)) +#loc43 = loc("r0_mask"(#loc12)) +#loc44 = loc("tmp0"(#loc13)) +#loc45 = loc("tmp0"(#loc14)) +#loc46 = loc("tmp0"(#loc15)) +#loc47 = loc("tmp0"(#loc16)) +#loc48 = loc("tmp0"(#loc17)) +#loc49 = loc("tmp0"(#loc18)) +#loc50 = loc("tmp2"(#loc19)) +#loc51 = loc("tmp5"(#loc20)) +#loc52 = loc("_tmp4"(#loc21)) +#loc54 = loc("tmp4"(#loc25)) +#loc55 = loc(callsite(#loc22 at #loc53)) +#loc57 = loc(callsite(#loc24 at #loc55)) diff --git a/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/__grp__triton_red_fused__fused_rms_norm_view_0.json b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/__grp__triton_red_fused__fused_rms_norm_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..967fbaa1dc2598bf06b75b858b5f1530289d91fb --- /dev/null +++ b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/__grp__triton_red_fused__fused_rms_norm_view_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__fused_rms_norm_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.source", "triton_red_fused__fused_rms_norm_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.ttir", "triton_red_fused__fused_rms_norm_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.ttgir", "triton_red_fused__fused_rms_norm_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.llir", "triton_red_fused__fused_rms_norm_view_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.ptx", "triton_red_fused__fused_rms_norm_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.cubin", "triton_red_fused__fused_rms_norm_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.json"}} \ No newline at end of file diff --git a/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.cubin b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..52f4901b21e9858eb716de05eed9fd4a59e501bf Binary files /dev/null and b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.cubin differ diff --git a/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.json b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a5c6a53ab457bcb3054f9470bfa06e4d66fdccec --- /dev/null +++ b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.json @@ -0,0 +1 @@ +{"hash": "236e50030a823b004916470bd0e913f39ff5fabe5609d223e91f95f6f6c36bfb", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm_view_0"} \ No newline at end of file diff --git a/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.llir b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..cb65d38c3937385c4545720e6794374b243fbb2f --- /dev/null +++ b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.llir @@ -0,0 +1,120 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__fused_rms_norm_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = shl i32 %7, 6, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = and i32 %9, 252, !dbg !9 + %11 = lshr exact i32 %10, 2, !dbg !9 + %12 = or disjoint i32 %11, %8, !dbg !10 + %13 = and i32 %9, 3, !dbg !11 + %14 = sdiv i32 %12, 32, !dbg !12 + %15 = mul i32 %14, 32, !dbg !13 + %.decomposed = sub i32 %12, %15, !dbg !13 + %16 = shl nsw i32 %.decomposed, 7, !dbg !14 + %17 = mul i32 %14, 12288, !dbg !15 + %18 = or disjoint i32 %16, %13 + %19 = add i32 %18, %17 + br label %20, !dbg !16 + +20: ; preds = %6, %20 + %indvars.iv = phi i64 [ 0, %6 ], [ %indvars.iv.next, %20 ] + %21 = phi float [ 0.000000e+00, %6 ], [ %31, %20 ] + %22 = trunc nuw nsw i64 %indvars.iv to i32, !dbg !17 + %23 = add i32 %19, %22, !dbg !17 + %24 = sext i32 %23 to i64, !dbg !18 + %25 = getelementptr bfloat, ptr addrspace(1) %0, i64 %24, !dbg !18 + %26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %27 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %25, i64 %26, i1 true) #4, !dbg !19 + %28 = bitcast i16 %27 to bfloat, !dbg !19 + %29 = fpext bfloat %28 to float, !dbg !20 + %30 = fmul float %29, %29, !dbg !21 + %31 = fadd float %21, %30, !dbg !22 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4, !dbg !16 + %32 = icmp samesign ult i64 %indvars.iv, 124, !dbg !16 + br i1 %32, label %20, label %33, !dbg !16 + +33: ; preds = %20 + %34 = and i32 %9, 63, !dbg !9 + %35 = or disjoint i32 %8, %34, !dbg !10 + %36 = bitcast float %31 to i32, !dbg !23 + %37 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %36, i32 2, i32 31), !dbg !23 + %38 = bitcast i32 %37 to float, !dbg !23 + %39 = fadd float %31, %38, !dbg !28 + %40 = bitcast float %39 to i32, !dbg !23 + %41 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %40, i32 1, i32 31), !dbg !23 + %42 = bitcast i32 %41 to float, !dbg !23 + %43 = fadd float %39, %42, !dbg !28 + %44 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %10, !dbg !29 + store float %43, ptr addrspace(3) %44, align 4, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !29 + %45 = shl nuw nsw i32 %34, 2, !dbg !29 + %46 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %45, !dbg !29 + %47 = load i32, ptr addrspace(3) %46, align 4, !dbg !29 + %48 = sext i32 %35 to i64, !dbg !30 + %49 = getelementptr float, ptr addrspace(1) %1, i64 %48, !dbg !30 + %50 = and i32 %9, 192, !dbg !31 + %51 = icmp eq i32 %50, 0, !dbg !31 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %47, ptr addrspace(1) %49, i1 %51) #4, !dbg !31 + ret void, !dbg !32 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm_view_0", linkageName: "triton_red_fused__fused_rms_norm_view_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 19, scope: !4) +!13 = !DILocation(line: 28, column: 19, scope: !4) +!14 = !DILocation(line: 38, column: 45, scope: !4) +!15 = !DILocation(line: 38, column: 56, scope: !4) +!16 = !DILocation(line: 32, column: 43, scope: !4) +!17 = !DILocation(line: 38, column: 50, scope: !4) +!18 = !DILocation(line: 38, column: 34, scope: !4) +!19 = !DILocation(line: 38, column: 61, scope: !4) +!20 = !DILocation(line: 38, column: 115, scope: !4) +!21 = !DILocation(line: 40, column: 22, scope: !4) +!22 = !DILocation(line: 42, column: 23, scope: !4) +!23 = !DILocation(line: 293, column: 36, scope: !24, inlinedAt: !26) +!24 = distinct !DILexicalBlockFile(scope: !4, file: !25, discriminator: 0) +!25 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!26 = !DILocation(line: 44, column: 25, scope: !27) +!27 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0) +!28 = !DILocation(line: 263, column: 15, scope: !24, inlinedAt: !23) +!29 = !DILocation(line: 44, column: 28, scope: !4) +!30 = !DILocation(line: 45, column: 25, scope: !4) +!31 = !DILocation(line: 45, column: 36, scope: !4) +!32 = !DILocation(line: 45, column: 4, scope: !4) diff --git a/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.ptx b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..482078abac6ee361f4f260d598f864b94dac4c30 --- /dev/null +++ b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.ptx @@ -0,0 +1,486 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused__fused_rms_norm_view_0 // -- Begin function triton_red_fused__fused_rms_norm_view_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__fused_rms_norm_view_0 +.visible .entry triton_red_fused__fused_rms_norm_view_0( + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_1, + .param .u32 triton_red_fused__fused_rms_norm_view_0_param_2, + .param .u32 triton_red_fused__fused_rms_norm_view_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_5 +) +.reqntid 256 +{ + .reg .pred %p<4>; + .reg .b16 %rs<3>; + .reg .b32 %r<33>; + .reg .b64 %rd<9>; + .loc 1 18 0 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:18:0 + +// %bb.0: + ld.param.b64 %rd3, [triton_red_fused__fused_rms_norm_view_0_param_1]; + ld.param.b64 %rd2, [triton_red_fused__fused_rms_norm_view_0_param_0]; +$L__tmp0: + .loc 1 23 28 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:23:28 + mov.u32 %r4, %ctaid.x; + .loc 1 23 33 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:23:33 + shl.b32 %r1, %r4, 6; + .loc 1 24 44 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:44 + mov.u32 %r2, %tid.x; + and.b32 %r3, %r2, 252; + bfe.u32 %r5, %r2, 2, 6; + .loc 1 24 23 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:23 + or.b32 %r6, %r5, %r1; + .loc 1 26 37 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:26:37 + and.b32 %r7, %r2, 3; + .loc 1 29 19 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:29:19 + bfe.s32 %r8, %r4, 25, 1; + shr.u32 %r9, %r8, 27; + add.s32 %r10, %r6, %r9; + shr.u32 %r11, %r10, 5; + .loc 1 32 43 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:32:43 + add.s32 %r12, %r4, %r11; + shl.b32 %r13, %r12, 13; + shl.b32 %r14, %r5, 7; + or.b32 %r15, %r13, %r14; + or.b32 %r16, %r15, %r7; + cvt.u64.u32 %rd1, %r16; + mov.b32 %r32, 0f00000000; + mov.b64 %rd8, -4; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 38 34 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:34 + add.s64 %rd6, %rd1, %rd8; + cvt.u32.u64 %r17, %rd6; + add.s32 %r18, %r17, 4; + mad.wide.s32 %rd5, %r18, 2, %rd2; + .loc 1 38 61 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:61 + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd4, 1.0; + // end inline asm + mov.b16 %rs2, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u16 %rs1, %rs2; + @%p1 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs1 }, [ %rd5 + 0 ], %rd4; + // end inline asm + .loc 1 38 115 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115 + cvt.f32.bf16 %r19, %rs1; + .loc 1 42 23 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:42:23 + fma.rn.f32 %r32, %r19, %r19, %r32; + .loc 1 32 43 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:32:43 + add.s64 %rd8, %rd8, 4; + setp.lt.u64 %p2, %rd8, 124; + @%p2 bra $L__BB0_1; +// %bb.2: + .loc 1 24 44 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:44 + and.b32 %r21, %r2, 63; + .loc 1 24 23 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:23 + or.b32 %r22, %r1, %r21; +$L__tmp1: + .loc 2 293 36 // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] + shfl.sync.bfly.b32 %r23, %r32, 2, 31, -1; +$L__tmp2: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + add.f32 %r24, %r32, %r23; +$L__tmp3: + .loc 2 293 36 // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] + shfl.sync.bfly.b32 %r25, %r24, 1, 31, -1; +$L__tmp4: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + add.f32 %r26, %r24, %r25; +$L__tmp5: + .loc 1 44 28 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:28 + mov.b32 %r27, global_smem; + add.s32 %r28, %r27, %r3; + st.shared.b32 [%r28], %r26; + bar.sync 0; + shl.b32 %r29, %r21, 2; + add.s32 %r30, %r27, %r29; + ld.shared.b32 %r20, [%r30]; + .loc 1 45 25 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:25 + mad.wide.s32 %rd7, %r22, 4, %rd3; + .loc 1 45 36 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:36 + and.b32 %r31, %r2, 192; + setp.eq.b32 %p3, %r31, 0; + // begin inline asm + @%p3 st.global.b32 [ %rd7 + 0 ], { %r20 }; + // end inline asm + .loc 1 45 4 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:4 + ret; +$L__tmp6: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 339 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x14c DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 119 +.b8 118 +.b8 121 +.b8 116 +.b8 52 +.b8 50 +.b8 55 +.b8 51 +.b8 105 +.b8 117 +.b8 51 +.b8 51 +.b8 109 +.b8 112 +.b8 101 +.b8 101 +.b8 55 +.b8 104 +.b8 98 +.b8 101 +.b8 116 +.b8 53 +.b8 106 +.b8 53 +.b8 101 +.b8 113 +.b8 52 +.b8 52 +.b8 100 +.b8 54 +.b8 102 +.b8 115 +.b8 104 +.b8 103 +.b8 119 +.b8 107 +.b8 121 +.b8 120 +.b8 107 +.b8 110 +.b8 53 +.b8 50 +.b8 103 +.b8 103 +.b8 103 +.b8 107 +.b8 105 +.b8 113 +.b8 104 +.b8 106 +.b8 53 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 119 +.b8 118 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x2a DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 114 +.b8 109 +.b8 115 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x10e:0x48 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x123:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp5 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 44 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x13b:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp2 // DW_AT_low_pc +.b64 $L__tmp5 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.source b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.source new file mode 100644 index 0000000000000000000000000000000000000000..de27e59bc4bfffa90557e9fdd59ec98ceafc4977 --- /dev/null +++ b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.source @@ -0,0 +1,167 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0) +#loc35 = loc(unknown) +#loc38 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0) +#loc42 = loc("in_ptr0"(#loc)) +#loc43 = loc("out_ptr0"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc74 = loc("input"(#loc33)) +#loc75 = loc("a"(#loc38)) +#loc76 = loc("b"(#loc38)) +module { + tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 8192 : i32 loc(#loc46) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc47) + %xoffset = tt.get_program_id x : i32 loc(#loc48) + %xoffset_2 = arith.constant 64 : i32 loc(#loc49) + %xoffset_3 = arith.constant 64 : i32 loc(#loc49) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc49) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc50) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc51) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc52) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc52) + %xmask = arith.constant true loc(#loc53) + %xmask_8 = arith.constant dense : tensor<64x4xi1> loc(#loc53) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc54) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc55) + %x0 = arith.constant 32 : i32 loc(#loc56) + %x0_10 = arith.constant 32 : i32 loc(#loc56) + %x0_11 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc56) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc56) + %x1 = arith.constant 32 : i32 loc(#loc57) + %x1_13 = arith.constant 32 : i32 loc(#loc57) + %x1_14 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc57) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc57) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc58) + %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc58) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c4_i32 = arith.constant 4 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c4_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_19 = %_tmp4_16) -> (tensor<64x4xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc60) + %r0_index_20 = arith.addi %r0_index, %r0_base_9 : tensor<1x4xi32> loc(#loc60) + %r0_mask = arith.constant dense<128> : tensor<1x4xi32> loc(#loc61) + %r0_mask_21 = arith.cmpi slt, %r0_index_20, %r0_mask : tensor<1x4xi32> loc(#loc61) + %tmp0 = arith.constant 128 : i32 loc(#loc62) + %tmp0_22 = arith.constant 128 : i32 loc(#loc62) + %tmp0_23 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc62) + %tmp0_24 = arith.muli %tmp0_23, %x0_12 : tensor<64x1xi32> loc(#loc62) + %tmp0_25 = tt.broadcast %r0_index_20 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc63) + %tmp0_26 = tt.broadcast %tmp0_24 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc63) + %tmp0_27 = arith.addi %tmp0_25, %tmp0_26 : tensor<64x4xi32> loc(#loc63) + %tmp0_28 = arith.constant 12288 : i32 loc(#loc64) + %tmp0_29 = arith.constant 12288 : i32 loc(#loc64) + %tmp0_30 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc64) + %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<64x1xi32> loc(#loc64) + %tmp0_32 = tt.broadcast %tmp0_31 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc65) + %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<64x4xi32> loc(#loc65) + %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc66) + %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc66) + %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc67) + %tmp0_37 = tt.broadcast %r0_mask_21 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc67) + %tmp0_38 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc67) + %tmp0_39 = arith.truncf %tmp0_38 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc67) + %tmp0_40 = tt.load %tmp0_35, %tmp0_37, %tmp0_39 evictionPolicy = evict_first : tensor<64x4x!tt.ptr> loc(#loc67) + %tmp0_41 = arith.extf %tmp0_40 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc68) + %tmp2 = arith.mulf %tmp0_41, %tmp0_41 : tensor<64x4xf32> loc(#loc69) + %tmp5 = arith.addf %_tmp4_19, %tmp2 : tensor<64x4xf32> loc(#loc70) + %_tmp4_42 = tt.broadcast %r0_mask_21 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc71) + %_tmp4_43 = arith.select %_tmp4_42, %tmp5, %_tmp4_19 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc71) + scf.yield %_tmp4_43 : tensor<64x4xf32> loc(#loc27) + } loc(#loc59) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc72) + %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc73) + %4 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc30) + %5 = tt.addptr %4, %xindex_7 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc30) + tt.store %5, %tmp4_18 : tensor<64x1x!tt.ptr> loc(#loc31) + tt.return loc(#loc32) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S64_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x4xf32> loc("input"(#loc33))) -> tensor<64xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc34) + tt.reduce.return %2 : f32 loc(#loc34) + }) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc34) + tt.return %0 : tensor<64xf32> loc(#loc36) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64xf32> loc(#loc37) + tt.return %1 : tensor<64xf32> loc(#loc37) + } loc(#loc33) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc38)), %b: f32 loc("b"(#loc38))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc39) + tt.return %0 : f32 loc(#loc40) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc41) + tt.return %1 : f32 loc(#loc41) + } loc(#loc38) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":25:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":30:43) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:8) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11) +#loc37 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4) +#loc39 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc40 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11) +#loc41 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4) +#loc46 = loc("xnumel"(#loc1)) +#loc47 = loc("r0_numel"(#loc2)) +#loc48 = loc("xoffset"(#loc3)) +#loc49 = loc("xoffset"(#loc4)) +#loc50 = loc("xindex"(#loc5)) +#loc51 = loc("xindex"(#loc6)) +#loc52 = loc("xindex"(#loc7)) +#loc53 = loc("xmask"(#loc8)) +#loc54 = loc("r0_base"(#loc9)) +#loc55 = loc("r0_base"(#loc10)) +#loc56 = loc("x0"(#loc11)) +#loc57 = loc("x1"(#loc12)) +#loc58 = loc("_tmp4"(#loc13)) +#loc59 = loc("_tmp4"(#loc14)) +#loc60 = loc("r0_index"(#loc15)) +#loc61 = loc("r0_mask"(#loc16)) +#loc62 = loc("tmp0"(#loc17)) +#loc63 = loc("tmp0"(#loc18)) +#loc64 = loc("tmp0"(#loc19)) +#loc65 = loc("tmp0"(#loc20)) +#loc66 = loc("tmp0"(#loc21)) +#loc67 = loc("tmp0"(#loc22)) +#loc68 = loc("tmp0"(#loc23)) +#loc69 = loc("tmp2"(#loc24)) +#loc70 = loc("tmp5"(#loc25)) +#loc71 = loc("_tmp4"(#loc26)) +#loc72 = loc("tmp4"(#loc28)) +#loc73 = loc("tmp4"(#loc29)) diff --git a/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.ttgir b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..7ba4e37239c9f283234461e18806a84a05f532c6 --- /dev/null +++ b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.ttgir @@ -0,0 +1,121 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 4], order = [0, 1]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0) +#loc1 = loc(unknown) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25) +#loc30 = loc("in_ptr0"(#loc)) +#loc31 = loc("out_ptr0"(#loc)) +#loc32 = loc("xnumel"(#loc)) +#loc33 = loc("r0_numel"(#loc)) +#loc54 = loc("tmp4"(#loc24)) +#loc57 = loc(callsite(#loc1 at #loc54)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<12288> : tensor<64x1xi32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x4xbf16, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<128> : tensor<1x4xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc34) + %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc35) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc36) + %xindex_6 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc36) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc36) + %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc36) + %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked> loc(#loc37) + %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc37) + %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<64x1xi32, #blocked> loc(#loc37) + %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<64x1xi32, #blocked1> loc(#loc37) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc38) + %r0_base_13 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4xi32, #blocked> loc(#loc38) + %x0 = arith.remsi %xindex_11, %cst : tensor<64x1xi32, #blocked> loc(#loc39) + %x1 = arith.divsi %xindex_11, %cst : tensor<64x1xi32, #blocked> loc(#loc40) + %tmp0 = arith.muli %x0, %cst_0 : tensor<64x1xi32, #blocked> loc(#loc41) + %tmp0_14 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc42) + %tmp0_15 = arith.muli %x1, %cst_1 : tensor<64x1xi32, #blocked> loc(#loc43) + %tmp0_16 = tt.broadcast %tmp0_15 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc44) + %tmp0_17 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr, #blocked> loc(#loc45) + %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32 iter_args(%_tmp4_20 = %cst_4) -> (tensor<64x4xf32, #blocked>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32, #blocked> loc(#loc47) + %r0_index_21 = arith.addi %r0_index, %r0_base_13 : tensor<1x4xi32, #blocked> loc(#loc47) + %r0_mask = arith.cmpi slt, %r0_index_21, %cst_3 : tensor<1x4xi32, #blocked> loc(#loc48) + %tmp0_22 = tt.broadcast %r0_index_21 : tensor<1x4xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc42) + %tmp0_23 = arith.addi %tmp0_22, %tmp0_14 : tensor<64x4xi32, #blocked> loc(#loc42) + %tmp0_24 = arith.addi %tmp0_23, %tmp0_16 : tensor<64x4xi32, #blocked> loc(#loc44) + %tmp0_25 = tt.addptr %tmp0_17, %tmp0_24 : tensor<64x4x!tt.ptr, #blocked>, tensor<64x4xi32, #blocked> loc(#loc45) + %tmp0_26 = tt.broadcast %r0_mask : tensor<1x4xi1, #blocked> -> tensor<64x4xi1, #blocked> loc(#loc49) + %tmp0_27 = tt.load %tmp0_25, %tmp0_26, %cst_2 evictionPolicy = evict_first : tensor<64x4x!tt.ptr, #blocked> loc(#loc49) + %tmp0_28 = arith.extf %tmp0_27 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc50) + %tmp2 = arith.mulf %tmp0_28, %tmp0_28 : tensor<64x4xf32, #blocked> loc(#loc51) + %tmp5 = arith.addf %_tmp4_20, %tmp2 : tensor<64x4xf32, #blocked> loc(#loc52) + %_tmp4_29 = arith.select %tmp0_26, %tmp5, %_tmp4_20 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> loc(#loc53) + scf.yield %_tmp4_29 : tensor<64x4xf32, #blocked> loc(#loc22) + } loc(#loc46) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_20: f32 loc(callsite(#loc1 at #loc54)), %tmp4_21: f32 loc(callsite(#loc1 at #loc54))): + %tmp4_22 = arith.addf %tmp4_20, %tmp4_21 : f32 loc(#loc58) + tt.reduce.return %tmp4_22 : f32 loc(#loc56) + }) : (tensor<64x4xf32, #blocked>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc56) + %tmp4_18 = ttg.convert_layout %tmp4 : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc55) + %tmp4_19 = tt.expand_dims %tmp4_18 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc55) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked1> loc(#loc27) + %1 = tt.addptr %0, %xindex_12 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc27) + tt.store %1, %tmp4_19 : tensor<64x1x!tt.ptr, #blocked1> loc(#loc28) + tt.return loc(#loc29) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:8) +#loc23 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4) +#loc34 = loc("xoffset"(#loc2)) +#loc35 = loc("xoffset"(#loc3)) +#loc36 = loc("xindex"(#loc4)) +#loc37 = loc("xindex"(#loc5)) +#loc38 = loc("r0_base"(#loc6)) +#loc39 = loc("x0"(#loc7)) +#loc40 = loc("x1"(#loc8)) +#loc41 = loc("tmp0"(#loc9)) +#loc42 = loc("tmp0"(#loc10)) +#loc43 = loc("tmp0"(#loc11)) +#loc44 = loc("tmp0"(#loc12)) +#loc45 = loc("tmp0"(#loc13)) +#loc46 = loc("_tmp4"(#loc14)) +#loc47 = loc("r0_index"(#loc15)) +#loc48 = loc("r0_mask"(#loc16)) +#loc49 = loc("tmp0"(#loc17)) +#loc50 = loc("tmp0"(#loc18)) +#loc51 = loc("tmp2"(#loc19)) +#loc52 = loc("tmp5"(#loc20)) +#loc53 = loc("_tmp4"(#loc21)) +#loc55 = loc("tmp4"(#loc26)) +#loc56 = loc(callsite(#loc23 at #loc54)) +#loc58 = loc(callsite(#loc25 at #loc56)) diff --git a/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.ttir b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..a343a2f6287aeba89f8550cdfcb3f36d7afc9a69 --- /dev/null +++ b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.ttir @@ -0,0 +1,118 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0) +#loc1 = loc(unknown) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25) +#loc32 = loc("in_ptr0"(#loc)) +#loc33 = loc("out_ptr0"(#loc)) +#loc34 = loc("xnumel"(#loc)) +#loc35 = loc("r0_numel"(#loc)) +#loc58 = loc("tmp4"(#loc26)) +#loc61 = loc(callsite(#loc1 at #loc58)) +module { + tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<64x4xbf16> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc2) + %c128_i32 = arith.constant 128 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %cst_0 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc1) + %cst_1 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<1x4xi32> loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc1) + %cst_4 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc36) + %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc37) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc38) + %xindex_6 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc39) + %xindex_7 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32> loc(#loc40) + %xindex_8 = arith.addi %xindex_7, %xindex_6 : tensor<64x1xi32> loc(#loc40) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc41) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc42) + %x0 = arith.remsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc43) + %x1 = arith.divsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc44) + %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32 iter_args(%_tmp4_11 = %cst_3) -> (tensor<64x4xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc46) + %r0_index_12 = arith.addi %r0_index, %r0_base_9 : tensor<1x4xi32> loc(#loc46) + %r0_mask = arith.cmpi slt, %r0_index_12, %cst_2 : tensor<1x4xi32> loc(#loc47) + %tmp0 = arith.muli %x0, %cst_1 : tensor<64x1xi32> loc(#loc48) + %tmp0_13 = tt.broadcast %r0_index_12 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc49) + %tmp0_14 = tt.broadcast %tmp0 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc49) + %tmp0_15 = arith.addi %tmp0_13, %tmp0_14 : tensor<64x4xi32> loc(#loc49) + %tmp0_16 = arith.muli %x1, %cst_0 : tensor<64x1xi32> loc(#loc50) + %tmp0_17 = tt.broadcast %tmp0_16 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc51) + %tmp0_18 = arith.addi %tmp0_15, %tmp0_17 : tensor<64x4xi32> loc(#loc51) + %tmp0_19 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc52) + %tmp0_20 = tt.addptr %tmp0_19, %tmp0_18 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc52) + %tmp0_21 = tt.broadcast %r0_mask : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc53) + %tmp0_22 = tt.load %tmp0_20, %tmp0_21, %cst evictionPolicy = evict_first : tensor<64x4x!tt.ptr> loc(#loc53) + %tmp0_23 = arith.extf %tmp0_22 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc54) + %tmp2 = arith.mulf %tmp0_23, %tmp0_23 : tensor<64x4xf32> loc(#loc55) + %tmp5 = arith.addf %_tmp4_11, %tmp2 : tensor<64x4xf32> loc(#loc56) + %_tmp4_24 = arith.select %tmp0_21, %tmp5, %_tmp4_11 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc57) + scf.yield %_tmp4_24 : tensor<64x4xf32> loc(#loc24) + } loc(#loc45) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_11: f32 loc(callsite(#loc1 at #loc58)), %tmp4_12: f32 loc(callsite(#loc1 at #loc58))): + %tmp4_13 = arith.addf %tmp4_11, %tmp4_12 : f32 loc(#loc62) + tt.reduce.return %tmp4_13 : f32 loc(#loc60) + }) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc60) + %tmp4_10 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc59) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc29) + %1 = tt.addptr %0, %xindex_8 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc29) + tt.store %1, %tmp4_10 : tensor<64x1x!tt.ptr> loc(#loc30) + tt.return loc(#loc31) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":32:43) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:27) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":33:31) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:8) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc27 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4) +#loc36 = loc("xoffset"(#loc3)) +#loc37 = loc("xoffset"(#loc4)) +#loc38 = loc("xindex"(#loc5)) +#loc39 = loc("xindex"(#loc6)) +#loc40 = loc("xindex"(#loc7)) +#loc41 = loc("r0_base"(#loc8)) +#loc42 = loc("r0_base"(#loc9)) +#loc43 = loc("x0"(#loc10)) +#loc44 = loc("x1"(#loc11)) +#loc45 = loc("_tmp4"(#loc2)) +#loc46 = loc("r0_index"(#loc12)) +#loc47 = loc("r0_mask"(#loc13)) +#loc48 = loc("tmp0"(#loc14)) +#loc49 = loc("tmp0"(#loc15)) +#loc50 = loc("tmp0"(#loc16)) +#loc51 = loc("tmp0"(#loc17)) +#loc52 = loc("tmp0"(#loc18)) +#loc53 = loc("tmp0"(#loc19)) +#loc54 = loc("tmp0"(#loc20)) +#loc55 = loc("tmp2"(#loc21)) +#loc56 = loc("tmp5"(#loc22)) +#loc57 = loc("_tmp4"(#loc23)) +#loc59 = loc("tmp4"(#loc28)) +#loc60 = loc(callsite(#loc25 at #loc58)) +#loc62 = loc(callsite(#loc27 at #loc60)) diff --git a/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/__grp__triton_poi_fused_add_mul_1.json b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/__grp__triton_poi_fused_add_mul_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5a6973da0a4ac172ff5cac31106b99eb6547e17f --- /dev/null +++ b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/__grp__triton_poi_fused_add_mul_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_add_mul_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.source", "triton_poi_fused_add_mul_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.ttir", "triton_poi_fused_add_mul_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.ttgir", "triton_poi_fused_add_mul_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.llir", "triton_poi_fused_add_mul_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.ptx", "triton_poi_fused_add_mul_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.cubin", "triton_poi_fused_add_mul_1.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.json"}} \ No newline at end of file diff --git a/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.cubin b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..b89626598a92bf1ac9449550d720ba4b5e158bef Binary files /dev/null and b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.cubin differ diff --git a/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.json b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b4c6a19a5557b2edaccdedec3a69e4675c040cd3 --- /dev/null +++ b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.json @@ -0,0 +1 @@ +{"hash": "241c40e46f60eb2bf8892ac2d563a3a29372ae52e48bc84330a37c0aed2b3a93", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_add_mul_1"} \ No newline at end of file diff --git a/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.llir b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..1ba45455eecc075febee7d7d53941a965027b76b --- /dev/null +++ b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.llir @@ -0,0 +1,118 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_add_mul_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = shl i32 %8, 10, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = shl nuw nsw i32 %10, 3, !dbg !9 + %12 = and i32 %11, 1016, !dbg !9 + %13 = or disjoint i32 %12, %9, !dbg !10 + %14 = srem i32 %13, 4096, !dbg !11 + %15 = sext i32 %13 to i64, !dbg !12 + %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !12 + %17 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %16) #2, !dbg !13 + %18 = extractvalue { i32, i32, i32, i32 } %17, 0, !dbg !13 + %19 = bitcast i32 %18 to <2 x bfloat>, !dbg !13 + %20 = extractvalue { i32, i32, i32, i32 } %17, 1, !dbg !13 + %21 = bitcast i32 %20 to <2 x bfloat>, !dbg !13 + %22 = extractvalue { i32, i32, i32, i32 } %17, 2, !dbg !13 + %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !13 + %24 = extractvalue { i32, i32, i32, i32 } %17, 3, !dbg !13 + %25 = bitcast i32 %24 to <2 x bfloat>, !dbg !13 + %26 = sext i32 %14 to i64, !dbg !14 + %27 = getelementptr bfloat, ptr addrspace(1) %1, i64 %26, !dbg !14 + %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !15 + %29 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ], $5;", "=r,=r,=r,=r,l,l"(ptr addrspace(1) %27, i64 %28) #2, !dbg !15 + %30 = extractvalue { i32, i32, i32, i32 } %29, 0, !dbg !15 + %31 = bitcast i32 %30 to <2 x bfloat>, !dbg !15 + %32 = extractvalue { i32, i32, i32, i32 } %29, 1, !dbg !15 + %33 = bitcast i32 %32 to <2 x bfloat>, !dbg !15 + %34 = extractvalue { i32, i32, i32, i32 } %29, 2, !dbg !15 + %35 = bitcast i32 %34 to <2 x bfloat>, !dbg !15 + %36 = extractvalue { i32, i32, i32, i32 } %29, 3, !dbg !15 + %37 = bitcast i32 %36 to <2 x bfloat>, !dbg !15 + %38 = getelementptr bfloat, ptr addrspace(1) %2, i64 %15, !dbg !16 + %39 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %38) #2, !dbg !17 + %40 = extractvalue { i32, i32, i32, i32 } %39, 0, !dbg !17 + %41 = bitcast i32 %40 to <2 x bfloat>, !dbg !17 + %42 = extractvalue { i32, i32, i32, i32 } %39, 1, !dbg !17 + %43 = bitcast i32 %42 to <2 x bfloat>, !dbg !17 + %44 = extractvalue { i32, i32, i32, i32 } %39, 2, !dbg !17 + %45 = bitcast i32 %44 to <2 x bfloat>, !dbg !17 + %46 = extractvalue { i32, i32, i32, i32 } %39, 3, !dbg !17 + %47 = bitcast i32 %46 to <2 x bfloat>, !dbg !17 + %48 = getelementptr bfloat, ptr addrspace(1) %3, i64 %15, !dbg !18 + %49 = fpext <2 x bfloat> %19 to <2 x float>, !dbg !19 + %50 = fpext <2 x bfloat> %31 to <2 x float>, !dbg !20 + %51 = fpext <2 x bfloat> %41 to <2 x float>, !dbg !21 + %52 = fmul <2 x float> %50, %51, !dbg !22 + %53 = fadd <2 x float> %52, %49, !dbg !23 + %54 = fptrunc <2 x float> %53 to <2 x bfloat>, !dbg !24 + %55 = fpext <2 x bfloat> %21 to <2 x float>, !dbg !19 + %56 = fpext <2 x bfloat> %33 to <2 x float>, !dbg !20 + %57 = fpext <2 x bfloat> %43 to <2 x float>, !dbg !21 + %58 = fmul <2 x float> %56, %57, !dbg !22 + %59 = fadd <2 x float> %58, %55, !dbg !23 + %60 = fptrunc <2 x float> %59 to <2 x bfloat>, !dbg !24 + %61 = fpext <2 x bfloat> %23 to <2 x float>, !dbg !19 + %62 = fpext <2 x bfloat> %35 to <2 x float>, !dbg !20 + %63 = fpext <2 x bfloat> %45 to <2 x float>, !dbg !21 + %64 = fmul <2 x float> %62, %63, !dbg !22 + %65 = fadd <2 x float> %64, %61, !dbg !23 + %66 = fptrunc <2 x float> %65 to <2 x bfloat>, !dbg !24 + %67 = fpext <2 x bfloat> %25 to <2 x float>, !dbg !19 + %68 = fpext <2 x bfloat> %37 to <2 x float>, !dbg !20 + %69 = fpext <2 x bfloat> %47 to <2 x float>, !dbg !21 + %70 = fmul <2 x float> %68, %69, !dbg !22 + %71 = fadd <2 x float> %70, %67, !dbg !23 + %72 = fptrunc <2 x float> %71 to <2 x bfloat>, !dbg !24 + %73 = bitcast <2 x bfloat> %54 to i32, !dbg !24 + %74 = bitcast <2 x bfloat> %60 to i32, !dbg !24 + %75 = bitcast <2 x bfloat> %66 to i32, !dbg !24 + %76 = bitcast <2 x bfloat> %72 to i32, !dbg !24 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %73, i32 %74, i32 %75, i32 %76, ptr addrspace(1) %48) #2, !dbg !24 + ret void, !dbg !25 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_add_mul_1", linkageName: "triton_poi_fused_add_mul_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 19, scope: !4) +!12 = !DILocation(line: 25, column: 30, scope: !4) +!13 = !DILocation(line: 25, column: 35, scope: !4) +!14 = !DILocation(line: 26, column: 30, scope: !4) +!15 = !DILocation(line: 26, column: 35, scope: !4) +!16 = !DILocation(line: 27, column: 30, scope: !4) +!17 = !DILocation(line: 27, column: 35, scope: !4) +!18 = !DILocation(line: 30, column: 25, scope: !4) +!19 = !DILocation(line: 25, column: 44, scope: !4) +!20 = !DILocation(line: 26, column: 74, scope: !4) +!21 = !DILocation(line: 27, column: 44, scope: !4) +!22 = !DILocation(line: 28, column: 18, scope: !4) +!23 = !DILocation(line: 29, column: 18, scope: !4) +!24 = !DILocation(line: 30, column: 36, scope: !4) +!25 = !DILocation(line: 30, column: 4, scope: !4) diff --git a/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.ptx b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..0223689fa51d9eb5bd49ef8290f88509b10135ce --- /dev/null +++ b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.ptx @@ -0,0 +1,407 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused_add_mul_1 // -- Begin function triton_poi_fused_add_mul_1 + // @triton_poi_fused_add_mul_1 +.visible .entry triton_poi_fused_add_mul_1( + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_1_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_1_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_1_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_1_param_3, + .param .u32 triton_poi_fused_add_mul_1_param_4, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_1_param_5, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_1_param_6 +) +.reqntid 128 +{ + .reg .b16 %rs<25>; + .reg .b32 %r<60>; + .reg .b64 %rd<11>; + .loc 1 18 0 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:18:0 + +// %bb.0: + ld.param.b64 %rd6, [triton_poi_fused_add_mul_1_param_0]; + ld.param.b64 %rd7, [triton_poi_fused_add_mul_1_param_1]; +$L__tmp0: + .loc 1 20 28 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:20:28 + mov.u32 %r17, %ctaid.x; + .loc 1 20 33 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:20:33 + shl.b32 %r18, %r17, 10; + ld.param.b64 %rd8, [triton_poi_fused_add_mul_1_param_2]; + ld.param.b64 %rd9, [triton_poi_fused_add_mul_1_param_3]; + .loc 1 21 36 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:21:36 + mov.u32 %r19, %tid.x; + shl.b32 %r20, %r19, 3; + and.b32 %r21, %r20, 1016; + .loc 1 21 23 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:21:23 + or.b32 %r22, %r21, %r18; + .loc 1 24 19 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:24:19 + bfe.s32 %r23, %r17, 21, 1; + shr.u32 %r24, %r23, 20; + add.s32 %r25, %r22, %r24; + and.b32 %r26, %r25, -4096; + sub.s32 %r27, %r22, %r26; + .loc 1 25 30 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:25:30 + mul.wide.s32 %rd10, %r22, 2; + add.s64 %rd1, %rd6, %rd10; + .loc 1 25 35 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:25:35 + // begin inline asm + mov.u32 %r1, 0x0; + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + ld.global.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ]; + // end inline asm + .loc 1 26 30 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:26:30 + mad.wide.s32 %rd2, %r27, 2, %rd7; + .loc 1 26 35 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:26:35 + // begin inline asm + mov.u64 %rd3, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd3, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r5, 0x0; + mov.u32 %r6, 0x0; + mov.u32 %r7, 0x0; + mov.u32 %r8, 0x0; + ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r5, %r6, %r7, %r8 }, [ %rd2 + 0 ], %rd3; + // end inline asm + .loc 1 27 30 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:27:30 + add.s64 %rd4, %rd8, %rd10; + .loc 1 27 35 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:27:35 + // begin inline asm + mov.u32 %r9, 0x0; + mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + mov.u32 %r12, 0x0; + ld.global.v4.b32 { %r9, %r10, %r11, %r12 }, [ %rd4 + 0 ]; + // end inline asm + .loc 1 30 25 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:30:25 + add.s64 %rd5, %rd9, %rd10; + .loc 1 25 44 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:25:44 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r28, %rs2; + cvt.f32.bf16 %r29, %rs1; + .loc 1 26 74 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:26:74 + mov.b32 {%rs3, %rs4}, %r5; + cvt.f32.bf16 %r30, %rs4; + cvt.f32.bf16 %r31, %rs3; + .loc 1 27 44 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:27:44 + mov.b32 {%rs5, %rs6}, %r9; + cvt.f32.bf16 %r32, %rs6; + cvt.f32.bf16 %r33, %rs5; + .loc 1 29 18 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:29:18 + fma.rn.f32 %r34, %r31, %r33, %r29; + fma.rn.f32 %r35, %r30, %r32, %r28; + .loc 1 30 36 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:30:36 + cvt.rn.bf16x2.f32 %r13, %r35, %r34; + .loc 1 25 44 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:25:44 + mov.b32 {%rs7, %rs8}, %r2; + cvt.f32.bf16 %r36, %rs8; + cvt.f32.bf16 %r37, %rs7; + .loc 1 26 74 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:26:74 + mov.b32 {%rs9, %rs10}, %r6; + cvt.f32.bf16 %r38, %rs10; + cvt.f32.bf16 %r39, %rs9; + .loc 1 27 44 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:27:44 + mov.b32 {%rs11, %rs12}, %r10; + cvt.f32.bf16 %r40, %rs12; + cvt.f32.bf16 %r41, %rs11; + .loc 1 29 18 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:29:18 + fma.rn.f32 %r42, %r39, %r41, %r37; + fma.rn.f32 %r43, %r38, %r40, %r36; + .loc 1 30 36 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:30:36 + cvt.rn.bf16x2.f32 %r14, %r43, %r42; + .loc 1 25 44 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:25:44 + mov.b32 {%rs13, %rs14}, %r3; + cvt.f32.bf16 %r44, %rs14; + cvt.f32.bf16 %r45, %rs13; + .loc 1 26 74 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:26:74 + mov.b32 {%rs15, %rs16}, %r7; + cvt.f32.bf16 %r46, %rs16; + cvt.f32.bf16 %r47, %rs15; + .loc 1 27 44 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:27:44 + mov.b32 {%rs17, %rs18}, %r11; + cvt.f32.bf16 %r48, %rs18; + cvt.f32.bf16 %r49, %rs17; + .loc 1 29 18 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:29:18 + fma.rn.f32 %r50, %r47, %r49, %r45; + fma.rn.f32 %r51, %r46, %r48, %r44; + .loc 1 30 36 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:30:36 + cvt.rn.bf16x2.f32 %r15, %r51, %r50; + .loc 1 25 44 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:25:44 + mov.b32 {%rs19, %rs20}, %r4; + cvt.f32.bf16 %r52, %rs20; + cvt.f32.bf16 %r53, %rs19; + .loc 1 26 74 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:26:74 + mov.b32 {%rs21, %rs22}, %r8; + cvt.f32.bf16 %r54, %rs22; + cvt.f32.bf16 %r55, %rs21; + .loc 1 27 44 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:27:44 + mov.b32 {%rs23, %rs24}, %r12; + cvt.f32.bf16 %r56, %rs24; + cvt.f32.bf16 %r57, %rs23; + .loc 1 29 18 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:29:18 + fma.rn.f32 %r58, %r55, %r57, %r53; + fma.rn.f32 %r59, %r54, %r56, %r52; + .loc 1 30 36 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:30:36 + cvt.rn.bf16x2.f32 %r16, %r59, %r58; + // begin inline asm + st.global.v4.b32 [ %rd5 + 0 ], { %r13, %r14, %r15, %r16 }; + // end inline asm + .loc 1 30 4 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:30:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 55 +.b8 102 +.b8 102 +.b8 52 +.b8 105 +.b8 98 +.b8 54 +.b8 54 +.b8 53 +.b8 50 +.b8 111 +.b8 106 +.b8 108 +.b8 108 +.b8 117 +.b8 116 +.b8 109 +.b8 52 +.b8 99 +.b8 55 +.b8 109 +.b8 107 +.b8 122 +.b8 122 +.b8 112 +.b8 121 +.b8 98 +.b8 111 +.b8 110 +.b8 100 +.b8 51 +.b8 112 +.b8 97 +.b8 103 +.b8 117 +.b8 51 +.b8 103 +.b8 108 +.b8 115 +.b8 112 +.b8 119 +.b8 51 +.b8 115 +.b8 122 +.b8 116 +.b8 107 +.b8 102 +.b8 101 +.b8 50 +.b8 122 +.b8 97 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 55 +.b8 102 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.source b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.source new file mode 100644 index 0000000000000000000000000000000000000000..9875c91d6e947ff61a2cfb4412b5ecff5ba79f09 --- /dev/null +++ b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.source @@ -0,0 +1,82 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":18:0) +#loc22 = loc("in_ptr0"(#loc)) +#loc23 = loc("in_ptr1"(#loc)) +#loc24 = loc("in_ptr2"(#loc)) +#loc25 = loc("out_ptr0"(#loc)) +#loc26 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_mul_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 8388608 : i32 loc(#loc27) + %xoffset = tt.get_program_id x : i32 loc(#loc28) + %xoffset_1 = arith.constant 1024 : i32 loc(#loc29) + %xoffset_2 = arith.constant 1024 : i32 loc(#loc29) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc29) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc30) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc31) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc31) + %xmask = arith.constant true loc(#loc32) + %xmask_6 = arith.constant dense : tensor<1024xi1> loc(#loc32) + %x0 = arith.constant 4096 : i32 loc(#loc33) + %x0_7 = arith.constant 4096 : i32 loc(#loc33) + %x0_8 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc33) + %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<1024xi32> loc(#loc33) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc34) + %tmp0_10 = tt.addptr %tmp0, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc34) + %tmp0_11 = tt.load %tmp0_10 : tensor<1024x!tt.ptr> loc(#loc35) + %tmp0_12 = arith.extf %tmp0_11 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc36) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc37) + %tmp1_13 = tt.addptr %tmp1, %x0_9 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc37) + %tmp1_14 = tt.load %tmp1_13 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc38) + %tmp1_15 = arith.extf %tmp1_14 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc39) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc40) + %tmp2_16 = tt.addptr %tmp2, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc40) + %tmp2_17 = tt.load %tmp2_16 : tensor<1024x!tt.ptr> loc(#loc41) + %tmp2_18 = arith.extf %tmp2_17 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc42) + %tmp3 = arith.mulf %tmp1_15, %tmp2_18 : tensor<1024xf32> loc(#loc43) + %tmp4 = arith.addf %tmp0_12, %tmp3 : tensor<1024xf32> loc(#loc44) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc19) + %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc19) + %2 = arith.truncf %tmp4 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc20) + tt.store %1, %2 : tensor<1024x!tt.ptr> loc(#loc20) + tt.return loc(#loc21) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":22:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":24:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:30) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:35) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:44) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:30) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:35) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:74) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:35) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:44) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":28:18) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":29:18) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:25) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:36) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:4) +#loc27 = loc("xnumel"(#loc1)) +#loc28 = loc("xoffset"(#loc2)) +#loc29 = loc("xoffset"(#loc3)) +#loc30 = loc("xindex"(#loc4)) +#loc31 = loc("xindex"(#loc5)) +#loc32 = loc("xmask"(#loc6)) +#loc33 = loc("x0"(#loc7)) +#loc34 = loc("tmp0"(#loc8)) +#loc35 = loc("tmp0"(#loc9)) +#loc36 = loc("tmp0"(#loc10)) +#loc37 = loc("tmp1"(#loc11)) +#loc38 = loc("tmp1"(#loc12)) +#loc39 = loc("tmp1"(#loc13)) +#loc40 = loc("tmp2"(#loc14)) +#loc41 = loc("tmp2"(#loc15)) +#loc42 = loc("tmp2"(#loc16)) +#loc43 = loc("tmp3"(#loc17)) +#loc44 = loc("tmp4"(#loc18)) diff --git a/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.ttgir b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..2bd931d206dbaacc44db3109cd29764423901b3b --- /dev/null +++ b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.ttgir @@ -0,0 +1,74 @@ +#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":18:0) +#loc21 = loc("in_ptr0"(#loc)) +#loc22 = loc("in_ptr1"(#loc)) +#loc23 = loc("in_ptr2"(#loc)) +#loc24 = loc("out_ptr0"(#loc)) +#loc25 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_add_mul_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4096> : tensor<1024xi32, #blocked> loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc26) + %xoffset_0 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc27) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc28) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<1024xi32, #blocked> loc(#loc29) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<1024xi32, #blocked> loc(#loc29) + %x0 = arith.remsi %xindex_2, %cst : tensor<1024xi32, #blocked> loc(#loc30) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc31) + %tmp0_3 = tt.addptr %tmp0, %xindex_2 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc31) + %tmp0_4 = tt.load %tmp0_3 : tensor<1024x!tt.ptr, #blocked> loc(#loc32) + %tmp0_5 = arith.extf %tmp0_4 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc33) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc34) + %tmp1_6 = tt.addptr %tmp1, %x0 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc34) + %tmp1_7 = tt.load %tmp1_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc35) + %tmp1_8 = arith.extf %tmp1_7 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc36) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc37) + %tmp2_9 = tt.addptr %tmp2, %xindex_2 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc37) + %tmp2_10 = tt.load %tmp2_9 : tensor<1024x!tt.ptr, #blocked> loc(#loc38) + %tmp2_11 = arith.extf %tmp2_10 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc39) + %tmp3 = arith.mulf %tmp1_8, %tmp2_11 : tensor<1024xf32, #blocked> loc(#loc40) + %tmp4 = arith.addf %tmp0_5, %tmp3 : tensor<1024xf32, #blocked> loc(#loc41) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc18) + %1 = tt.addptr %0, %xindex_2 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc18) + %2 = arith.truncf %tmp4 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> loc(#loc19) + tt.store %1, %2 : tensor<1024x!tt.ptr, #blocked> loc(#loc19) + tt.return loc(#loc20) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":24:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:30) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:35) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:44) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:30) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:74) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:30) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:35) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:44) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":28:18) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":29:18) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:25) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:36) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:4) +#loc26 = loc("xoffset"(#loc2)) +#loc27 = loc("xoffset"(#loc3)) +#loc28 = loc("xindex"(#loc4)) +#loc29 = loc("xindex"(#loc5)) +#loc30 = loc("x0"(#loc6)) +#loc31 = loc("tmp0"(#loc7)) +#loc32 = loc("tmp0"(#loc8)) +#loc33 = loc("tmp0"(#loc9)) +#loc34 = loc("tmp1"(#loc10)) +#loc35 = loc("tmp1"(#loc11)) +#loc36 = loc("tmp1"(#loc12)) +#loc37 = loc("tmp2"(#loc13)) +#loc38 = loc("tmp2"(#loc14)) +#loc39 = loc("tmp2"(#loc15)) +#loc40 = loc("tmp3"(#loc16)) +#loc41 = loc("tmp4"(#loc17)) diff --git a/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.ttir b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..cab525834fbe60a064cdc220ae6ad53066fcf24e --- /dev/null +++ b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.ttir @@ -0,0 +1,73 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":18:0) +#loc21 = loc("in_ptr0"(#loc)) +#loc22 = loc("in_ptr1"(#loc)) +#loc23 = loc("in_ptr2"(#loc)) +#loc24 = loc("out_ptr0"(#loc)) +#loc25 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_mul_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %x0 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc26) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc2) + %xoffset = tt.get_program_id x : i32 loc(#loc27) + %xoffset_0 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc28) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc29) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<1024xi32> loc(#loc30) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<1024xi32> loc(#loc30) + %x0_3 = arith.remsi %xindex_2, %x0 : tensor<1024xi32> loc(#loc26) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc31) + %tmp0_4 = tt.addptr %tmp0, %xindex_2 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc31) + %tmp0_5 = tt.load %tmp0_4 : tensor<1024x!tt.ptr> loc(#loc32) + %tmp0_6 = arith.extf %tmp0_5 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc33) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc34) + %tmp1_7 = tt.addptr %tmp1, %x0_3 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc34) + %tmp1_8 = tt.load %tmp1_7 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc35) + %tmp1_9 = arith.extf %tmp1_8 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc36) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc37) + %tmp2_10 = tt.addptr %tmp2, %xindex_2 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc37) + %tmp2_11 = tt.load %tmp2_10 : tensor<1024x!tt.ptr> loc(#loc38) + %tmp2_12 = arith.extf %tmp2_11 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc39) + %tmp3 = arith.mulf %tmp1_9, %tmp2_12 : tensor<1024xf32> loc(#loc40) + %tmp4 = arith.addf %tmp0_6, %tmp3 : tensor<1024xf32> loc(#loc41) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc18) + %1 = tt.addptr %0, %xindex_2 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc18) + %2 = arith.truncf %tmp4 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc19) + tt.store %1, %2 : tensor<1024x!tt.ptr> loc(#loc19) + tt.return loc(#loc20) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":24:19) +#loc2 = loc(unknown) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":20:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":20:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":21:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":21:23) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:30) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:35) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:44) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:30) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:74) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:30) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:35) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:44) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":28:18) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":29:18) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:25) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:36) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:4) +#loc26 = loc("x0"(#loc1)) +#loc27 = loc("xoffset"(#loc3)) +#loc28 = loc("xoffset"(#loc4)) +#loc29 = loc("xindex"(#loc5)) +#loc30 = loc("xindex"(#loc6)) +#loc31 = loc("tmp0"(#loc7)) +#loc32 = loc("tmp0"(#loc8)) +#loc33 = loc("tmp0"(#loc9)) +#loc34 = loc("tmp1"(#loc10)) +#loc35 = loc("tmp1"(#loc11)) +#loc36 = loc("tmp1"(#loc12)) +#loc37 = loc("tmp2"(#loc13)) +#loc38 = loc("tmp2"(#loc14)) +#loc39 = loc("tmp2"(#loc15)) +#loc40 = loc("tmp3"(#loc16)) +#loc41 = loc("tmp4"(#loc17)) diff --git a/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b3029a86141c66d076edeed93f1ae540e3f700b5 --- /dev/null +++ b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_add_mul_native_layer_norm_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.source", "triton_red_fused_add_mul_native_layer_norm_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.ttir", "triton_red_fused_add_mul_native_layer_norm_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir", "triton_red_fused_add_mul_native_layer_norm_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.llir", "triton_red_fused_add_mul_native_layer_norm_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.ptx", "triton_red_fused_add_mul_native_layer_norm_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.cubin", "triton_red_fused_add_mul_native_layer_norm_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.json"}} \ No newline at end of file diff --git a/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.cubin b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..68bd792892dae5d1d40cccf5bea7af9065ebbefe Binary files /dev/null and b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.cubin differ diff --git a/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.json b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.json new file mode 100644 index 0000000000000000000000000000000000000000..312fa5e84bd5d67a7574ac12a3ccaba6ed604438 --- /dev/null +++ b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.json @@ -0,0 +1 @@ +{"hash": "3faa6d7a76d0dd5fc76fbd5e51dfc7c8aa82497dce159b2e965670e6889e6c97", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 192, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_add_mul_native_layer_norm_0"} \ No newline at end of file diff --git a/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.llir b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..f7820abb34d8dd7637be0088fdf9db884a3c3fd8 --- /dev/null +++ b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.llir @@ -0,0 +1,620 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_add_mul_native_layer_norm_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 { +__nv_rsqrtf.exit: + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %12 = icmp samesign ult i32 %11, 256, !dbg !9 + %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %14 = shl nuw nsw i32 %13, 2, !dbg !10 + %15 = and i32 %14, 2044, !dbg !10 + %16 = shl i32 %11, 12, !dbg !11 + %17 = zext nneg i32 %15 to i64, !dbg !12 + %18 = sext i32 %16 to i64, !dbg !12 + %19 = or disjoint i64 %17, %18, !dbg !13 + %20 = getelementptr bfloat, ptr addrspace(1) %0, i64 %19, !dbg !14 + %21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !15 + %22 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %20, i64 %21, i1 %12) #6, !dbg !15 + %23 = getelementptr bfloat, ptr addrspace(1) %1, i64 %17, !dbg !16 + %24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17 + %25 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %23, i64 %24, i1 true) #6, !dbg !17 + %26 = getelementptr bfloat, ptr addrspace(1) %2, i64 %19, !dbg !18 + %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !19 + %28 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %26, i64 %27, i1 %12) #6, !dbg !19 + %29 = extractvalue { i32, i32 } %25, 1, !dbg !17 + %30 = bitcast i32 %29 to <2 x bfloat>, !dbg !17 + %31 = extractvalue { i32, i32 } %28, 1, !dbg !19 + %32 = bitcast i32 %31 to <2 x bfloat>, !dbg !19 + %33 = extractvalue { i32, i32 } %22, 1, !dbg !15 + %34 = bitcast i32 %33 to <2 x bfloat>, !dbg !15 + %35 = extractvalue { i32, i32 } %25, 0, !dbg !17 + %36 = bitcast i32 %35 to <2 x bfloat>, !dbg !17 + %37 = extractvalue { i32, i32 } %28, 0, !dbg !19 + %38 = bitcast i32 %37 to <2 x bfloat>, !dbg !19 + %39 = extractvalue { i32, i32 } %22, 0, !dbg !15 + %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !15 + %41 = getelementptr bfloat, ptr addrspace(1) %5, i64 %19, !dbg !20 + %42 = fpext <2 x bfloat> %36 to <2 x float>, !dbg !21 + %43 = fpext <2 x bfloat> %38 to <2 x float>, !dbg !22 + %44 = fmul <2 x float> %42, %43, !dbg !23 + %45 = fpext <2 x bfloat> %40 to <2 x float>, !dbg !24 + %46 = fadd <2 x float> %44, %45, !dbg !25 + %47 = extractelement <2 x float> %46, i64 0, !dbg !26 + %48 = select i1 %12, float %47, float 0.000000e+00, !dbg !26 + %49 = extractelement <2 x float> %46, i64 1, !dbg !26 + %50 = select i1 %12, float %49, float 0.000000e+00, !dbg !26 + %51 = fptrunc <2 x float> %46 to <2 x bfloat>, !dbg !27 + %52 = fpext <2 x bfloat> %30 to <2 x float>, !dbg !21 + %53 = fpext <2 x bfloat> %32 to <2 x float>, !dbg !22 + %54 = fmul <2 x float> %52, %53, !dbg !23 + %55 = fpext <2 x bfloat> %34 to <2 x float>, !dbg !24 + %56 = fadd <2 x float> %54, %55, !dbg !25 + %57 = extractelement <2 x float> %56, i64 0, !dbg !26 + %58 = select i1 %12, float %57, float 0.000000e+00, !dbg !26 + %59 = extractelement <2 x float> %56, i64 1, !dbg !26 + %60 = select i1 %12, float %59, float 0.000000e+00, !dbg !26 + %61 = fptrunc <2 x float> %56 to <2 x bfloat>, !dbg !27 + %62 = bitcast <2 x bfloat> %51 to i32, !dbg !27 + %63 = bitcast <2 x bfloat> %61 to i32, !dbg !27 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %62, i32 %63, ptr addrspace(1) %41, i1 %12) #6, !dbg !27 + %64 = or disjoint i64 %17, 2048, !dbg !28 + %65 = or disjoint i64 %64, %18, !dbg !13 + %66 = getelementptr bfloat, ptr addrspace(1) %0, i64 %65, !dbg !14 + %67 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !15 + %68 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %66, i64 %67, i1 %12) #6, !dbg !15 + %69 = extractvalue { i32, i32 } %68, 0, !dbg !15 + %70 = bitcast i32 %69 to <2 x bfloat>, !dbg !15 + %71 = extractvalue { i32, i32 } %68, 1, !dbg !15 + %72 = bitcast i32 %71 to <2 x bfloat>, !dbg !15 + %73 = getelementptr bfloat, ptr addrspace(1) %1, i64 %64, !dbg !16 + %74 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17 + %75 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %73, i64 %74, i1 true) #6, !dbg !17 + %76 = extractvalue { i32, i32 } %75, 0, !dbg !17 + %77 = bitcast i32 %76 to <2 x bfloat>, !dbg !17 + %78 = extractvalue { i32, i32 } %75, 1, !dbg !17 + %79 = bitcast i32 %78 to <2 x bfloat>, !dbg !17 + %80 = getelementptr bfloat, ptr addrspace(1) %2, i64 %65, !dbg !18 + %81 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !19 + %82 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %80, i64 %81, i1 %12) #6, !dbg !19 + %83 = extractvalue { i32, i32 } %82, 0, !dbg !19 + %84 = bitcast i32 %83 to <2 x bfloat>, !dbg !19 + %85 = extractvalue { i32, i32 } %82, 1, !dbg !19 + %86 = bitcast i32 %85 to <2 x bfloat>, !dbg !19 + %87 = select i1 %12, float 2.000000e+00, float 1.000000e+00, !dbg !29 + %88 = select i1 %12, float 2.000000e+00, float 0.000000e+00, !dbg !29 + %89 = select i1 %12, float 2.000000e+00, float 0.000000e+00, !dbg !29 + %90 = select i1 %12, float 2.000000e+00, float 0.000000e+00, !dbg !29 + %91 = select i1 %12, float 2.000000e+00, float 0.000000e+00, !dbg !29 + %92 = getelementptr bfloat, ptr addrspace(1) %5, i64 %65, !dbg !20 + %93 = fpext <2 x bfloat> %70 to <2 x float>, !dbg !24 + %94 = fpext <2 x bfloat> %77 to <2 x float>, !dbg !21 + %95 = fpext <2 x bfloat> %84 to <2 x float>, !dbg !22 + %96 = fmul <2 x float> %94, %95, !dbg !23 + %97 = fadd <2 x float> %96, %93, !dbg !25 + %98 = extractelement <2 x float> %97, i64 0, !dbg !30 + %99 = fsub float %98, %48, !dbg !35 + %100 = tail call float @llvm.nvvm.div.full(float %99, float %87), !dbg !36 + %101 = fadd float %48, %100, !dbg !37 + %102 = fsub float %98, %101, !dbg !30 + %103 = fmul float %99, %102, !dbg !38 + %104 = fadd float %103, 0.000000e+00, !dbg !39 + %105 = extractelement <2 x float> %97, i64 1, !dbg !30 + %106 = fsub float %105, %50, !dbg !35 + %107 = tail call float @llvm.nvvm.div.full(float %106, float %87), !dbg !36 + %108 = fadd float %50, %107, !dbg !37 + %109 = fsub float %105, %108, !dbg !30 + %110 = fmul float %106, %109, !dbg !38 + %111 = fadd float %110, 0.000000e+00, !dbg !39 + %112 = select i1 %12, float %101, float 0.000000e+00, !dbg !26 + %113 = select i1 %12, float %108, float 0.000000e+00, !dbg !26 + %114 = fptrunc <2 x float> %97 to <2 x bfloat>, !dbg !27 + %115 = fpext <2 x bfloat> %72 to <2 x float>, !dbg !24 + %116 = fpext <2 x bfloat> %79 to <2 x float>, !dbg !21 + %117 = fpext <2 x bfloat> %86 to <2 x float>, !dbg !22 + %118 = fmul <2 x float> %116, %117, !dbg !23 + %119 = fadd <2 x float> %118, %115, !dbg !25 + %120 = extractelement <2 x float> %119, i64 0, !dbg !30 + %121 = fsub float %120, %58, !dbg !35 + %122 = tail call float @llvm.nvvm.div.full(float %121, float %87), !dbg !36 + %123 = fadd float %58, %122, !dbg !37 + %124 = fsub float %120, %123, !dbg !30 + %125 = fmul float %121, %124, !dbg !38 + %126 = fadd float %125, 0.000000e+00, !dbg !39 + %127 = extractelement <2 x float> %119, i64 1, !dbg !30 + %128 = fsub float %127, %60, !dbg !35 + %129 = tail call float @llvm.nvvm.div.full(float %128, float %87), !dbg !36 + %130 = fadd float %60, %129, !dbg !37 + %131 = fsub float %127, %130, !dbg !30 + %132 = fmul float %128, %131, !dbg !38 + %133 = fadd float %132, 0.000000e+00, !dbg !39 + %134 = select i1 %12, float %123, float 0.000000e+00, !dbg !26 + %135 = select i1 %12, float %130, float 0.000000e+00, !dbg !26 + %136 = select i1 %12, float %126, float 0.000000e+00, !dbg !40 + %137 = select i1 %12, float %133, float 0.000000e+00, !dbg !40 + %138 = fptrunc <2 x float> %119 to <2 x bfloat>, !dbg !27 + %139 = bitcast <2 x bfloat> %114 to i32, !dbg !27 + %140 = bitcast <2 x bfloat> %138 to i32, !dbg !27 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %139, i32 %140, ptr addrspace(1) %92, i1 %12) #6, !dbg !27 + %141 = and i32 %13, 511, !dbg !10 + %142 = and i32 %13, 31, !dbg !10 + %143 = lshr i32 %141, 5, !dbg !10 + %144 = fsub float %113, %112, !dbg !41 + %145 = select i1 %12, float 4.000000e+00, float 0.000000e+00, !dbg !44 + %146 = fcmp oeq float %145, 0.000000e+00, !dbg !45 + %147 = tail call float @llvm.nvvm.div.full(float %89, float %145), !dbg !46 + %148 = select i1 %146, float 0.000000e+00, float %147, !dbg !47 + %149 = fmul float %144, %148, !dbg !48 + %150 = fadd float %112, %149, !dbg !49 + %151 = fadd float %104, %111, !dbg !50 + %152 = select i1 %12, float %151, float 0.000000e+00, !dbg !50 + %153 = fmul float %144, %144, !dbg !51 + %154 = fmul float %153, %88, !dbg !52 + %155 = fmul float %154, %148, !dbg !53 + %156 = fadd float %152, %155, !dbg !54 + %157 = fsub float %134, %150, !dbg !41 + %158 = select i1 %12, float 6.000000e+00, float 0.000000e+00, !dbg !44 + %159 = fcmp oeq float %158, 0.000000e+00, !dbg !45 + %160 = tail call float @llvm.nvvm.div.full(float %90, float %158), !dbg !46 + %161 = select i1 %159, float 0.000000e+00, float %160, !dbg !47 + %162 = fmul float %161, %157, !dbg !48 + %163 = fadd float %150, %162, !dbg !49 + %164 = fadd float %136, %156, !dbg !50 + %165 = fmul float %157, %157, !dbg !51 + %166 = fmul float %145, %165, !dbg !52 + %167 = fmul float %161, %166, !dbg !53 + %168 = fadd float %164, %167, !dbg !54 + %169 = fsub float %135, %163, !dbg !41 + %170 = select i1 %12, float 8.000000e+00, float 0.000000e+00, !dbg !44 + %171 = fcmp oeq float %170, 0.000000e+00, !dbg !45 + %172 = tail call float @llvm.nvvm.div.full(float %91, float %170), !dbg !46 + %173 = select i1 %171, float 0.000000e+00, float %172, !dbg !47 + %174 = fmul float %173, %169, !dbg !48 + %175 = fadd float %163, %174, !dbg !49 + %176 = fadd float %137, %168, !dbg !50 + %177 = fmul float %169, %169, !dbg !51 + %178 = fmul float %158, %177, !dbg !52 + %179 = fmul float %173, %178, !dbg !53 + %180 = fadd float %176, %179, !dbg !54 + %181 = bitcast float %175 to i32, !dbg !42 + %182 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %181, i32 16, i32 31), !dbg !42 + %183 = bitcast i32 %182 to float, !dbg !42 + %184 = bitcast float %180 to i32, !dbg !42 + %185 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %184, i32 16, i32 31), !dbg !42 + %186 = bitcast i32 %185 to float, !dbg !42 + %187 = bitcast float %170 to i32, !dbg !42 + %188 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %187, i32 16, i32 31), !dbg !42 + %189 = bitcast i32 %188 to float, !dbg !42 + %190 = fsub float %183, %175, !dbg !41 + %191 = fadd float %170, %189, !dbg !44 + %192 = fcmp oeq float %191, 0.000000e+00, !dbg !45 + %193 = tail call float @llvm.nvvm.div.full(float %189, float %191), !dbg !46 + %194 = select i1 %192, float 0.000000e+00, float %193, !dbg !47 + %195 = fmul float %194, %190, !dbg !48 + %196 = fadd float %175, %195, !dbg !49 + %197 = fadd float %180, %186, !dbg !50 + %198 = fmul float %190, %190, !dbg !51 + %199 = fmul float %170, %198, !dbg !52 + %200 = fmul float %194, %199, !dbg !53 + %201 = fadd float %197, %200, !dbg !54 + %202 = bitcast float %196 to i32, !dbg !42 + %203 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %202, i32 8, i32 31), !dbg !42 + %204 = bitcast i32 %203 to float, !dbg !42 + %205 = bitcast float %201 to i32, !dbg !42 + %206 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %205, i32 8, i32 31), !dbg !42 + %207 = bitcast i32 %206 to float, !dbg !42 + %208 = bitcast float %191 to i32, !dbg !42 + %209 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %208, i32 8, i32 31), !dbg !42 + %210 = bitcast i32 %209 to float, !dbg !42 + %211 = fsub float %204, %196, !dbg !41 + %212 = fadd float %191, %210, !dbg !44 + %213 = fcmp oeq float %212, 0.000000e+00, !dbg !45 + %214 = tail call float @llvm.nvvm.div.full(float %210, float %212), !dbg !46 + %215 = select i1 %213, float 0.000000e+00, float %214, !dbg !47 + %216 = fmul float %211, %215, !dbg !48 + %217 = fadd float %196, %216, !dbg !49 + %218 = fadd float %201, %207, !dbg !50 + %219 = fmul float %211, %211, !dbg !51 + %220 = fmul float %191, %219, !dbg !52 + %221 = fmul float %215, %220, !dbg !53 + %222 = fadd float %218, %221, !dbg !54 + %223 = bitcast float %217 to i32, !dbg !42 + %224 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %223, i32 4, i32 31), !dbg !42 + %225 = bitcast i32 %224 to float, !dbg !42 + %226 = bitcast float %222 to i32, !dbg !42 + %227 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %226, i32 4, i32 31), !dbg !42 + %228 = bitcast i32 %227 to float, !dbg !42 + %229 = bitcast float %212 to i32, !dbg !42 + %230 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %229, i32 4, i32 31), !dbg !42 + %231 = bitcast i32 %230 to float, !dbg !42 + %232 = fsub float %225, %217, !dbg !41 + %233 = fadd float %212, %231, !dbg !44 + %234 = fcmp oeq float %233, 0.000000e+00, !dbg !45 + %235 = tail call float @llvm.nvvm.div.full(float %231, float %233), !dbg !46 + %236 = select i1 %234, float 0.000000e+00, float %235, !dbg !47 + %237 = fmul float %232, %236, !dbg !48 + %238 = fadd float %217, %237, !dbg !49 + %239 = fadd float %222, %228, !dbg !50 + %240 = fmul float %232, %232, !dbg !51 + %241 = fmul float %212, %240, !dbg !52 + %242 = fmul float %236, %241, !dbg !53 + %243 = fadd float %239, %242, !dbg !54 + %244 = bitcast float %238 to i32, !dbg !42 + %245 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %244, i32 2, i32 31), !dbg !42 + %246 = bitcast i32 %245 to float, !dbg !42 + %247 = bitcast float %243 to i32, !dbg !42 + %248 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %247, i32 2, i32 31), !dbg !42 + %249 = bitcast i32 %248 to float, !dbg !42 + %250 = bitcast float %233 to i32, !dbg !42 + %251 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %250, i32 2, i32 31), !dbg !42 + %252 = bitcast i32 %251 to float, !dbg !42 + %253 = fsub float %246, %238, !dbg !41 + %254 = fadd float %233, %252, !dbg !44 + %255 = fcmp oeq float %254, 0.000000e+00, !dbg !45 + %256 = tail call float @llvm.nvvm.div.full(float %252, float %254), !dbg !46 + %257 = select i1 %255, float 0.000000e+00, float %256, !dbg !47 + %258 = fmul float %253, %257, !dbg !48 + %259 = fadd float %238, %258, !dbg !49 + %260 = fadd float %243, %249, !dbg !50 + %261 = fmul float %253, %253, !dbg !51 + %262 = fmul float %233, %261, !dbg !52 + %263 = fmul float %257, %262, !dbg !53 + %264 = fadd float %260, %263, !dbg !54 + %265 = bitcast float %259 to i32, !dbg !42 + %266 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %265, i32 1, i32 31), !dbg !42 + %267 = bitcast i32 %266 to float, !dbg !42 + %268 = bitcast float %264 to i32, !dbg !42 + %269 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %268, i32 1, i32 31), !dbg !42 + %270 = bitcast i32 %269 to float, !dbg !42 + %271 = bitcast float %254 to i32, !dbg !42 + %272 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %271, i32 1, i32 31), !dbg !42 + %273 = bitcast i32 %272 to float, !dbg !42 + %274 = fsub float %267, %259, !dbg !41 + %275 = fadd float %254, %273, !dbg !44 + %276 = fcmp oeq float %275, 0.000000e+00, !dbg !45 + %277 = tail call float @llvm.nvvm.div.full(float %273, float %275), !dbg !46 + %278 = select i1 %276, float 0.000000e+00, float %277, !dbg !47 + %279 = fmul float %274, %278, !dbg !48 + %280 = fadd float %259, %279, !dbg !49 + %281 = fadd float %264, %270, !dbg !50 + %282 = fmul float %274, %274, !dbg !51 + %283 = fmul float %254, %282, !dbg !52 + %284 = fmul float %278, %283, !dbg !53 + %285 = fadd float %281, %284, !dbg !54 + %286 = icmp eq i32 %142, 0, !dbg !42 + %287 = getelementptr float, ptr addrspace(3) @global_smem, i32 %143, !dbg !42 + %288 = bitcast float %280 to <1 x i32>, !dbg !42 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %287, <1 x i32> %288, i1 %286) #6, !dbg !42 + %289 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %143, !dbg !42 + %290 = bitcast float %285 to <1 x i32>, !dbg !42 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %289, <1 x i32> %290, i1 %286) #6, !dbg !42 + %291 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %143, !dbg !42 + %292 = bitcast float %275 to <1 x i32>, !dbg !42 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %291, <1 x i32> %292, i1 %286) #6, !dbg !42 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !42 + %293 = icmp samesign ult i32 %141, 16, !dbg !42 + %294 = getelementptr float, ptr addrspace(3) @global_smem, i32 %141, !dbg !42 + %295 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %294, i1 %293) #6, !dbg !42 + %296 = bitcast i32 %295 to float, !dbg !42 + %297 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %141, !dbg !42 + %298 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %297, i1 %293) #6, !dbg !42 + %299 = bitcast i32 %298 to float, !dbg !42 + %300 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %141, !dbg !42 + %301 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %300, i1 %293) #6, !dbg !42 + %302 = bitcast i32 %301 to float, !dbg !42 + %303 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %295, i32 8, i32 31), !dbg !42 + %304 = bitcast i32 %303 to float, !dbg !42 + %305 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %298, i32 8, i32 31), !dbg !42 + %306 = bitcast i32 %305 to float, !dbg !42 + %307 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %301, i32 8, i32 31), !dbg !42 + %308 = bitcast i32 %307 to float, !dbg !42 + %309 = fsub float %304, %296, !dbg !41 + %310 = fadd float %302, %308, !dbg !44 + %311 = fcmp oeq float %310, 0.000000e+00, !dbg !45 + %312 = tail call float @llvm.nvvm.div.full(float %308, float %310), !dbg !46 + %313 = select i1 %311, float 0.000000e+00, float %312, !dbg !47 + %314 = fmul float %309, %313, !dbg !48 + %315 = fadd float %314, %296, !dbg !49 + %316 = fadd float %299, %306, !dbg !50 + %317 = fmul float %309, %309, !dbg !51 + %318 = fmul float %317, %302, !dbg !52 + %319 = fmul float %318, %313, !dbg !53 + %320 = fadd float %316, %319, !dbg !54 + %321 = bitcast float %315 to i32, !dbg !42 + %322 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %321, i32 4, i32 31), !dbg !42 + %323 = bitcast i32 %322 to float, !dbg !42 + %324 = bitcast float %320 to i32, !dbg !42 + %325 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %324, i32 4, i32 31), !dbg !42 + %326 = bitcast i32 %325 to float, !dbg !42 + %327 = bitcast float %310 to i32, !dbg !42 + %328 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %327, i32 4, i32 31), !dbg !42 + %329 = bitcast i32 %328 to float, !dbg !42 + %330 = fsub float %323, %315, !dbg !41 + %331 = fadd float %310, %329, !dbg !44 + %332 = fcmp oeq float %331, 0.000000e+00, !dbg !45 + %333 = tail call float @llvm.nvvm.div.full(float %329, float %331), !dbg !46 + %334 = select i1 %332, float 0.000000e+00, float %333, !dbg !47 + %335 = fmul float %330, %334, !dbg !48 + %336 = fadd float %315, %335, !dbg !49 + %337 = fadd float %320, %326, !dbg !50 + %338 = fmul float %330, %330, !dbg !51 + %339 = fmul float %310, %338, !dbg !52 + %340 = fmul float %334, %339, !dbg !53 + %341 = fadd float %337, %340, !dbg !54 + %342 = bitcast float %336 to i32, !dbg !42 + %343 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %342, i32 2, i32 31), !dbg !42 + %344 = bitcast i32 %343 to float, !dbg !42 + %345 = bitcast float %341 to i32, !dbg !42 + %346 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %345, i32 2, i32 31), !dbg !42 + %347 = bitcast i32 %346 to float, !dbg !42 + %348 = bitcast float %331 to i32, !dbg !42 + %349 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %348, i32 2, i32 31), !dbg !42 + %350 = bitcast i32 %349 to float, !dbg !42 + %351 = fsub float %344, %336, !dbg !41 + %352 = fadd float %331, %350, !dbg !44 + %353 = fcmp oeq float %352, 0.000000e+00, !dbg !45 + %354 = tail call float @llvm.nvvm.div.full(float %350, float %352), !dbg !46 + %355 = select i1 %353, float 0.000000e+00, float %354, !dbg !47 + %356 = fmul float %351, %355, !dbg !48 + %357 = fadd float %336, %356, !dbg !49 + %358 = fadd float %341, %347, !dbg !50 + %359 = fmul float %351, %351, !dbg !51 + %360 = fmul float %331, %359, !dbg !52 + %361 = fmul float %355, %360, !dbg !53 + %362 = fadd float %358, %361, !dbg !54 + %363 = bitcast float %357 to i32, !dbg !42 + %364 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %363, i32 1, i32 31), !dbg !42 + %365 = bitcast i32 %364 to float, !dbg !42 + %366 = bitcast float %362 to i32, !dbg !42 + %367 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %366, i32 1, i32 31), !dbg !42 + %368 = bitcast i32 %367 to float, !dbg !42 + %369 = bitcast float %352 to i32, !dbg !42 + %370 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %369, i32 1, i32 31), !dbg !42 + %371 = bitcast i32 %370 to float, !dbg !42 + %372 = fsub float %365, %357, !dbg !41 + %373 = fadd float %352, %371, !dbg !44 + %374 = fcmp oeq float %373, 0.000000e+00, !dbg !45 + %375 = tail call float @llvm.nvvm.div.full(float %371, float %373), !dbg !46 + %376 = select i1 %374, float 0.000000e+00, float %375, !dbg !47 + %377 = fmul float %372, %376, !dbg !48 + %378 = fadd float %357, %377, !dbg !49 + %379 = fadd float %362, %368, !dbg !50 + %380 = fmul float %372, %372, !dbg !51 + %381 = fmul float %352, %380, !dbg !52 + %382 = fmul float %376, %381, !dbg !53 + %383 = fadd float %379, %382, !dbg !54 + %384 = and i32 %13, 15, !dbg !42 + %385 = icmp eq i32 %384, 0, !dbg !42 + %386 = and i1 %293, %385, !dbg !42 + %387 = bitcast float %378 to <1 x i32>, !dbg !42 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %294, <1 x i32> %387, i1 %386) #6, !dbg !42 + %388 = bitcast float %383 to <1 x i32>, !dbg !42 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %297, <1 x i32> %388, i1 %386) #6, !dbg !42 + %389 = bitcast float %373 to <1 x i32>, !dbg !42 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %300, <1 x i32> %389, i1 %386) #6, !dbg !42 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !42 + %390 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !42 + %391 = load float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !42 + %392 = tail call float @llvm.nvvm.div.full(float %391, float 4.096000e+03), !dbg !55 + %393 = fadd float %392, 0x3EB0C6F7A0000000, !dbg !56 + %394 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !57 + %395 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !57 + %396 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !57 + %397 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !57 + %.not.i15 = icmp eq i32 %397, 0, !dbg !57 + br i1 %.not.i15, label %400, label %398, !dbg !57 + +398: ; preds = %__nv_rsqrtf.exit + %399 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %393), !dbg !57 + br label %__nv_rsqrtf.exit17, !dbg !57 + +400: ; preds = %__nv_rsqrtf.exit + %401 = tail call float @llvm.nvvm.rsqrt.approx.f(float %393), !dbg !57 + br label %__nv_rsqrtf.exit17, !dbg !57 + +__nv_rsqrtf.exit17: ; preds = %398, %400 + %.0.i16 = phi float [ %399, %398 ], [ %401, %400 ], !dbg !57 + %402 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !58 + %403 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %41, i64 %402, i1 %12) #6, !dbg !58 + %404 = extractvalue { i32, i32 } %403, 0, !dbg !58 + %405 = bitcast i32 %404 to <2 x bfloat>, !dbg !58 + %406 = extractvalue { i32, i32 } %403, 1, !dbg !58 + %407 = bitcast i32 %406 to <2 x bfloat>, !dbg !58 + %408 = getelementptr bfloat, ptr addrspace(1) %3, i64 %17, !dbg !59 + %409 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !60 + %410 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %408, i64 %409, i1 true) #6, !dbg !60 + %411 = extractvalue { i32, i32 } %410, 0, !dbg !60 + %412 = bitcast i32 %411 to <2 x bfloat>, !dbg !60 + %413 = extractvalue { i32, i32 } %410, 1, !dbg !60 + %414 = bitcast i32 %413 to <2 x bfloat>, !dbg !60 + %415 = getelementptr bfloat, ptr addrspace(1) %4, i64 %17, !dbg !61 + %416 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !62 + %417 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %415, i64 %416, i1 true) #6, !dbg !62 + %418 = extractvalue { i32, i32 } %417, 0, !dbg !62 + %419 = bitcast i32 %418 to <2 x bfloat>, !dbg !62 + %420 = extractvalue { i32, i32 } %417, 1, !dbg !62 + %421 = bitcast i32 %420 to <2 x bfloat>, !dbg !62 + %422 = getelementptr bfloat, ptr addrspace(1) %6, i64 %19, !dbg !63 + %423 = fpext <2 x bfloat> %405 to <2 x float>, !dbg !64 + %424 = fpext <2 x bfloat> %412 to <2 x float>, !dbg !65 + %425 = fpext <2 x bfloat> %419 to <2 x float>, !dbg !66 + %426 = insertelement <2 x float> poison, float %390, i64 0, !dbg !67 + %427 = shufflevector <2 x float> %426, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !67 + %428 = fsub <2 x float> %423, %427, !dbg !67 + %429 = insertelement <2 x float> poison, float %.0.i16, i64 0, !dbg !68 + %430 = shufflevector <2 x float> %429, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !68 + %431 = fmul <2 x float> %430, %428, !dbg !68 + %432 = fadd <2 x float> %424, splat (float 1.000000e+00), !dbg !69 + %433 = fmul <2 x float> %431, %432, !dbg !70 + %434 = fadd <2 x float> %433, %425, !dbg !71 + %435 = fptrunc <2 x float> %434 to <2 x bfloat>, !dbg !72 + %436 = fpext <2 x bfloat> %407 to <2 x float>, !dbg !64 + %437 = fpext <2 x bfloat> %414 to <2 x float>, !dbg !65 + %438 = fpext <2 x bfloat> %421 to <2 x float>, !dbg !66 + %439 = fsub <2 x float> %436, %427, !dbg !67 + %440 = fmul <2 x float> %430, %439, !dbg !68 + %441 = fadd <2 x float> %437, splat (float 1.000000e+00), !dbg !69 + %442 = fmul <2 x float> %440, %441, !dbg !70 + %443 = fadd <2 x float> %442, %438, !dbg !71 + %444 = fptrunc <2 x float> %443 to <2 x bfloat>, !dbg !72 + %445 = bitcast <2 x bfloat> %435 to i32, !dbg !72 + %446 = bitcast <2 x bfloat> %444 to i32, !dbg !72 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %445, i32 %446, ptr addrspace(1) %422, i1 %12) #6, !dbg !72 + %447 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !58 + %448 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %92, i64 %447, i1 %12) #6, !dbg !58 + %449 = extractvalue { i32, i32 } %448, 0, !dbg !58 + %450 = bitcast i32 %449 to <2 x bfloat>, !dbg !58 + %451 = extractvalue { i32, i32 } %448, 1, !dbg !58 + %452 = bitcast i32 %451 to <2 x bfloat>, !dbg !58 + %453 = getelementptr bfloat, ptr addrspace(1) %3, i64 %64, !dbg !59 + %454 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !60 + %455 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %453, i64 %454, i1 true) #6, !dbg !60 + %456 = extractvalue { i32, i32 } %455, 0, !dbg !60 + %457 = bitcast i32 %456 to <2 x bfloat>, !dbg !60 + %458 = extractvalue { i32, i32 } %455, 1, !dbg !60 + %459 = bitcast i32 %458 to <2 x bfloat>, !dbg !60 + %460 = getelementptr bfloat, ptr addrspace(1) %4, i64 %64, !dbg !61 + %461 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !62 + %462 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %460, i64 %461, i1 true) #6, !dbg !62 + %463 = extractvalue { i32, i32 } %462, 0, !dbg !62 + %464 = bitcast i32 %463 to <2 x bfloat>, !dbg !62 + %465 = extractvalue { i32, i32 } %462, 1, !dbg !62 + %466 = bitcast i32 %465 to <2 x bfloat>, !dbg !62 + %467 = getelementptr bfloat, ptr addrspace(1) %6, i64 %65, !dbg !63 + %468 = fpext <2 x bfloat> %450 to <2 x float>, !dbg !64 + %469 = fpext <2 x bfloat> %457 to <2 x float>, !dbg !65 + %470 = fpext <2 x bfloat> %464 to <2 x float>, !dbg !66 + %471 = fsub <2 x float> %468, %427, !dbg !67 + %472 = fmul <2 x float> %430, %471, !dbg !68 + %473 = fadd <2 x float> %469, splat (float 1.000000e+00), !dbg !69 + %474 = fmul <2 x float> %472, %473, !dbg !70 + %475 = fadd <2 x float> %474, %470, !dbg !71 + %476 = fptrunc <2 x float> %475 to <2 x bfloat>, !dbg !72 + %477 = fpext <2 x bfloat> %452 to <2 x float>, !dbg !64 + %478 = fpext <2 x bfloat> %459 to <2 x float>, !dbg !65 + %479 = fpext <2 x bfloat> %466 to <2 x float>, !dbg !66 + %480 = fsub <2 x float> %477, %427, !dbg !67 + %481 = fmul <2 x float> %430, %480, !dbg !68 + %482 = fadd <2 x float> %478, splat (float 1.000000e+00), !dbg !69 + %483 = fmul <2 x float> %481, %482, !dbg !70 + %484 = fadd <2 x float> %483, %479, !dbg !71 + %485 = fptrunc <2 x float> %484 to <2 x bfloat>, !dbg !72 + %486 = bitcast <2 x bfloat> %476 to i32, !dbg !72 + %487 = bitcast <2 x bfloat> %485 to i32, !dbg !72 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %486, i32 %487, ptr addrspace(1) %467, i1 %12) #6, !dbg !72 + ret void, !dbg !73 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused_add_mul_native_layer_norm_0", linkageName: "triton_red_fused_add_mul_native_layer_norm_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 25, column: 21, scope: !5) +!10 = !DILocation(line: 26, column: 37, scope: !5) +!11 = !DILocation(line: 38, column: 46, scope: !5) +!12 = !DILocation(line: 32, column: 43, scope: !5) +!13 = !DILocation(line: 38, column: 41, scope: !5) +!14 = !DILocation(line: 38, column: 34, scope: !5) +!15 = !DILocation(line: 38, column: 51, scope: !5) +!16 = !DILocation(line: 39, column: 34, scope: !5) +!17 = !DILocation(line: 39, column: 41, scope: !5) +!18 = !DILocation(line: 40, column: 34, scope: !5) +!19 = !DILocation(line: 40, column: 51, scope: !5) +!20 = !DILocation(line: 51, column: 29, scope: !5) +!21 = !DILocation(line: 39, column: 94, scope: !5) +!22 = !DILocation(line: 40, column: 113, scope: !5) +!23 = !DILocation(line: 41, column: 22, scope: !5) +!24 = !DILocation(line: 38, column: 113, scope: !5) +!25 = !DILocation(line: 42, column: 22, scope: !5) +!26 = !DILocation(line: 48, column: 62, scope: !5) +!27 = !DILocation(line: 51, column: 52, scope: !5) +!28 = !DILocation(line: 33, column: 31, scope: !5) +!29 = !DILocation(line: 50, column: 66, scope: !5) +!30 = !DILocation(line: 225, column: 39, scope: !31, inlinedAt: !33) +!31 = distinct !DILexicalBlockFile(scope: !5, file: !32, discriminator: 0) +!32 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime") +!33 = !DILocation(line: 46, column: 51, scope: !34) +!34 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!35 = !DILocation(line: 222, column: 24, scope: !31, inlinedAt: !33) +!36 = !DILocation(line: 224, column: 34, scope: !31, inlinedAt: !33) +!37 = !DILocation(line: 224, column: 26, scope: !31, inlinedAt: !33) +!38 = !DILocation(line: 225, column: 31, scope: !31, inlinedAt: !33) +!39 = !DILocation(line: 225, column: 22, scope: !31, inlinedAt: !33) +!40 = !DILocation(line: 49, column: 58, scope: !5) +!41 = !DILocation(line: 231, column: 21, scope: !31, inlinedAt: !42) +!42 = !DILocation(line: 243, column: 46, scope: !31, inlinedAt: !43) +!43 = !DILocation(line: 52, column: 80, scope: !34) +!44 = !DILocation(line: 232, column: 28, scope: !31, inlinedAt: !42) +!45 = !DILocation(line: 233, column: 39, scope: !31, inlinedAt: !42) +!46 = !DILocation(line: 233, column: 60, scope: !31, inlinedAt: !42) +!47 = !DILocation(line: 233, column: 49, scope: !31, inlinedAt: !42) +!48 = !DILocation(line: 235, column: 25, scope: !31, inlinedAt: !42) +!49 = !DILocation(line: 235, column: 17, scope: !31, inlinedAt: !42) +!50 = !DILocation(line: 236, column: 15, scope: !31, inlinedAt: !42) +!51 = !DILocation(line: 236, column: 30, scope: !31, inlinedAt: !42) +!52 = !DILocation(line: 236, column: 38, scope: !31, inlinedAt: !42) +!53 = !DILocation(line: 236, column: 49, scope: !31, inlinedAt: !42) +!54 = !DILocation(line: 236, column: 22, scope: !31, inlinedAt: !42) +!55 = !DILocation(line: 68, column: 25, scope: !5) +!56 = !DILocation(line: 70, column: 24, scope: !5) +!57 = !DILocation(line: 71, column: 32, scope: !5) +!58 = !DILocation(line: 62, column: 53, scope: !5) +!59 = !DILocation(line: 63, column: 35, scope: !5) +!60 = !DILocation(line: 63, column: 42, scope: !5) +!61 = !DILocation(line: 64, column: 35, scope: !5) +!62 = !DILocation(line: 64, column: 42, scope: !5) +!63 = !DILocation(line: 78, column: 29, scope: !5) +!64 = !DILocation(line: 62, column: 115, scope: !5) +!65 = !DILocation(line: 63, column: 95, scope: !5) +!66 = !DILocation(line: 64, column: 95, scope: !5) +!67 = !DILocation(line: 66, column: 24, scope: !5) +!68 = !DILocation(line: 72, column: 24, scope: !5) +!69 = !DILocation(line: 75, column: 24, scope: !5) +!70 = !DILocation(line: 76, column: 24, scope: !5) +!71 = !DILocation(line: 77, column: 24, scope: !5) +!72 = !DILocation(line: 78, column: 53, scope: !5) +!73 = !DILocation(line: 56, column: 4, scope: !5) diff --git a/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.ptx b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..874348e55c4cecf9b2dd0a4905baaf3c27563c33 --- /dev/null +++ b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.ptx @@ -0,0 +1,1191 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused_add_mul_native_layer_norm_0 // -- Begin function triton_red_fused_add_mul_native_layer_norm_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused_add_mul_native_layer_norm_0 +.visible .entry triton_red_fused_add_mul_native_layer_norm_0( + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_6, + .param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_7, + .param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_8, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_9, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_10 +) +.reqntid 512 +{ + .reg .pred %p<19>; + .reg .b16 %rs<49>; + .reg .b32 %r<317>; + .reg .b64 %rd<39>; + .loc 1 18 0 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:18:0 +$L__func_begin0: + .loc 1 18 0 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:18:0 + +// %bb.0: // %__nv_rsqrtf.exit + ld.param.b64 %rd27, [triton_red_fused_add_mul_native_layer_norm_0_param_0]; + ld.param.b64 %rd28, [triton_red_fused_add_mul_native_layer_norm_0_param_1]; +$L__tmp0: + .loc 1 23 28 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:23:28 + mov.u32 %r49, %ctaid.x; + .loc 1 25 21 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:25:21 + setp.lt.u32 %p1, %r49, 256; + ld.param.b64 %rd29, [triton_red_fused_add_mul_native_layer_norm_0_param_2]; + ld.param.b64 %rd30, [triton_red_fused_add_mul_native_layer_norm_0_param_3]; + .loc 1 26 37 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:26:37 + mov.u32 %r50, %tid.x; + shl.b32 %r51, %r50, 2; + ld.param.b64 %rd31, [triton_red_fused_add_mul_native_layer_norm_0_param_4]; + and.b32 %r52, %r51, 2044; + ld.param.b64 %rd32, [triton_red_fused_add_mul_native_layer_norm_0_param_5]; + .loc 1 38 46 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:46 + shl.b32 %r53, %r49, 12; + ld.param.b64 %rd33, [triton_red_fused_add_mul_native_layer_norm_0_param_6]; + .loc 1 32 43 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:32:43 + cvt.u64.u32 %rd34, %r52; + cvt.s64.s32 %rd35, %r53; + .loc 1 38 41 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:41 + or.b64 %rd36, %rd34, %rd35; + .loc 1 38 34 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:34 + shl.b64 %rd37, %rd36, 1; + add.s64 %rd1, %rd27, %rd37; + .loc 1 38 51 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:51 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r3, 0; + // begin inline asm + mov.u32 %r1, %r3; + mov.u32 %r2, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2; + // end inline asm + .loc 1 39 34 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:34 + mul.wide.u32 %rd38, %r52, 2; + add.s64 %rd3, %rd28, %rd38; + .loc 1 39 41 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:41 + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0; + // end inline asm + mov.pred %p2, -1; + // begin inline asm + mov.u32 %r4, %r3; + mov.u32 %r5, %r3; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r4, %r5 }, [ %rd3 + 0 ], %rd4; + // end inline asm + .loc 1 40 34 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:34 + add.s64 %rd5, %rd29, %rd37; + .loc 1 40 51 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:51 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r6, %r3; + mov.u32 %r7, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r6, %r7 }, [ %rd5 + 0 ], %rd6; + // end inline asm + .loc 1 51 29 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:51:29 + add.s64 %rd7, %rd32, %rd37; + .loc 1 39 94 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:94 + mov.b32 {%rs1, %rs2}, %r4; + cvt.f32.bf16 %r54, %rs1; + cvt.f32.bf16 %r55, %rs2; + .loc 1 40 113 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:113 + mov.b32 {%rs3, %rs4}, %r6; + cvt.f32.bf16 %r56, %rs3; + cvt.f32.bf16 %r57, %rs4; + .loc 1 38 113 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:113 + mov.b32 {%rs5, %rs6}, %r1; + cvt.f32.bf16 %r58, %rs5; + cvt.f32.bf16 %r59, %rs6; + .loc 1 42 22 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:42:22 + fma.rn.f32 %r60, %r55, %r57, %r59; + fma.rn.f32 %r61, %r54, %r56, %r58; + .loc 1 48 62 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:48:62 + selp.f32 %r62, %r61, 0f00000000, %p1; + selp.f32 %r63, %r60, 0f00000000, %p1; + .loc 1 51 52 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:51:52 + cvt.rn.bf16x2.f32 %r8, %r60, %r61; + .loc 1 39 94 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:94 + mov.b32 {%rs7, %rs8}, %r5; + cvt.f32.bf16 %r64, %rs7; + cvt.f32.bf16 %r65, %rs8; + .loc 1 40 113 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:113 + mov.b32 {%rs9, %rs10}, %r7; + cvt.f32.bf16 %r66, %rs9; + cvt.f32.bf16 %r67, %rs10; + .loc 1 38 113 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:113 + mov.b32 {%rs11, %rs12}, %r2; + cvt.f32.bf16 %r68, %rs11; + cvt.f32.bf16 %r69, %rs12; + .loc 1 42 22 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:42:22 + fma.rn.f32 %r70, %r65, %r67, %r69; + fma.rn.f32 %r71, %r64, %r66, %r68; + .loc 1 48 62 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:48:62 + selp.f32 %r72, %r71, 0f00000000, %p1; + selp.f32 %r73, %r70, 0f00000000, %p1; + .loc 1 51 52 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:51:52 + cvt.rn.bf16x2.f32 %r9, %r70, %r71; + // begin inline asm + @%p1 st.global.v2.b32 [ %rd7 + 0 ], { %r8, %r9 }; + // end inline asm + .loc 1 38 34 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:34 + add.s64 %rd8, %rd1, 4096; + .loc 1 38 51 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:51 + // begin inline asm + mov.u64 %rd9, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd9, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r10, %r3; + mov.u32 %r11, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r10, %r11 }, [ %rd8 + 0 ], %rd9; + // end inline asm + .loc 1 39 34 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:34 + add.s64 %rd10, %rd3, 4096; + .loc 1 39 41 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:41 + // begin inline asm + mov.u64 %rd11, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd11, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r12, %r3; + mov.u32 %r13, %r3; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r12, %r13 }, [ %rd10 + 0 ], %rd11; + // end inline asm + .loc 1 40 34 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:34 + add.s64 %rd12, %rd5, 4096; + .loc 1 40 51 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:51 + // begin inline asm + mov.u64 %rd13, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd13, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r14, %r3; + mov.u32 %r15, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r14, %r15 }, [ %rd12 + 0 ], %rd13; + // end inline asm + .loc 1 50 66 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:50:66 + selp.f32 %r74, 0f40000000, 0f3F800000, %p1; + selp.f32 %r75, 0f40000000, 0f00000000, %p1; + .loc 1 51 29 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:51:29 + add.s64 %rd14, %rd7, 4096; + .loc 1 38 113 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:113 + mov.b32 {%rs13, %rs14}, %r10; + cvt.f32.bf16 %r76, %rs13; + cvt.f32.bf16 %r77, %rs14; + .loc 1 39 94 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:94 + mov.b32 {%rs15, %rs16}, %r12; + cvt.f32.bf16 %r78, %rs15; + cvt.f32.bf16 %r79, %rs16; + .loc 1 40 113 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:113 + mov.b32 {%rs17, %rs18}, %r14; + cvt.f32.bf16 %r80, %rs17; + cvt.f32.bf16 %r81, %rs18; + .loc 1 42 22 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:42:22 + fma.rn.f32 %r82, %r79, %r81, %r77; + fma.rn.f32 %r83, %r78, %r80, %r76; +$L__tmp1: + .loc 2 222 24 // triton_helpers.py:222:24 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ] + sub.f32 %r84, %r83, %r62; + .loc 2 224 34 // triton_helpers.py:224:34 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ] + div.full.f32 %r85, %r84, %r74; + .loc 2 224 26 // triton_helpers.py:224:26 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ] + add.f32 %r86, %r62, %r85; + .loc 2 225 39 // triton_helpers.py:225:39 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ] + sub.f32 %r87, %r83, %r86; + .loc 2 225 22 // triton_helpers.py:225:22 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ] + fma.rn.f32 %r88, %r84, %r87, 0f00000000; + .loc 2 222 24 // triton_helpers.py:222:24 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ] + sub.f32 %r89, %r82, %r63; + .loc 2 224 34 // triton_helpers.py:224:34 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ] + div.full.f32 %r90, %r89, %r74; + .loc 2 224 26 // triton_helpers.py:224:26 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ] + add.f32 %r91, %r63, %r90; + .loc 2 225 39 // triton_helpers.py:225:39 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ] + sub.f32 %r92, %r82, %r91; + .loc 2 225 22 // triton_helpers.py:225:22 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ] + fma.rn.f32 %r93, %r89, %r92, 0f00000000; +$L__tmp2: + .loc 1 48 62 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:48:62 + selp.f32 %r94, %r86, 0f00000000, %p1; + selp.f32 %r95, %r91, 0f00000000, %p1; + .loc 1 51 52 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:51:52 + cvt.rn.bf16x2.f32 %r16, %r82, %r83; + .loc 1 38 113 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:113 + mov.b32 {%rs19, %rs20}, %r11; + cvt.f32.bf16 %r96, %rs19; + cvt.f32.bf16 %r97, %rs20; + .loc 1 39 94 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:94 + mov.b32 {%rs21, %rs22}, %r13; + cvt.f32.bf16 %r98, %rs21; + cvt.f32.bf16 %r99, %rs22; + .loc 1 40 113 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:113 + mov.b32 {%rs23, %rs24}, %r15; + cvt.f32.bf16 %r100, %rs23; + cvt.f32.bf16 %r101, %rs24; + .loc 1 42 22 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:42:22 + fma.rn.f32 %r102, %r99, %r101, %r97; + fma.rn.f32 %r103, %r98, %r100, %r96; +$L__tmp3: + .loc 2 222 24 // triton_helpers.py:222:24 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ] + sub.f32 %r104, %r103, %r72; + .loc 2 224 34 // triton_helpers.py:224:34 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ] + div.full.f32 %r105, %r104, %r74; + .loc 2 224 26 // triton_helpers.py:224:26 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ] + add.f32 %r106, %r72, %r105; + .loc 2 225 39 // triton_helpers.py:225:39 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ] + sub.f32 %r107, %r103, %r106; + .loc 2 225 22 // triton_helpers.py:225:22 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ] + fma.rn.f32 %r108, %r104, %r107, 0f00000000; + .loc 2 222 24 // triton_helpers.py:222:24 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ] + sub.f32 %r109, %r102, %r73; + .loc 2 224 34 // triton_helpers.py:224:34 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ] + div.full.f32 %r110, %r109, %r74; + .loc 2 224 26 // triton_helpers.py:224:26 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ] + add.f32 %r111, %r73, %r110; + .loc 2 225 39 // triton_helpers.py:225:39 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ] + sub.f32 %r112, %r102, %r111; + .loc 2 225 22 // triton_helpers.py:225:22 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ] + fma.rn.f32 %r113, %r109, %r112, 0f00000000; +$L__tmp4: + .loc 1 48 62 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:48:62 + selp.f32 %r114, %r106, 0f00000000, %p1; + selp.f32 %r115, %r111, 0f00000000, %p1; + .loc 1 49 58 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:49:58 + selp.f32 %r116, %r108, 0f00000000, %p1; + selp.f32 %r117, %r113, 0f00000000, %p1; + .loc 1 51 52 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:51:52 + cvt.rn.bf16x2.f32 %r17, %r102, %r103; + // begin inline asm + @%p1 st.global.v2.b32 [ %rd14 + 0 ], { %r16, %r17 }; + // end inline asm + .loc 1 26 37 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:26:37 + and.b32 %r118, %r50, 511; + and.b32 %r119, %r50, 31; +$L__tmp5: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r120, %r95, %r94; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r121, 0f40800000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p6, %r121, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r122, %r75, %r121; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r123, 0f00000000, %r122, %p6; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r124, %r120, %r123, %r94; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r125, %r88, %r93; + selp.f32 %r126, %r125, 0f00000000, %p1; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r127, %r120, %r120; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r128, %r127, %r75; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r129, %r128, %r123, %r126; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r130, %r114, %r124; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r131, 0f40C00000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p7, %r131, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r132, %r75, %r131; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r133, 0f00000000, %r132, %p7; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r134, %r133, %r130, %r124; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r135, %r116, %r129; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r136, %r130, %r130; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r137, %r121, %r136; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r138, %r133, %r137, %r135; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r139, %r115, %r134; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r140, 0f41000000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p8, %r140, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r141, %r75, %r140; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r142, 0f00000000, %r141, %p8; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r143, %r142, %r139, %r134; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r144, %r117, %r138; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r145, %r139, %r139; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r146, %r131, %r145; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r147, %r142, %r146, %r144; +$L__tmp6: + .loc 2 243 46 // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] + shfl.sync.bfly.b32 %r148, %r143, 16, 31, -1; + shfl.sync.bfly.b32 %r149, %r147, 16, 31, -1; + shfl.sync.bfly.b32 %r150, %r140, 16, 31, -1; +$L__tmp7: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r151, %r148, %r143; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r152, %r140, %r150; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p9, %r152, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r153, %r150, %r152; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r154, 0f00000000, %r153, %p9; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r155, %r154, %r151, %r143; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r156, %r147, %r149; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r157, %r151, %r151; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r158, %r140, %r157; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r159, %r154, %r158, %r156; +$L__tmp8: + .loc 2 243 46 // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] + shfl.sync.bfly.b32 %r160, %r155, 8, 31, -1; + shfl.sync.bfly.b32 %r161, %r159, 8, 31, -1; + shfl.sync.bfly.b32 %r162, %r152, 8, 31, -1; +$L__tmp9: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r163, %r160, %r155; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r164, %r152, %r162; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p10, %r164, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r165, %r162, %r164; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r166, 0f00000000, %r165, %p10; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r167, %r163, %r166, %r155; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r168, %r159, %r161; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r169, %r163, %r163; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r170, %r152, %r169; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r171, %r166, %r170, %r168; +$L__tmp10: + .loc 2 243 46 // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] + shfl.sync.bfly.b32 %r172, %r167, 4, 31, -1; + shfl.sync.bfly.b32 %r173, %r171, 4, 31, -1; + shfl.sync.bfly.b32 %r174, %r164, 4, 31, -1; +$L__tmp11: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r175, %r172, %r167; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r176, %r164, %r174; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p11, %r176, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r177, %r174, %r176; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r178, 0f00000000, %r177, %p11; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r179, %r175, %r178, %r167; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r180, %r171, %r173; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r181, %r175, %r175; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r182, %r164, %r181; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r183, %r178, %r182, %r180; +$L__tmp12: + .loc 2 243 46 // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] + shfl.sync.bfly.b32 %r184, %r179, 2, 31, -1; + shfl.sync.bfly.b32 %r185, %r183, 2, 31, -1; + shfl.sync.bfly.b32 %r186, %r176, 2, 31, -1; +$L__tmp13: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r187, %r184, %r179; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r188, %r176, %r186; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p12, %r188, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r189, %r186, %r188; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r190, 0f00000000, %r189, %p12; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r191, %r187, %r190, %r179; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r192, %r183, %r185; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r193, %r187, %r187; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r194, %r176, %r193; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r195, %r190, %r194, %r192; +$L__tmp14: + .loc 2 243 46 // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] + shfl.sync.bfly.b32 %r196, %r191, 1, 31, -1; + shfl.sync.bfly.b32 %r197, %r195, 1, 31, -1; + shfl.sync.bfly.b32 %r198, %r188, 1, 31, -1; +$L__tmp15: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r199, %r196, %r191; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r23, %r188, %r198; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p13, %r23, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r200, %r198, %r23; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r201, 0f00000000, %r200, %p13; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r19, %r199, %r201, %r191; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r202, %r195, %r197; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r203, %r199, %r199; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r204, %r188, %r203; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r21, %r201, %r204, %r202; +$L__tmp16: + .loc 2 243 46 // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] + setp.eq.b32 %p3, %r119, 0; + shr.u32 %r205, %r50, 3; + and.b32 %r206, %r205, 60; + mov.b32 %r207, global_smem; + add.s32 %r18, %r207, %r206; + // begin inline asm + @%p3 st.shared.b32 [ %r18 + 0 ], %r19; + // end inline asm + add.s32 %r20, %r18, 64; + // begin inline asm + @%p3 st.shared.b32 [ %r20 + 0 ], %r21; + // end inline asm + add.s32 %r22, %r18, 128; + // begin inline asm + @%p3 st.shared.b32 [ %r22 + 0 ], %r23; + // end inline asm + bar.sync 0; + setp.lt.u32 %p4, %r118, 16; + shl.b32 %r208, %r118, 2; + add.s32 %r25, %r207, %r208; + // begin inline asm + @%p4 ld.shared.b32 %r24, [ %r25 + 0 ]; + // end inline asm + add.s32 %r27, %r25, 64; + // begin inline asm + @%p4 ld.shared.b32 %r26, [ %r27 + 0 ]; + // end inline asm + add.s32 %r29, %r25, 128; + // begin inline asm + @%p4 ld.shared.b32 %r28, [ %r29 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r209, %r24, 8, 31, -1; + shfl.sync.bfly.b32 %r210, %r26, 8, 31, -1; + shfl.sync.bfly.b32 %r211, %r28, 8, 31, -1; +$L__tmp17: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r212, %r209, %r24; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r213, %r28, %r211; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p14, %r213, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r214, %r211, %r213; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r215, 0f00000000, %r214, %p14; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r216, %r212, %r215, %r24; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r217, %r26, %r210; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r218, %r212, %r212; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r219, %r218, %r28; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r220, %r219, %r215, %r217; +$L__tmp18: + .loc 2 243 46 // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] + shfl.sync.bfly.b32 %r221, %r216, 4, 31, -1; + shfl.sync.bfly.b32 %r222, %r220, 4, 31, -1; + shfl.sync.bfly.b32 %r223, %r213, 4, 31, -1; +$L__tmp19: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r224, %r221, %r216; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r225, %r213, %r223; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p15, %r225, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r226, %r223, %r225; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r227, 0f00000000, %r226, %p15; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r228, %r224, %r227, %r216; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r229, %r220, %r222; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r230, %r224, %r224; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r231, %r213, %r230; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r232, %r227, %r231, %r229; +$L__tmp20: + .loc 2 243 46 // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] + shfl.sync.bfly.b32 %r233, %r228, 2, 31, -1; + shfl.sync.bfly.b32 %r234, %r232, 2, 31, -1; + shfl.sync.bfly.b32 %r235, %r225, 2, 31, -1; +$L__tmp21: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r236, %r233, %r228; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r237, %r225, %r235; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p16, %r237, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r238, %r235, %r237; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r239, 0f00000000, %r238, %p16; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r240, %r236, %r239, %r228; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r241, %r232, %r234; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r242, %r236, %r236; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r243, %r225, %r242; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r244, %r239, %r243, %r241; +$L__tmp22: + .loc 2 243 46 // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] + shfl.sync.bfly.b32 %r245, %r240, 1, 31, -1; + shfl.sync.bfly.b32 %r246, %r244, 1, 31, -1; + shfl.sync.bfly.b32 %r247, %r237, 1, 31, -1; +$L__tmp23: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r248, %r245, %r240; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r32, %r237, %r247; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p17, %r32, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r249, %r247, %r32; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r250, 0f00000000, %r249, %p17; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r30, %r248, %r250, %r240; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r251, %r244, %r246; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r252, %r248, %r248; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r253, %r237, %r252; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r31, %r250, %r253, %r251; +$L__tmp24: + .loc 2 243 46 // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] + and.b32 %r254, %r50, 15; + setp.eq.b32 %p18, %r254, 0; + and.pred %p5, %p4, %p18; + // begin inline asm + @%p5 st.shared.b32 [ %r25 + 0 ], %r30; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r27 + 0 ], %r31; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r29 + 0 ], %r32; + // end inline asm + bar.sync 0; + ld.shared.b32 %r255, [global_smem]; + ld.shared.b32 %r256, [global_smem+64]; + mov.b32 %r257, 0f45800000; +$L__tmp25: + .loc 1 68 25 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:68:25 + div.full.f32 %r258, %r256, %r257; + .loc 1 70 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:70:24 + add.f32 %r259, %r258, 0f358637BD; + .loc 1 71 32 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:71:32 + rsqrt.approx.ftz.f32 %r260, %r259; + .loc 1 62 53 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:62:53 + // begin inline asm + mov.u64 %rd15, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd15, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r33, %r3; + mov.u32 %r34, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r33, %r34 }, [ %rd7 + 0 ], %rd15; + // end inline asm + .loc 1 63 35 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:35 + add.s64 %rd16, %rd30, %rd38; + .loc 1 63 42 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:42 + // begin inline asm + mov.u64 %rd17, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r35, %r3; + mov.u32 %r36, %r3; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r35, %r36 }, [ %rd16 + 0 ], %rd17; + // end inline asm + .loc 1 64 35 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:35 + add.s64 %rd18, %rd31, %rd38; + .loc 1 64 42 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:42 + // begin inline asm + mov.u64 %rd19, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd19, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r37, %r3; + mov.u32 %r38, %r3; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r37, %r38 }, [ %rd18 + 0 ], %rd19; + // end inline asm + .loc 1 78 29 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:78:29 + add.s64 %rd20, %rd33, %rd37; + .loc 1 62 115 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:62:115 + mov.b32 {%rs25, %rs26}, %r33; + cvt.f32.bf16 %r261, %rs26; + cvt.f32.bf16 %r262, %rs25; + .loc 1 63 95 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:95 + mov.b32 {%rs27, %rs28}, %r35; + cvt.f32.bf16 %r263, %rs27; + cvt.f32.bf16 %r264, %rs28; + .loc 1 64 95 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:95 + mov.b32 {%rs29, %rs30}, %r37; + cvt.f32.bf16 %r265, %rs30; + cvt.f32.bf16 %r266, %rs29; + .loc 1 66 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:66:24 + sub.f32 %r267, %r262, %r255; + sub.f32 %r268, %r261, %r255; + .loc 1 72 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:72:24 + mul.f32 %r269, %r260, %r268; + mul.f32 %r270, %r260, %r267; + .loc 1 75 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:75:24 + add.f32 %r271, %r264, 0f3F800000; + add.f32 %r272, %r263, 0f3F800000; + .loc 1 77 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:77:24 + fma.rn.f32 %r273, %r270, %r272, %r266; + fma.rn.f32 %r274, %r269, %r271, %r265; + .loc 1 78 53 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:78:53 + cvt.rn.bf16x2.f32 %r39, %r274, %r273; + .loc 1 62 115 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:62:115 + mov.b32 {%rs31, %rs32}, %r34; + cvt.f32.bf16 %r275, %rs32; + cvt.f32.bf16 %r276, %rs31; + .loc 1 63 95 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:95 + mov.b32 {%rs33, %rs34}, %r36; + cvt.f32.bf16 %r277, %rs33; + cvt.f32.bf16 %r278, %rs34; + .loc 1 64 95 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:95 + mov.b32 {%rs35, %rs36}, %r38; + cvt.f32.bf16 %r279, %rs36; + cvt.f32.bf16 %r280, %rs35; + .loc 1 66 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:66:24 + sub.f32 %r281, %r276, %r255; + sub.f32 %r282, %r275, %r255; + .loc 1 72 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:72:24 + mul.f32 %r283, %r260, %r282; + mul.f32 %r284, %r260, %r281; + .loc 1 75 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:75:24 + add.f32 %r285, %r278, 0f3F800000; + add.f32 %r286, %r277, 0f3F800000; + .loc 1 77 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:77:24 + fma.rn.f32 %r287, %r284, %r286, %r280; + fma.rn.f32 %r288, %r283, %r285, %r279; + .loc 1 78 53 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:78:53 + cvt.rn.bf16x2.f32 %r40, %r288, %r287; + // begin inline asm + @%p1 st.global.v2.b32 [ %rd20 + 0 ], { %r39, %r40 }; + // end inline asm + .loc 1 62 53 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:62:53 + // begin inline asm + mov.u64 %rd21, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd21, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r41, %r3; + mov.u32 %r42, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r41, %r42 }, [ %rd14 + 0 ], %rd21; + // end inline asm + .loc 1 63 35 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:35 + add.s64 %rd22, %rd16, 4096; + .loc 1 63 42 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:42 + // begin inline asm + mov.u64 %rd23, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd23, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r43, %r3; + mov.u32 %r44, %r3; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r43, %r44 }, [ %rd22 + 0 ], %rd23; + // end inline asm + .loc 1 64 35 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:35 + add.s64 %rd24, %rd18, 4096; + .loc 1 64 42 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:42 + // begin inline asm + mov.u64 %rd25, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd25, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r45, %r3; + mov.u32 %r46, %r3; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r45, %r46 }, [ %rd24 + 0 ], %rd25; + // end inline asm + .loc 1 78 29 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:78:29 + add.s64 %rd26, %rd20, 4096; + .loc 1 62 115 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:62:115 + mov.b32 {%rs37, %rs38}, %r41; + cvt.f32.bf16 %r289, %rs38; + cvt.f32.bf16 %r290, %rs37; + .loc 1 63 95 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:95 + mov.b32 {%rs39, %rs40}, %r43; + cvt.f32.bf16 %r291, %rs39; + cvt.f32.bf16 %r292, %rs40; + .loc 1 64 95 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:95 + mov.b32 {%rs41, %rs42}, %r45; + cvt.f32.bf16 %r293, %rs42; + cvt.f32.bf16 %r294, %rs41; + .loc 1 66 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:66:24 + sub.f32 %r295, %r290, %r255; + sub.f32 %r296, %r289, %r255; + .loc 1 72 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:72:24 + mul.f32 %r297, %r260, %r296; + mul.f32 %r298, %r260, %r295; + .loc 1 75 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:75:24 + add.f32 %r299, %r292, 0f3F800000; + add.f32 %r300, %r291, 0f3F800000; + .loc 1 77 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:77:24 + fma.rn.f32 %r301, %r298, %r300, %r294; + fma.rn.f32 %r302, %r297, %r299, %r293; + .loc 1 78 53 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:78:53 + cvt.rn.bf16x2.f32 %r47, %r302, %r301; + .loc 1 62 115 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:62:115 + mov.b32 {%rs43, %rs44}, %r42; + cvt.f32.bf16 %r303, %rs44; + cvt.f32.bf16 %r304, %rs43; + .loc 1 63 95 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:95 + mov.b32 {%rs45, %rs46}, %r44; + cvt.f32.bf16 %r305, %rs45; + cvt.f32.bf16 %r306, %rs46; + .loc 1 64 95 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:95 + mov.b32 {%rs47, %rs48}, %r46; + cvt.f32.bf16 %r307, %rs48; + cvt.f32.bf16 %r308, %rs47; + .loc 1 66 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:66:24 + sub.f32 %r309, %r304, %r255; + sub.f32 %r310, %r303, %r255; + .loc 1 72 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:72:24 + mul.f32 %r311, %r260, %r310; + mul.f32 %r312, %r260, %r309; + .loc 1 75 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:75:24 + add.f32 %r313, %r306, 0f3F800000; + add.f32 %r314, %r305, 0f3F800000; + .loc 1 77 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:77:24 + fma.rn.f32 %r315, %r312, %r314, %r308; + fma.rn.f32 %r316, %r311, %r313, %r307; + .loc 1 78 53 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:78:53 + cvt.rn.bf16x2.f32 %r48, %r316, %r315; + // begin inline asm + @%p1 st.global.v2.b32 [ %rd26 + 0 ], { %r47, %r48 }; + // end inline asm + .loc 1 56 4 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:56:4 + ret; +$L__tmp26: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 367 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x168 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 97 +.b8 51 +.b8 109 +.b8 101 +.b8 110 +.b8 108 +.b8 102 +.b8 117 +.b8 108 +.b8 100 +.b8 116 +.b8 104 +.b8 103 +.b8 109 +.b8 110 +.b8 99 +.b8 102 +.b8 112 +.b8 106 +.b8 107 +.b8 52 +.b8 53 +.b8 50 +.b8 120 +.b8 107 +.b8 114 +.b8 111 +.b8 115 +.b8 55 +.b8 105 +.b8 100 +.b8 114 +.b8 109 +.b8 105 +.b8 108 +.b8 54 +.b8 112 +.b8 99 +.b8 111 +.b8 101 +.b8 105 +.b8 103 +.b8 114 +.b8 97 +.b8 121 +.b8 109 +.b8 99 +.b8 103 +.b8 52 +.b8 101 +.b8 54 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 97 +.b8 51 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x2f DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 100 +.b8 100 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 110 +.b8 97 +.b8 116 +.b8 105 +.b8 118 +.b8 101 +.b8 95 +.b8 108 +.b8 97 +.b8 121 +.b8 101 +.b8 114 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x113:0x5f DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x128:0x18 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 46 // DW_AT_call_line +.b8 51 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x140:0x31 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp5 // DW_AT_low_pc +.b64 $L__tmp25 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 52 // DW_AT_call_line +.b8 80 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x158:0x18 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp5 // DW_AT_low_pc +.b64 $L__tmp24 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 243 // DW_AT_call_line +.b8 46 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.source b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.source new file mode 100644 index 0000000000000000000000000000000000000000..5e6092ee3295c72d6f32f207fcd27da802f853c6 --- /dev/null +++ b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.source @@ -0,0 +1,486 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":18:0) +#loc88 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":216:0) +#loc101 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":133:0) +#loc105 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":242:0) +#loc107 = loc(unknown) +#loc110 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":230:0) +#loc125 = loc("in_ptr0"(#loc)) +#loc126 = loc("in_ptr1"(#loc)) +#loc127 = loc("in_ptr2"(#loc)) +#loc128 = loc("in_ptr3"(#loc)) +#loc129 = loc("in_ptr4"(#loc)) +#loc130 = loc("out_ptr0"(#loc)) +#loc131 = loc("out_ptr3"(#loc)) +#loc132 = loc("xnumel"(#loc)) +#loc133 = loc("r0_numel"(#loc)) +#loc201 = loc("value"(#loc88)) +#loc202 = loc("mean"(#loc88)) +#loc203 = loc("m2"(#loc88)) +#loc204 = loc("weight"(#loc88)) +#loc205 = loc("first_iteration"(#loc88)) +#loc215 = loc("input"(#loc101)) +#loc216 = loc("mean"(#loc105)) +#loc217 = loc("m2"(#loc105)) +#loc218 = loc("weight"(#loc105)) +#loc219 = loc("mean_1"(#loc110)) +#loc220 = loc("m2_1"(#loc110)) +#loc221 = loc("weight_1"(#loc110)) +#loc222 = loc("mean_2"(#loc110)) +#loc223 = loc("m2_2"(#loc110)) +#loc224 = loc("weight_2"(#loc110)) +#loc231 = loc("new_mean"(#loc201)) +module { + tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 256 : i32 loc(#loc134) + %r0_numel_1 = arith.constant 4096 : i32 loc(#loc135) + %xoffset = tt.get_program_id x : i32 loc(#loc136) + %xoffset_2 = arith.constant 1 : i32 loc(#loc137) + %xoffset_3 = arith.constant 1 : i32 loc(#loc137) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc137) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc138) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc139) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc140) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc140) + %xmask = arith.constant dense<256> : tensor<1x1xi32> loc(#loc141) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc141) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc142) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc143) + %tmp7_mean = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc144) + %tmp7_m2 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc145) + %tmp7_weight = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc146) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %tmp7_weight_10:3 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%tmp7_mean_13 = %tmp7_mean, %tmp7_m2_14 = %tmp7_m2, %tmp7_weight_15 = %tmp7_weight) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc148) + %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc148) + %r0_mask = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc149) + %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x2048xi32> loc(#loc149) + %tmp0 = arith.constant 4096 : i32 loc(#loc150) + %tmp0_18 = arith.constant 4096 : i32 loc(#loc150) + %tmp0_19 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc150) + %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc150) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc151) + %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x2048xi32> loc(#loc151) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc152) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc152) + %tmp0_25 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc153) + %tmp0_26 = arith.andi %r0_mask_17, %tmp0_25 : tensor<1x2048xi1> loc(#loc153) + %tmp0_27 = arith.constant 0.000000e+00 : f32 loc(#loc154) + %tmp0_28 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc154) + %tmp0_29 = arith.truncf %tmp0_28 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc154) + %tmp0_30 = tt.load %tmp0_24, %tmp0_26, %tmp0_29 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc154) + %tmp0_31 = arith.extf %tmp0_30 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc155) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc156) + %tmp1_32 = tt.addptr %tmp1, %r0_index_16 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc156) + %tmp1_33 = arith.constant 0.000000e+00 : f32 loc(#loc157) + %tmp1_34 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc157) + %tmp1_35 = arith.truncf %tmp1_34 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc157) + %tmp1_36 = tt.load %tmp1_32, %r0_mask_17, %tmp1_35 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc157) + %tmp1_37 = arith.extf %tmp1_36 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc158) + %tmp2 = arith.constant 4096 : i32 loc(#loc159) + %tmp2_38 = arith.constant 4096 : i32 loc(#loc159) + %tmp2_39 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc159) + %tmp2_40 = arith.muli %tmp2_39, %xindex_7 : tensor<1x1xi32> loc(#loc159) + %tmp2_41 = tt.broadcast %tmp2_40 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc160) + %tmp2_42 = arith.addi %r0_index_16, %tmp2_41 : tensor<1x2048xi32> loc(#loc160) + %tmp2_43 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc161) + %tmp2_44 = tt.addptr %tmp2_43, %tmp2_42 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc161) + %tmp2_45 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc162) + %tmp2_46 = arith.andi %r0_mask_17, %tmp2_45 : tensor<1x2048xi1> loc(#loc162) + %tmp2_47 = arith.constant 0.000000e+00 : f32 loc(#loc163) + %tmp2_48 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc163) + %tmp2_49 = arith.truncf %tmp2_48 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc163) + %tmp2_50 = tt.load %tmp2_44, %tmp2_46, %tmp2_49 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc163) + %tmp2_51 = arith.extf %tmp2_50 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc164) + %tmp3 = arith.mulf %tmp1_37, %tmp2_51 : tensor<1x2048xf32> loc(#loc165) + %tmp4 = arith.addf %tmp0_31, %tmp3 : tensor<1x2048xf32> loc(#loc166) + %c0_i32_52 = arith.constant 0 : i32 loc(#loc34) + %9 = arith.cmpi eq, %r0_offset, %c0_i32_52 : i32 loc(#loc34) + %10:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_u1__(%tmp4, %tmp7_mean_13, %tmp7_m2_14, %tmp7_weight_15, %9) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>, i1) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) loc(#loc35) + %tmp7_mean_53 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc167) + %tmp7_mean_54 = arith.andi %r0_mask_17, %tmp7_mean_53 : tensor<1x2048xi1> loc(#loc167) + %tmp7_mean_55 = arith.select %tmp7_mean_54, %10#0, %tmp7_mean_13 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc168) + %tmp7_m2_56 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc169) + %tmp7_m2_57 = arith.andi %r0_mask_17, %tmp7_m2_56 : tensor<1x2048xi1> loc(#loc169) + %tmp7_m2_58 = arith.select %tmp7_m2_57, %10#1, %tmp7_m2_14 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc170) + %tmp7_weight_59 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc171) + %tmp7_weight_60 = arith.andi %r0_mask_17, %tmp7_weight_59 : tensor<1x2048xi1> loc(#loc171) + %tmp7_weight_61 = arith.select %tmp7_weight_60, %10#2, %tmp7_weight_15 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc172) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc42) + %c4096_i32_62 = arith.constant 4096 : i32 loc(#loc42) + %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc42) + %11 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc42) + %12 = tt.broadcast %11 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc43) + %13 = arith.addi %r0_index_16, %12 : tensor<1x2048xi32> loc(#loc43) + %14 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc44) + %15 = tt.addptr %14, %13 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc44) + %16 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc45) + %17 = arith.andi %r0_mask_17, %16 : tensor<1x2048xi1> loc(#loc45) + %18 = arith.truncf %tmp4 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc46) + tt.store %15, %18, %17 : tensor<1x2048x!tt.ptr> loc(#loc46) + scf.yield %tmp7_mean_55, %tmp7_m2_58, %tmp7_weight_61 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc47) + } loc(#loc237) + %4:3 = tt.call @"torch._inductor.runtime.triton_helpers.welford__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_1_"(%tmp7_weight_10#0, %tmp7_weight_10#1, %tmp7_weight_10#2) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc48) + %tmp7 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc173) + %tmp11 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc174) + %tmp12 = tt.expand_dims %4#2 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc175) + %c0_i32_11 = arith.constant 0 : i32 loc(#loc52) + %c2048_i32_12 = arith.constant 2048 : i32 loc(#loc52) + %5 = arith.bitcast %c0_i32_11 : i32 to i32 loc(#loc52) + %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc52) + %7 = arith.bitcast %c2048_i32_12 : i32 to i32 loc(#loc52) + %8 = ub.poison : i32 loc(#loc52) + scf.for %r0_offset = %5 to %6 step %7 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc176) + %r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc176) + %r0_mask = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc177) + %r0_mask_14 = arith.cmpi slt, %r0_index_13, %r0_mask : tensor<1x2048xi32> loc(#loc177) + %tmp13 = arith.constant 4096 : i32 loc(#loc178) + %tmp13_15 = arith.constant 4096 : i32 loc(#loc178) + %tmp13_16 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc178) + %tmp13_17 = arith.muli %tmp13_16, %xindex_7 : tensor<1x1xi32> loc(#loc178) + %tmp13_18 = tt.broadcast %tmp13_17 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc179) + %tmp13_19 = arith.addi %r0_index_13, %tmp13_18 : tensor<1x2048xi32> loc(#loc179) + %tmp13_20 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc180) + %tmp13_21 = tt.addptr %tmp13_20, %tmp13_19 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc180) + %tmp13_22 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc181) + %tmp13_23 = arith.andi %r0_mask_14, %tmp13_22 : tensor<1x2048xi1> loc(#loc181) + %tmp13_24 = arith.constant 0.000000e+00 : f32 loc(#loc182) + %tmp13_25 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc182) + %tmp13_26 = arith.truncf %tmp13_25 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc182) + %tmp13_27 = tt.load %tmp13_21, %tmp13_23, %tmp13_26 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc182) + %tmp13_28 = arith.extf %tmp13_27 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc183) + %tmp23 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc184) + %tmp23_29 = tt.addptr %tmp23, %r0_index_13 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc184) + %tmp23_30 = arith.constant 0.000000e+00 : f32 loc(#loc185) + %tmp23_31 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc185) + %tmp23_32 = arith.truncf %tmp23_31 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc185) + %tmp23_33 = tt.load %tmp23_29, %r0_mask_14, %tmp23_32 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc185) + %tmp23_34 = arith.extf %tmp23_33 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc186) + %tmp27 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc187) + %tmp27_35 = tt.addptr %tmp27, %r0_index_13 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc187) + %tmp27_36 = arith.constant 0.000000e+00 : f32 loc(#loc188) + %tmp27_37 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc188) + %tmp27_38 = arith.truncf %tmp27_37 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc188) + %tmp27_39 = tt.load %tmp27_35, %r0_mask_14, %tmp27_38 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc188) + %tmp27_40 = arith.extf %tmp27_39 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc189) + %tmp15 = tt.broadcast %tmp7 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc190) + %tmp15_41 = arith.subf %tmp13_28, %tmp15 : tensor<1x2048xf32> loc(#loc190) + %tmp16 = arith.constant 4.096000e+03 : f32 loc(#loc191) + %tmp17 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc192) + %tmp17_42 = arith.divf %tmp11, %tmp17 : tensor<1x1xf32> loc(#loc192) + %tmp18 = arith.constant 9.99999997E-7 : f32 loc(#loc193) + %tmp19 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc194) + %tmp19_43 = arith.addf %tmp17_42, %tmp19 : tensor<1x1xf32> loc(#loc194) + %tmp20 = tt.extern_elementwise %tmp19_43 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc195) + %tmp21 = tt.broadcast %tmp20 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc196) + %tmp21_44 = arith.mulf %tmp15_41, %tmp21 : tensor<1x2048xf32> loc(#loc196) + %tmp24 = arith.constant 1.000000e+00 : f32 loc(#loc197) + %tmp25 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc198) + %tmp25_45 = arith.addf %tmp23_34, %tmp25 : tensor<1x2048xf32> loc(#loc198) + %tmp26 = arith.mulf %tmp21_44, %tmp25_45 : tensor<1x2048xf32> loc(#loc199) + %tmp28 = arith.addf %tmp26, %tmp27_40 : tensor<1x2048xf32> loc(#loc200) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc78) + %c4096_i32_46 = arith.constant 4096 : i32 loc(#loc78) + %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc78) + %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc78) + %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc79) + %11 = arith.addi %r0_index_13, %10 : tensor<1x2048xi32> loc(#loc79) + %12 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc80) + %13 = tt.addptr %12, %11 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc80) + %14 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc81) + %15 = arith.andi %r0_mask_14, %14 : tensor<1x2048xi1> loc(#loc81) + %16 = arith.truncf %tmp28 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc82) + tt.store %13, %16, %15 : tensor<1x2048x!tt.ptr> loc(#loc82) + } loc(#loc52) + tt.return loc(#loc83) + } loc(#loc) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() -> tensor<1x2048xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc85) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc85) + tt.return %cst_0 : tensor<1x2048xf32> loc(#loc86) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x2048xf32> loc(#loc87) + tt.return %0 : tensor<1x2048xf32> loc(#loc87) + } loc(#loc84) + tt.func private @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_u1__(%new_mean: tensor<1x2048xf32> loc("new_mean"(#loc201)), %mean: tensor<1x2048xf32> loc("mean"(#loc88)), %m2: tensor<1x2048xf32> loc("m2"(#loc88)), %weight: tensor<1x2048xf32> loc("weight"(#loc88)), %first_iteration: i1 loc("first_iteration"(#loc88))) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) attributes {noinline = false} { + %0:3 = scf.if %first_iteration -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) { + %new_weight = arith.constant 1.000000e+00 : f32 loc(#loc206) + %new_weight_0 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc232) + %new_m2 = tt.call @triton.language.standard.zeros_like__fp32S1_2048S__(%m2) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc233) + scf.yield %new_m2, %new_mean, %new_weight_0 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc233) + } else { + %delta = arith.subf %new_mean, %mean : tensor<1x2048xf32> loc(#loc208) + %new_weight = arith.constant 1 : i32 loc(#loc209) + %new_weight_0 = arith.constant 1.000000e+00 : f32 loc(#loc209) + %new_weight_1 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc209) + %new_weight_2 = arith.addf %weight, %new_weight_1 : tensor<1x2048xf32> loc(#loc234) + %new_mean_3 = arith.divf %delta, %new_weight_2 : tensor<1x2048xf32> loc(#loc210) + %new_mean_4 = arith.addf %mean, %new_mean_3 : tensor<1x2048xf32> loc(#loc235) + %new_m2 = arith.subf %new_mean, %new_mean_4 : tensor<1x2048xf32> loc(#loc212) + %new_m2_5 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32> loc(#loc213) + %new_m2_6 = arith.addf %m2, %new_m2_5 : tensor<1x2048xf32> loc(#loc236) + scf.yield %new_m2_6, %new_mean_4, %new_weight_2 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc214) + } loc(#loc89) + tt.return %0#1, %0#0, %0#2 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc99) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x2048xf32> loc(#loc100) + %2 = ub.poison : tensor<1x2048xf32> loc(#loc100) + %3 = ub.poison : tensor<1x2048xf32> loc(#loc100) + tt.return %1, %2, %3 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc100) + } loc(#loc88) + tt.func private @triton.language.standard.zeros_like__fp32S1_2048S__(%input: tensor<1x2048xf32> loc("input"(#loc101))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc102) + tt.return %0 : tensor<1x2048xf32> loc(#loc103) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x2048xf32> loc(#loc104) + tt.return %1 : tensor<1x2048xf32> loc(#loc104) + } loc(#loc101) + tt.func private @"torch._inductor.runtime.triton_helpers.welford__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_1_"(%mean: tensor<1x2048xf32> loc("mean"(#loc105)), %m2: tensor<1x2048xf32> loc("m2"(#loc105)), %weight: tensor<1x2048xf32> loc("weight"(#loc105))) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} { + %0:3 = "tt.reduce"(%mean, %m2, %weight) <{axis = 1 : i32}> ({ + ^bb0(%arg3: f32 loc(unknown), %arg4: f32 loc(unknown), %arg5: f32 loc(unknown), %arg6: f32 loc(unknown), %arg7: f32 loc(unknown), %arg8: f32 loc(unknown)): + %4:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (f32, f32, f32, f32, f32, f32) -> (f32, f32, f32) loc(#loc106) + tt.reduce.return %4#0, %4#1, %4#2 : f32, f32, f32 loc(#loc106) + }) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc106) + tt.return %0#0, %0#1, %0#2 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc108) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc109) + %2 = ub.poison : tensor<1xf32> loc(#loc109) + %3 = ub.poison : tensor<1xf32> loc(#loc109) + tt.return %1, %2, %3 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc109) + } loc(#loc105) + tt.func private @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%mean_1: f32 loc("mean_1"(#loc110)), %m2_1: f32 loc("m2_1"(#loc110)), %weight_1: f32 loc("weight_1"(#loc110)), %mean_2: f32 loc("mean_2"(#loc110)), %m2_2: f32 loc("m2_2"(#loc110)), %weight_2: f32 loc("weight_2"(#loc110))) -> (f32, f32, f32) attributes {noinline = false} { + %delta = arith.subf %mean_2, %mean_1 : f32 loc(#loc225) + %new_weight = arith.addf %weight_1, %weight_2 : f32 loc(#loc226) + %w2_over_w = arith.constant 0.000000e+00 : f32 loc(#loc227) + %w2_over_w_0 = arith.cmpf oeq, %new_weight, %w2_over_w : f32 loc(#loc227) + %w2_over_w_1 = arith.divf %weight_2, %new_weight : f32 loc(#loc228) + %w2_over_w_2 = arith.constant 0.000000e+00 : f32 loc(#loc229) + %w2_over_w_3 = arith.constant 0.000000e+00 : f32 loc(#loc229) + %w2_over_w_4 = arith.select %w2_over_w_0, %w2_over_w_3, %w2_over_w_1 : f32 loc(#loc229) + %0 = arith.mulf %delta, %w2_over_w_4 : f32 loc(#loc116) + %1 = arith.addf %mean_1, %0 : f32 loc(#loc117) + %2 = arith.addf %m2_1, %m2_2 : f32 loc(#loc118) + %3 = arith.mulf %delta, %delta : f32 loc(#loc119) + %4 = arith.mulf %3, %weight_1 : f32 loc(#loc120) + %5 = arith.mulf %4, %w2_over_w_4 : f32 loc(#loc121) + %6 = arith.addf %2, %5 : f32 loc(#loc122) + tt.return %1, %6, %new_weight : f32, f32, f32 loc(#loc123) + ^bb1: // no predecessors + %7 = ub.poison : f32 loc(#loc124) + %8 = ub.poison : f32 loc(#loc124) + %9 = ub.poison : f32 loc(#loc124) + tt.return %7, %8, %9 : f32, f32, f32 loc(#loc124) + } loc(#loc110) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":25:21) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":29:45) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":30:43) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":31:47) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:46) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:34) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:61) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:51) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:113) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:34) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:41) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:94) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:46) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:41) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:34) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:61) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:51) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:113) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":41:22) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":42:22) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":46:62) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":46:51) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":48:39) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":48:62) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":49:37) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":49:58) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":50:41) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":50:66) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:41) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:36) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:29) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:62) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:52) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:8) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":52:80) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":53:16) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":54:17) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":55:18) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":56:43) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":57:31) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":58:29) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:48) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:43) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:36) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:63) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:53) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:115) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:35) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:42) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:95) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:35) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:42) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:95) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":66:24) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":67:16) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":68:25) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":69:16) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":70:24) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":71:32) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":72:24) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":74:16) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":75:24) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":76:24) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":77:24) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:41) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:36) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:29) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:63) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:53) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":56:4) +#loc84 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":120:0) +#loc85 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:31) +#loc86 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:11) +#loc87 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:4) +#loc89 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7) +#loc90 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":218:46) +#loc91 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31) +#loc92 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24) +#loc93 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30) +#loc94 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34) +#loc95 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26) +#loc96 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39) +#loc97 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31) +#loc98 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22) +#loc99 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:11) +#loc100 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:4) +#loc102 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:30) +#loc103 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:11) +#loc104 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:4) +#loc106 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc108 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:11) +#loc109 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:4) +#loc111 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc112 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc113 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc114 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc115 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc116 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc117 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc118 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc119 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc120 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc121 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc122 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc123 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:11) +#loc124 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:4) +#loc134 = loc("xnumel"(#loc1)) +#loc135 = loc("r0_numel"(#loc2)) +#loc136 = loc("xoffset"(#loc3)) +#loc137 = loc("xoffset"(#loc4)) +#loc138 = loc("xindex"(#loc5)) +#loc139 = loc("xindex"(#loc6)) +#loc140 = loc("xindex"(#loc7)) +#loc141 = loc("xmask"(#loc8)) +#loc142 = loc("r0_base"(#loc9)) +#loc143 = loc("r0_base"(#loc10)) +#loc144 = loc("tmp7_mean"(#loc11)) +#loc145 = loc("tmp7_m2"(#loc12)) +#loc146 = loc("tmp7_weight"(#loc13)) +#loc147 = loc("tmp7_mean"(#loc14)) +#loc148 = loc("r0_index"(#loc15)) +#loc149 = loc("r0_mask"(#loc16)) +#loc150 = loc("tmp0"(#loc17)) +#loc151 = loc("tmp0"(#loc18)) +#loc152 = loc("tmp0"(#loc19)) +#loc153 = loc("tmp0"(#loc20)) +#loc154 = loc("tmp0"(#loc21)) +#loc155 = loc("tmp0"(#loc22)) +#loc156 = loc("tmp1"(#loc23)) +#loc157 = loc("tmp1"(#loc24)) +#loc158 = loc("tmp1"(#loc25)) +#loc159 = loc("tmp2"(#loc26)) +#loc160 = loc("tmp2"(#loc27)) +#loc161 = loc("tmp2"(#loc28)) +#loc162 = loc("tmp2"(#loc29)) +#loc163 = loc("tmp2"(#loc30)) +#loc164 = loc("tmp2"(#loc31)) +#loc165 = loc("tmp3"(#loc32)) +#loc166 = loc("tmp4"(#loc33)) +#loc167 = loc("tmp7_mean"(#loc36)) +#loc168 = loc("tmp7_mean"(#loc37)) +#loc169 = loc("tmp7_m2"(#loc38)) +#loc170 = loc("tmp7_m2"(#loc39)) +#loc171 = loc("tmp7_weight"(#loc40)) +#loc172 = loc("tmp7_weight"(#loc41)) +#loc173 = loc("tmp7"(#loc49)) +#loc174 = loc("tmp11"(#loc50)) +#loc175 = loc("tmp12"(#loc51)) +#loc176 = loc("r0_index"(#loc53)) +#loc177 = loc("r0_mask"(#loc54)) +#loc178 = loc("tmp13"(#loc55)) +#loc179 = loc("tmp13"(#loc56)) +#loc180 = loc("tmp13"(#loc57)) +#loc181 = loc("tmp13"(#loc58)) +#loc182 = loc("tmp13"(#loc59)) +#loc183 = loc("tmp13"(#loc60)) +#loc184 = loc("tmp23"(#loc61)) +#loc185 = loc("tmp23"(#loc62)) +#loc186 = loc("tmp23"(#loc63)) +#loc187 = loc("tmp27"(#loc64)) +#loc188 = loc("tmp27"(#loc65)) +#loc189 = loc("tmp27"(#loc66)) +#loc190 = loc("tmp15"(#loc67)) +#loc191 = loc("tmp16"(#loc68)) +#loc192 = loc("tmp17"(#loc69)) +#loc193 = loc("tmp18"(#loc70)) +#loc194 = loc("tmp19"(#loc71)) +#loc195 = loc("tmp20"(#loc72)) +#loc196 = loc("tmp21"(#loc73)) +#loc197 = loc("tmp24"(#loc74)) +#loc198 = loc("tmp25"(#loc75)) +#loc199 = loc("tmp26"(#loc76)) +#loc200 = loc("tmp28"(#loc77)) +#loc206 = loc("new_weight"(#loc90)) +#loc207 = loc("new_m2"(#loc91)) +#loc208 = loc("delta"(#loc92)) +#loc209 = loc("new_weight"(#loc93)) +#loc210 = loc("new_mean"(#loc94)) +#loc211 = loc("new_mean"(#loc95)) +#loc212 = loc("new_m2"(#loc96)) +#loc213 = loc("new_m2"(#loc97)) +#loc214 = loc("new_m2"(#loc98)) +#loc225 = loc("delta"(#loc111)) +#loc226 = loc("new_weight"(#loc112)) +#loc227 = loc("w2_over_w"(#loc113)) +#loc228 = loc("w2_over_w"(#loc114)) +#loc229 = loc("w2_over_w"(#loc115)) +#loc230 = loc("tmp7_m2"(#loc147)) +#loc232 = loc("new_weight"(#loc206)) +#loc233 = loc("new_m2"(#loc207)) +#loc234 = loc("new_weight"(#loc209)) +#loc235 = loc("new_mean"(#loc211)) +#loc236 = loc("new_m2"(#loc214)) +#loc237 = loc("tmp7_weight"(#loc230)) diff --git a/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..2010544834ba256a5641bc71f4f9fb3c597503d9 --- /dev/null +++ b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir @@ -0,0 +1,296 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":18:0) +#loc1 = loc(unknown) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":52:80) +#loc80 = loc("in_ptr0"(#loc)) +#loc81 = loc("in_ptr1"(#loc)) +#loc82 = loc("in_ptr2"(#loc)) +#loc83 = loc("in_ptr3"(#loc)) +#loc84 = loc("in_ptr4"(#loc)) +#loc85 = loc("out_ptr0"(#loc)) +#loc86 = loc("out_ptr3"(#loc)) +#loc87 = loc("xnumel"(#loc)) +#loc88 = loc("r0_numel"(#loc)) +#loc122 = loc(callsite(#loc1 at #loc40)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4096> : tensor<1x2048xi32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked> loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c256_i32 = arith.constant 256 : i32 loc(#loc1) + %cst_1 = arith.constant 0.000000e+00 : f32 loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<4.096000e+03> : tensor<1x1xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc89) + %xmask = arith.cmpi slt, %xoffset, %c256_i32 : i32 loc(#loc90) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc91) + %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc91) + %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc92) + %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc151) + %tmp0_8 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc94) + %tmp0_9 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc152) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc96) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc97) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc11) + %tmp7_weight:3 = scf.for %tmp7_weight_10 = %c0_i32 to %c4096_i32 step %c2048_i32 iter_args(%arg10 = %cst_2, %arg11 = %cst_2, %arg12 = %cst_2) -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) : i32 { + %r0_index = tt.splat %tmp7_weight_10 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc99) + %r0_index_11 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32, #blocked> loc(#loc99) + %r0_mask = arith.cmpi slt, %r0_index_11, %cst : tensor<1x2048xi32, #blocked> loc(#loc100) + %tmp0_12 = arith.addi %r0_index_11, %tmp0_7 : tensor<1x2048xi32, #blocked> loc(#loc93) + %tmp0_13 = tt.addptr %tmp0_8, %tmp0_12 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc94) + %tmp0_14 = arith.andi %r0_mask, %tmp0_9 : tensor<1x2048xi1, #blocked> loc(#loc95) + %tmp0_15 = tt.load %tmp0_13, %tmp0_14, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc101) + %tmp0_16 = arith.extf %tmp0_15 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc102) + %tmp1_17 = tt.addptr %tmp1, %r0_index_11 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc96) + %tmp1_18 = tt.load %tmp1_17, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc103) + %tmp1_19 = arith.extf %tmp1_18 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc104) + %tmp2_20 = tt.addptr %tmp2, %tmp0_12 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc97) + %tmp2_21 = tt.load %tmp2_20, %tmp0_14, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc105) + %tmp2_22 = arith.extf %tmp2_21 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc106) + %tmp3 = arith.mulf %tmp1_19, %tmp2_22 : tensor<1x2048xf32, #blocked> loc(#loc107) + %tmp4 = arith.addf %tmp0_16, %tmp3 : tensor<1x2048xf32, #blocked> loc(#loc108) + %3 = arith.cmpi eq, %tmp7_weight_10, %c0_i32 : i32 loc(#loc23) + %4:3 = scf.if %3 -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) { + scf.yield %cst_2, %tmp4, %cst_3 : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc176) + } else { + %delta = arith.subf %tmp4, %arg10 : tensor<1x2048xf32, #blocked> loc(#loc155) + %new_weight = arith.addf %arg12, %cst_3 : tensor<1x2048xf32, #blocked> loc(#loc177) + %new_mean = arith.divf %delta, %new_weight : tensor<1x2048xf32, #blocked> loc(#loc157) + %new_mean_24 = arith.addf %arg10, %new_mean : tensor<1x2048xf32, #blocked> loc(#loc178) + %new_m2 = arith.subf %tmp4, %new_mean_24 : tensor<1x2048xf32, #blocked> loc(#loc159) + %new_m2_25 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32, #blocked> loc(#loc160) + %new_m2_26 = arith.addf %arg11, %new_m2_25 : tensor<1x2048xf32, #blocked> loc(#loc179) + scf.yield %new_m2_26, %new_mean_24, %new_weight : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc162) + } loc(#loc109) + %tmp7_mean = arith.select %tmp0_14, %4#1, %arg10 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc118) + %tmp7_m2 = arith.select %tmp0_14, %4#0, %arg11 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc119) + %tmp7_weight_23 = arith.select %tmp0_14, %4#2, %arg12 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc120) + %5 = tt.addptr %0, %tmp0_12 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc11) + %6 = arith.truncf %tmp4 : tensor<1x2048xf32, #blocked> to tensor<1x2048xbf16, #blocked> loc(#loc37) + tt.store %5, %6, %tmp0_14 : tensor<1x2048x!tt.ptr, #blocked> loc(#loc37) + scf.yield %tmp7_mean, %tmp7_m2, %tmp7_weight_23 : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc38) + } loc(#loc175) + %1:3 = "tt.reduce"(%tmp7_weight#0, %tmp7_weight#1, %tmp7_weight#2) <{axis = 1 : i32}> ({ + ^bb0(%arg9: f32 loc(callsite(#loc1 at #loc40)), %arg10: f32 loc(callsite(#loc1 at #loc40)), %arg11: f32 loc(callsite(#loc1 at #loc40)), %arg12: f32 loc(callsite(#loc1 at #loc40)), %arg13: f32 loc(callsite(#loc1 at #loc40)), %arg14: f32 loc(callsite(#loc1 at #loc40))): + %delta = arith.subf %arg12, %arg9 : f32 loc(#loc163) + %new_weight = arith.addf %arg11, %arg14 : f32 loc(#loc164) + %w2_over_w = arith.cmpf oeq, %new_weight, %cst_1 : f32 loc(#loc165) + %w2_over_w_10 = arith.divf %arg14, %new_weight : f32 loc(#loc166) + %w2_over_w_11 = arith.select %w2_over_w, %cst_1, %w2_over_w_10 : f32 loc(#loc167) + %3 = arith.mulf %delta, %w2_over_w_11 : f32 loc(#loc168) + %4 = arith.addf %arg9, %3 : f32 loc(#loc169) + %5 = arith.addf %arg10, %arg13 : f32 loc(#loc170) + %6 = arith.mulf %delta, %delta : f32 loc(#loc171) + %7 = arith.mulf %6, %arg11 : f32 loc(#loc172) + %8 = arith.mulf %7, %w2_over_w_11 : f32 loc(#loc173) + %9 = arith.addf %5, %8 : f32 loc(#loc174) + tt.reduce.return %4, %9, %new_weight : f32, f32, f32 loc(#loc121) + }) : (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc121) + %tmp7 = tt.expand_dims %1#0 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc128) + %tmp11 = tt.expand_dims %1#1 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc129) + %tmp23 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc130) + %tmp27 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc131) + %tmp15 = tt.broadcast %tmp7 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc132) + %tmp17 = arith.divf %tmp11, %cst_5 : tensor<1x1xf32, #blocked> loc(#loc133) + %tmp19 = arith.addf %tmp17, %cst_4 : tensor<1x1xf32, #blocked> loc(#loc134) + %tmp20 = tt.extern_elementwise %tmp19 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked>) -> tensor<1x1xf32, #blocked> loc(#loc135) + %tmp21 = tt.broadcast %tmp20 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc136) + %2 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc62) + scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32, #blocked> loc(#loc137) + %r0_index_10 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32, #blocked> loc(#loc137) + %r0_mask = arith.cmpi slt, %r0_index_10, %cst : tensor<1x2048xi32, #blocked> loc(#loc138) + %tmp13 = arith.addi %r0_index_10, %tmp0_7 : tensor<1x2048xi32, #blocked> loc(#loc139) + %tmp13_11 = tt.addptr %0, %tmp13 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc140) + %tmp13_12 = arith.andi %r0_mask, %tmp0_9 : tensor<1x2048xi1, #blocked> loc(#loc141) + %tmp13_13 = tt.load %tmp13_11, %tmp13_12, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc142) + %tmp13_14 = arith.extf %tmp13_13 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc143) + %tmp23_15 = tt.addptr %tmp23, %r0_index_10 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc130) + %tmp23_16 = tt.load %tmp23_15, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc144) + %tmp23_17 = arith.extf %tmp23_16 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc145) + %tmp27_18 = tt.addptr %tmp27, %r0_index_10 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc131) + %tmp27_19 = tt.load %tmp27_18, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc146) + %tmp27_20 = arith.extf %tmp27_19 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc147) + %tmp15_21 = arith.subf %tmp13_14, %tmp15 : tensor<1x2048xf32, #blocked> loc(#loc132) + %tmp21_22 = arith.mulf %tmp15_21, %tmp21 : tensor<1x2048xf32, #blocked> loc(#loc136) + %tmp25 = arith.addf %tmp23_17, %cst_3 : tensor<1x2048xf32, #blocked> loc(#loc148) + %tmp26 = arith.mulf %tmp21_22, %tmp25 : tensor<1x2048xf32, #blocked> loc(#loc149) + %tmp28 = arith.addf %tmp26, %tmp27_20 : tensor<1x2048xf32, #blocked> loc(#loc150) + %3 = tt.addptr %2, %tmp13 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc62) + %4 = arith.truncf %tmp28 : tensor<1x2048xf32, #blocked> to tensor<1x2048xbf16, #blocked> loc(#loc78) + tt.store %3, %4, %tmp13_12 : tensor<1x2048x!tt.ptr, #blocked> loc(#loc78) + } loc(#loc63) + tt.return loc(#loc79) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":25:21) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":26:37) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:46) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:41) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:34) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:61) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:34) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:34) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:29) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":32:43) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":33:31) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":34:29) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:51) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:113) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:41) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:94) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:51) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:113) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":41:22) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":42:22) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":46:62) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":46:51) +#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31) +#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24) +#loc28 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30) +#loc29 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34) +#loc30 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26) +#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39) +#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":48:62) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":49:58) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":50:66) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:52) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:8) +#loc39 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc41 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc42 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc43 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc44 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc45 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc46 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc47 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc48 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc49 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc50 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc51 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc52 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":53:16) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":54:17) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:35) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:35) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":66:24) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":68:25) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":70:24) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":71:32) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":72:24) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:29) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":56:43) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":57:31) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":58:29) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:43) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:36) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:63) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:53) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:115) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:42) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:95) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:42) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:95) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":75:24) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":76:24) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":77:24) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:53) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":56:4) +#loc89 = loc("xoffset"(#loc2)) +#loc90 = loc("xmask"(#loc3)) +#loc91 = loc("r0_base"(#loc4)) +#loc92 = loc("tmp0"(#loc5)) +#loc93 = loc("tmp0"(#loc6)) +#loc94 = loc("tmp0"(#loc7)) +#loc95 = loc("tmp0"(#loc8)) +#loc96 = loc("tmp1"(#loc9)) +#loc97 = loc("tmp2"(#loc10)) +#loc98 = loc("tmp7_mean"(#loc12)) +#loc99 = loc("r0_index"(#loc13)) +#loc100 = loc("r0_mask"(#loc14)) +#loc101 = loc("tmp0"(#loc15)) +#loc102 = loc("tmp0"(#loc16)) +#loc103 = loc("tmp1"(#loc17)) +#loc104 = loc("tmp1"(#loc18)) +#loc105 = loc("tmp2"(#loc19)) +#loc106 = loc("tmp2"(#loc20)) +#loc107 = loc("tmp3"(#loc21)) +#loc108 = loc("tmp4"(#loc22)) +#loc109 = loc(callsite(#loc24 at #loc25)) +#loc110 = loc("new_m2"(#loc26)) +#loc111 = loc("delta"(#loc27)) +#loc112 = loc("new_weight"(#loc28)) +#loc113 = loc("new_mean"(#loc29)) +#loc114 = loc("new_mean"(#loc30)) +#loc115 = loc("new_m2"(#loc31)) +#loc116 = loc("new_m2"(#loc32)) +#loc117 = loc("new_m2"(#loc33)) +#loc118 = loc("tmp7_mean"(#loc34)) +#loc119 = loc("tmp7_m2"(#loc35)) +#loc120 = loc("tmp7_weight"(#loc36)) +#loc121 = loc(callsite(#loc39 at #loc40)) +#loc123 = loc("delta"(#loc41)) +#loc124 = loc("new_weight"(#loc42)) +#loc125 = loc("w2_over_w"(#loc43)) +#loc126 = loc("w2_over_w"(#loc44)) +#loc127 = loc("w2_over_w"(#loc45)) +#loc128 = loc("tmp7"(#loc53)) +#loc129 = loc("tmp11"(#loc54)) +#loc130 = loc("tmp23"(#loc55)) +#loc131 = loc("tmp27"(#loc56)) +#loc132 = loc("tmp15"(#loc57)) +#loc133 = loc("tmp17"(#loc58)) +#loc134 = loc("tmp19"(#loc59)) +#loc135 = loc("tmp20"(#loc60)) +#loc136 = loc("tmp21"(#loc61)) +#loc137 = loc("r0_index"(#loc64)) +#loc138 = loc("r0_mask"(#loc65)) +#loc139 = loc("tmp13"(#loc66)) +#loc140 = loc("tmp13"(#loc67)) +#loc141 = loc("tmp13"(#loc68)) +#loc142 = loc("tmp13"(#loc69)) +#loc143 = loc("tmp13"(#loc70)) +#loc144 = loc("tmp23"(#loc71)) +#loc145 = loc("tmp23"(#loc72)) +#loc146 = loc("tmp27"(#loc73)) +#loc147 = loc("tmp27"(#loc74)) +#loc148 = loc("tmp25"(#loc75)) +#loc149 = loc("tmp26"(#loc76)) +#loc150 = loc("tmp28"(#loc77)) +#loc151 = loc(fused[#loc93, #loc92]) +#loc152 = loc(fused[#loc95, #loc90]) +#loc153 = loc("tmp7_m2"(#loc98)) +#loc154 = loc("new_m2"(#loc110)) +#loc155 = loc(callsite(#loc111 at #loc25)) +#loc156 = loc("new_weight"(#loc112)) +#loc157 = loc(callsite(#loc113 at #loc25)) +#loc158 = loc("new_mean"(#loc114)) +#loc159 = loc(callsite(#loc115 at #loc25)) +#loc160 = loc(callsite(#loc116 at #loc25)) +#loc161 = loc("new_m2"(#loc117)) +#loc162 = loc(callsite(#loc117 at #loc25)) +#loc163 = loc(callsite(#loc123 at #loc121)) +#loc164 = loc(callsite(#loc124 at #loc121)) +#loc165 = loc(callsite(#loc125 at #loc121)) +#loc166 = loc(callsite(#loc126 at #loc121)) +#loc167 = loc(callsite(#loc127 at #loc121)) +#loc168 = loc(callsite(#loc46 at #loc121)) +#loc169 = loc(callsite(#loc47 at #loc121)) +#loc170 = loc(callsite(#loc48 at #loc121)) +#loc171 = loc(callsite(#loc49 at #loc121)) +#loc172 = loc(callsite(#loc50 at #loc121)) +#loc173 = loc(callsite(#loc51 at #loc121)) +#loc174 = loc(callsite(#loc52 at #loc121)) +#loc175 = loc("tmp7_weight"(#loc153)) +#loc176 = loc(callsite(#loc154 at #loc25)) +#loc177 = loc(callsite(#loc156 at #loc25)) +#loc178 = loc(callsite(#loc158 at #loc25)) +#loc179 = loc(callsite(#loc161 at #loc25)) diff --git a/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.ttir b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..ff3805d522fe42e1a65b3b1a2d7892c8e94599f4 --- /dev/null +++ b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.ttir @@ -0,0 +1,305 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":18:0) +#loc2 = loc(unknown) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":52:80) +#loc82 = loc("in_ptr0"(#loc)) +#loc83 = loc("in_ptr1"(#loc)) +#loc84 = loc("in_ptr2"(#loc)) +#loc85 = loc("in_ptr3"(#loc)) +#loc86 = loc("in_ptr4"(#loc)) +#loc87 = loc("out_ptr0"(#loc)) +#loc88 = loc("out_ptr3"(#loc)) +#loc89 = loc("xnumel"(#loc)) +#loc90 = loc("r0_numel"(#loc)) +#loc92 = loc(callsite(#loc2 at #loc3)) +module { + tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xmask = arith.constant 256 : i32 loc(#loc91) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc92) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc2) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16> loc(#loc2) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc2) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc2) + %cst_2 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc2) + %cst_3 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc2) + %cst_4 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc2) + %cst_5 = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %xoffset = tt.get_program_id x : i32 loc(#loc93) + %xmask_6 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc91) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc94) + %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc95) + %tmp7_weight:3 = scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32 iter_args(%tmp7_mean = %cst_0, %tmp7_m2 = %cst_0, %tmp7_weight_8 = %cst_0) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc97) + %r0_index_9 = arith.addi %r0_index, %r0_base_7 : tensor<1x2048xi32> loc(#loc97) + %r0_mask = arith.cmpi slt, %r0_index_9, %cst_5 : tensor<1x2048xi32> loc(#loc98) + %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc99) + %tmp0_10 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32> loc(#loc156) + %tmp0_11 = arith.addi %r0_index_9, %tmp0_10 : tensor<1x2048xi32> loc(#loc100) + %tmp0_12 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc101) + %tmp0_13 = tt.addptr %tmp0_12, %tmp0_11 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc101) + %tmp0_14 = tt.splat %xmask_6 : i1 -> tensor<1x2048xi1> loc(#loc157) + %tmp0_15 = arith.andi %r0_mask, %tmp0_14 : tensor<1x2048xi1> loc(#loc102) + %tmp0_16 = tt.load %tmp0_13, %tmp0_15, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc103) + %tmp0_17 = arith.extf %tmp0_16 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc104) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc105) + %tmp1_18 = tt.addptr %tmp1, %r0_index_9 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc105) + %tmp1_19 = tt.load %tmp1_18, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc106) + %tmp1_20 = arith.extf %tmp1_19 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc107) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc108) + %tmp2_21 = tt.addptr %tmp2, %tmp0_11 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc108) + %tmp2_22 = tt.load %tmp2_21, %tmp0_15, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc109) + %tmp2_23 = arith.extf %tmp2_22 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc110) + %tmp3 = arith.mulf %tmp1_20, %tmp2_23 : tensor<1x2048xf32> loc(#loc111) + %tmp4 = arith.addf %tmp0_17, %tmp3 : tensor<1x2048xf32> loc(#loc112) + %1 = arith.cmpi eq, %r0_offset, %c0_i32 : i32 loc(#loc24) + %2:3 = scf.if %1 -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) { + scf.yield %cst_0, %tmp4, %cst_2 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc182) + } else { + %delta = arith.subf %tmp4, %tmp7_mean : tensor<1x2048xf32> loc(#loc159) + %new_weight = arith.addf %tmp7_weight_8, %cst_2 : tensor<1x2048xf32> loc(#loc183) + %new_mean = arith.divf %delta, %new_weight : tensor<1x2048xf32> loc(#loc161) + %new_mean_27 = arith.addf %tmp7_mean, %new_mean : tensor<1x2048xf32> loc(#loc184) + %new_m2 = arith.subf %tmp4, %new_mean_27 : tensor<1x2048xf32> loc(#loc163) + %new_m2_28 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32> loc(#loc164) + %new_m2_29 = arith.addf %tmp7_m2, %new_m2_28 : tensor<1x2048xf32> loc(#loc185) + scf.yield %new_m2_29, %new_mean_27, %new_weight : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc166) + } loc(#loc113) + %tmp7_mean_24 = arith.select %tmp0_15, %2#1, %tmp7_mean : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc122) + %tmp7_m2_25 = arith.select %tmp0_15, %2#0, %tmp7_m2 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc123) + %tmp7_weight_26 = arith.select %tmp0_15, %2#2, %tmp7_weight_8 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc124) + %3 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc38) + %4 = tt.addptr %3, %tmp0_11 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc38) + %5 = arith.truncf %tmp4 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc39) + tt.store %4, %5, %tmp0_15 : tensor<1x2048x!tt.ptr> loc(#loc39) + scf.yield %tmp7_mean_24, %tmp7_m2_25, %tmp7_weight_26 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc40) + } loc(#loc181) + %0:3 = "tt.reduce"(%tmp7_weight#0, %tmp7_weight#1, %tmp7_weight#2) <{axis = 1 : i32}> ({ + ^bb0(%arg9: f32 loc(callsite(#loc2 at #loc3)), %arg10: f32 loc(callsite(#loc2 at #loc3)), %arg11: f32 loc(callsite(#loc2 at #loc3)), %arg12: f32 loc(callsite(#loc2 at #loc3)), %arg13: f32 loc(callsite(#loc2 at #loc3)), %arg14: f32 loc(callsite(#loc2 at #loc3))): + %delta = arith.subf %arg12, %arg9 : f32 loc(#loc167) + %new_weight = arith.addf %arg11, %arg14 : f32 loc(#loc168) + %w2_over_w = arith.cmpf oeq, %new_weight, %cst : f32 loc(#loc169) + %w2_over_w_8 = arith.divf %arg14, %new_weight : f32 loc(#loc170) + %w2_over_w_9 = arith.select %w2_over_w, %cst, %w2_over_w_8 : f32 loc(#loc171) + %1 = arith.mulf %delta, %w2_over_w_9 : f32 loc(#loc172) + %2 = arith.addf %arg9, %1 : f32 loc(#loc173) + %3 = arith.addf %arg10, %arg13 : f32 loc(#loc174) + %4 = arith.mulf %delta, %delta : f32 loc(#loc175) + %5 = arith.mulf %4, %arg11 : f32 loc(#loc176) + %6 = arith.mulf %5, %w2_over_w_9 : f32 loc(#loc177) + %7 = arith.addf %3, %6 : f32 loc(#loc178) + tt.reduce.return %2, %7, %new_weight : f32, f32, f32 loc(#loc125) + }) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc125) + %tmp7 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc131) + %tmp11 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc132) + scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc133) + %r0_index_8 = arith.addi %r0_index, %r0_base_7 : tensor<1x2048xi32> loc(#loc133) + %r0_mask = arith.cmpi slt, %r0_index_8, %cst_5 : tensor<1x2048xi32> loc(#loc134) + %tmp13 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc135) + %tmp13_9 = tt.splat %tmp13 : i32 -> tensor<1x2048xi32> loc(#loc179) + %tmp13_10 = arith.addi %r0_index_8, %tmp13_9 : tensor<1x2048xi32> loc(#loc136) + %tmp13_11 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc137) + %tmp13_12 = tt.addptr %tmp13_11, %tmp13_10 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc137) + %tmp13_13 = tt.splat %xmask_6 : i1 -> tensor<1x2048xi1> loc(#loc180) + %tmp13_14 = arith.andi %r0_mask, %tmp13_13 : tensor<1x2048xi1> loc(#loc138) + %tmp13_15 = tt.load %tmp13_12, %tmp13_14, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc139) + %tmp13_16 = arith.extf %tmp13_15 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc140) + %tmp23 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc141) + %tmp23_17 = tt.addptr %tmp23, %r0_index_8 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc141) + %tmp23_18 = tt.load %tmp23_17, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc142) + %tmp23_19 = arith.extf %tmp23_18 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc143) + %tmp27 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc144) + %tmp27_20 = tt.addptr %tmp27, %r0_index_8 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc144) + %tmp27_21 = tt.load %tmp27_20, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc145) + %tmp27_22 = arith.extf %tmp27_21 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc146) + %tmp15 = tt.broadcast %tmp7 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc147) + %tmp15_23 = arith.subf %tmp13_16, %tmp15 : tensor<1x2048xf32> loc(#loc147) + %tmp17 = arith.divf %tmp11, %cst_4 : tensor<1x1xf32> loc(#loc148) + %tmp19 = arith.addf %tmp17, %cst_3 : tensor<1x1xf32> loc(#loc149) + %tmp20 = tt.extern_elementwise %tmp19 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc150) + %tmp21 = tt.broadcast %tmp20 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc151) + %tmp21_24 = arith.mulf %tmp15_23, %tmp21 : tensor<1x2048xf32> loc(#loc151) + %tmp25 = arith.addf %tmp23_19, %cst_2 : tensor<1x2048xf32> loc(#loc152) + %tmp26 = arith.mulf %tmp21_24, %tmp25 : tensor<1x2048xf32> loc(#loc153) + %tmp28 = arith.addf %tmp26, %tmp27_22 : tensor<1x2048xf32> loc(#loc154) + %1 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc79) + %2 = tt.addptr %1, %tmp13_10 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc79) + %3 = arith.truncf %tmp28 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc80) + tt.store %2, %3, %tmp13_14 : tensor<1x2048x!tt.ptr> loc(#loc80) + } loc(#loc56) + tt.return loc(#loc81) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":25:21) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":23:28) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":26:27) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":32:43) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":33:31) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":34:29) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:46) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:41) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:34) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:61) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:51) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:113) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:34) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:41) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:94) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:34) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:51) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:113) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":41:22) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":42:22) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":46:62) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":46:51) +#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31) +#loc28 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24) +#loc29 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30) +#loc30 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34) +#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26) +#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":48:62) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":49:58) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":50:66) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:29) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:52) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:8) +#loc41 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc42 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc43 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc44 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc45 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc46 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc47 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc48 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc49 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc50 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc51 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc52 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc53 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":53:16) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":54:17) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":56:43) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":57:31) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":58:29) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:48) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:43) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:36) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:63) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:53) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:115) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:35) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:42) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:95) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:35) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:42) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:95) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":66:24) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":68:25) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":70:24) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":71:32) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":72:24) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":75:24) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":76:24) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":77:24) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:29) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:53) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":56:4) +#loc91 = loc("xmask"(#loc1)) +#loc93 = loc("xoffset"(#loc4)) +#loc94 = loc("r0_base"(#loc5)) +#loc95 = loc("r0_base"(#loc6)) +#loc96 = loc("tmp7_mean"(#loc7)) +#loc97 = loc("r0_index"(#loc8)) +#loc98 = loc("r0_mask"(#loc9)) +#loc99 = loc("tmp0"(#loc10)) +#loc100 = loc("tmp0"(#loc11)) +#loc101 = loc("tmp0"(#loc12)) +#loc102 = loc("tmp0"(#loc13)) +#loc103 = loc("tmp0"(#loc14)) +#loc104 = loc("tmp0"(#loc15)) +#loc105 = loc("tmp1"(#loc16)) +#loc106 = loc("tmp1"(#loc17)) +#loc107 = loc("tmp1"(#loc18)) +#loc108 = loc("tmp2"(#loc19)) +#loc109 = loc("tmp2"(#loc20)) +#loc110 = loc("tmp2"(#loc21)) +#loc111 = loc("tmp3"(#loc22)) +#loc112 = loc("tmp4"(#loc23)) +#loc113 = loc(callsite(#loc25 at #loc26)) +#loc114 = loc("new_m2"(#loc27)) +#loc115 = loc("delta"(#loc28)) +#loc116 = loc("new_weight"(#loc29)) +#loc117 = loc("new_mean"(#loc30)) +#loc118 = loc("new_mean"(#loc31)) +#loc119 = loc("new_m2"(#loc32)) +#loc120 = loc("new_m2"(#loc33)) +#loc121 = loc("new_m2"(#loc34)) +#loc122 = loc("tmp7_mean"(#loc35)) +#loc123 = loc("tmp7_m2"(#loc36)) +#loc124 = loc("tmp7_weight"(#loc37)) +#loc125 = loc(callsite(#loc41 at #loc3)) +#loc126 = loc("delta"(#loc42)) +#loc127 = loc("new_weight"(#loc43)) +#loc128 = loc("w2_over_w"(#loc44)) +#loc129 = loc("w2_over_w"(#loc45)) +#loc130 = loc("w2_over_w"(#loc46)) +#loc131 = loc("tmp7"(#loc54)) +#loc132 = loc("tmp11"(#loc55)) +#loc133 = loc("r0_index"(#loc57)) +#loc134 = loc("r0_mask"(#loc58)) +#loc135 = loc("tmp13"(#loc59)) +#loc136 = loc("tmp13"(#loc60)) +#loc137 = loc("tmp13"(#loc61)) +#loc138 = loc("tmp13"(#loc62)) +#loc139 = loc("tmp13"(#loc63)) +#loc140 = loc("tmp13"(#loc64)) +#loc141 = loc("tmp23"(#loc65)) +#loc142 = loc("tmp23"(#loc66)) +#loc143 = loc("tmp23"(#loc67)) +#loc144 = loc("tmp27"(#loc68)) +#loc145 = loc("tmp27"(#loc69)) +#loc146 = loc("tmp27"(#loc70)) +#loc147 = loc("tmp15"(#loc71)) +#loc148 = loc("tmp17"(#loc72)) +#loc149 = loc("tmp19"(#loc73)) +#loc150 = loc("tmp20"(#loc74)) +#loc151 = loc("tmp21"(#loc75)) +#loc152 = loc("tmp25"(#loc76)) +#loc153 = loc("tmp26"(#loc77)) +#loc154 = loc("tmp28"(#loc78)) +#loc155 = loc("tmp7_m2"(#loc96)) +#loc156 = loc(fused[#loc100, #loc99]) +#loc157 = loc(fused[#loc102, #loc91]) +#loc158 = loc("new_m2"(#loc114)) +#loc159 = loc(callsite(#loc115 at #loc26)) +#loc160 = loc("new_weight"(#loc116)) +#loc161 = loc(callsite(#loc117 at #loc26)) +#loc162 = loc("new_mean"(#loc118)) +#loc163 = loc(callsite(#loc119 at #loc26)) +#loc164 = loc(callsite(#loc120 at #loc26)) +#loc165 = loc("new_m2"(#loc121)) +#loc166 = loc(callsite(#loc121 at #loc26)) +#loc167 = loc(callsite(#loc126 at #loc125)) +#loc168 = loc(callsite(#loc127 at #loc125)) +#loc169 = loc(callsite(#loc128 at #loc125)) +#loc170 = loc(callsite(#loc129 at #loc125)) +#loc171 = loc(callsite(#loc130 at #loc125)) +#loc172 = loc(callsite(#loc47 at #loc125)) +#loc173 = loc(callsite(#loc48 at #loc125)) +#loc174 = loc(callsite(#loc49 at #loc125)) +#loc175 = loc(callsite(#loc50 at #loc125)) +#loc176 = loc(callsite(#loc51 at #loc125)) +#loc177 = loc(callsite(#loc52 at #loc125)) +#loc178 = loc(callsite(#loc53 at #loc125)) +#loc179 = loc(fused[#loc136, #loc135]) +#loc180 = loc(fused[#loc138, #loc91]) +#loc181 = loc("tmp7_weight"(#loc155)) +#loc182 = loc(callsite(#loc158 at #loc26)) +#loc183 = loc(callsite(#loc160 at #loc26)) +#loc184 = loc(callsite(#loc162 at #loc26)) +#loc185 = loc(callsite(#loc165 at #loc26)) diff --git a/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/__grp__triton_red_fused__fused_rms_norm_view_0.json b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/__grp__triton_red_fused__fused_rms_norm_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..932ca86f4455410e14f569df43cc5bbd0861c738 --- /dev/null +++ b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/__grp__triton_red_fused__fused_rms_norm_view_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__fused_rms_norm_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.source", "triton_red_fused__fused_rms_norm_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.ttir", "triton_red_fused__fused_rms_norm_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.ttgir", "triton_red_fused__fused_rms_norm_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.llir", "triton_red_fused__fused_rms_norm_view_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.ptx", "triton_red_fused__fused_rms_norm_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.cubin", "triton_red_fused__fused_rms_norm_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.json"}} \ No newline at end of file diff --git a/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.cubin b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..0802a2b59468ab17484cdbfbc8a186ebc2bb489c Binary files /dev/null and b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.cubin differ diff --git a/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.json b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..01f2a19d02cbe3d66a037b1facd055735996ac17 --- /dev/null +++ b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.json @@ -0,0 +1 @@ +{"hash": "422567694a01727b4809fa4d834df960bfcb1e09ff49176781ab3976a4849bfb", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 8, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm_view_0"} \ No newline at end of file diff --git a/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.llir b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..f1910083599ab90fb7aa12f23b7d3438c792e868 --- /dev/null +++ b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.llir @@ -0,0 +1,136 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__fused_rms_norm_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = shl nuw i32 %7, 1, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = and i32 %9, 32, !dbg !9 + %.lobit = lshr exact i32 %10, 5, !dbg !9 + %11 = and i32 %9, 1, !dbg !9 + %12 = or disjoint i32 %.lobit, %8, !dbg !10 + %13 = or disjoint i32 %8, %11, !dbg !10 + %14 = shl nuw nsw i32 %9, 2, !dbg !11 + %15 = and i32 %14, 124, !dbg !11 + %16 = sdiv i32 %12, 32, !dbg !12 + %17 = mul i32 %16, 32, !dbg !13 + %.decomposed = sub i32 %12, %17, !dbg !13 + %18 = shl nsw i32 %.decomposed, 7, !dbg !14 + %19 = or disjoint i32 %18, %15, !dbg !15 + %20 = mul i32 %16, 12288, !dbg !16 + %21 = add i32 %19, %20, !dbg !17 + %22 = sext i32 %21 to i64, !dbg !18 + %23 = getelementptr bfloat, ptr addrspace(1) %0, i64 %22, !dbg !18 + %24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %25 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %23, i64 %24, i1 true) #4, !dbg !19 + %26 = extractvalue { i32, i32 } %25, 0, !dbg !19 + %27 = bitcast i32 %26 to <2 x bfloat>, !dbg !19 + %28 = extractvalue { i32, i32 } %25, 1, !dbg !19 + %29 = bitcast i32 %28 to <2 x bfloat>, !dbg !19 + %30 = extractelement <2 x bfloat> %27, i64 0, !dbg !19 + %31 = extractelement <2 x bfloat> %27, i64 1, !dbg !19 + %32 = extractelement <2 x bfloat> %29, i64 0, !dbg !19 + %33 = extractelement <2 x bfloat> %29, i64 1, !dbg !19 + %34 = fpext bfloat %30 to float, !dbg !20 + %35 = fpext bfloat %31 to float, !dbg !20 + %36 = fpext bfloat %32 to float, !dbg !20 + %37 = fpext bfloat %33 to float, !dbg !20 + %38 = fmul float %34, %34, !dbg !21 + %39 = fmul float %35, %35, !dbg !21 + %40 = fmul float %36, %36, !dbg !21 + %41 = fmul float %37, %37, !dbg !21 + %42 = fadd float %38, %39, !dbg !22 + %43 = fadd float %40, %42, !dbg !22 + %44 = fadd float %41, %43, !dbg !22 + %45 = bitcast float %44 to i32, !dbg !25 + %46 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %45, i32 16, i32 31), !dbg !25 + %47 = bitcast i32 %46 to float, !dbg !25 + %48 = fadd float %44, %47, !dbg !22 + %49 = bitcast float %48 to i32, !dbg !25 + %50 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %49, i32 8, i32 31), !dbg !25 + %51 = bitcast i32 %50 to float, !dbg !25 + %52 = fadd float %48, %51, !dbg !22 + %53 = bitcast float %52 to i32, !dbg !25 + %54 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %53, i32 4, i32 31), !dbg !25 + %55 = bitcast i32 %54 to float, !dbg !25 + %56 = fadd float %52, %55, !dbg !22 + %57 = bitcast float %56 to i32, !dbg !25 + %58 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %57, i32 2, i32 31), !dbg !25 + %59 = bitcast i32 %58 to float, !dbg !25 + %60 = fadd float %56, %59, !dbg !22 + %61 = bitcast float %60 to i32, !dbg !25 + %62 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %61, i32 1, i32 31), !dbg !25 + %63 = bitcast i32 %62 to float, !dbg !25 + %64 = fadd float %60, %63, !dbg !22 + %65 = lshr exact i32 %10, 3, !dbg !28 + %66 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %65, !dbg !28 + store float %64, ptr addrspace(3) %66, align 4, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %67 = shl nuw nsw i32 %11, 2, !dbg !28 + %68 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %67, !dbg !28 + %69 = load i32, ptr addrspace(3) %68, align 4, !dbg !28 + %70 = sext i32 %13 to i64, !dbg !29 + %71 = getelementptr float, ptr addrspace(1) %1, i64 %70, !dbg !29 + %72 = and i32 %9, 62, !dbg !30 + %73 = icmp eq i32 %72, 0, !dbg !30 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %69, ptr addrspace(1) %71, i1 %73) #4, !dbg !30 + ret void, !dbg !31 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="64" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm_view_0", linkageName: "triton_red_fused__fused_rms_norm_view_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 19, scope: !4) +!13 = !DILocation(line: 28, column: 19, scope: !4) +!14 = !DILocation(line: 38, column: 45, scope: !4) +!15 = !DILocation(line: 38, column: 41, scope: !4) +!16 = !DILocation(line: 38, column: 56, scope: !4) +!17 = !DILocation(line: 38, column: 50, scope: !4) +!18 = !DILocation(line: 38, column: 34, scope: !4) +!19 = !DILocation(line: 38, column: 61, scope: !4) +!20 = !DILocation(line: 38, column: 115, scope: !4) +!21 = !DILocation(line: 40, column: 22, scope: !4) +!22 = !DILocation(line: 263, column: 15, scope: !23, inlinedAt: !25) +!23 = distinct !DILexicalBlockFile(scope: !4, file: !24, discriminator: 0) +!24 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!25 = !DILocation(line: 293, column: 36, scope: !23, inlinedAt: !26) +!26 = !DILocation(line: 44, column: 25, scope: !27) +!27 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0) +!28 = !DILocation(line: 44, column: 28, scope: !4) +!29 = !DILocation(line: 45, column: 25, scope: !4) +!30 = !DILocation(line: 45, column: 36, scope: !4) +!31 = !DILocation(line: 45, column: 4, scope: !4) diff --git a/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.ptx b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..f0f59859a87382a24f528ca0c118a3d521f76b7c --- /dev/null +++ b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.ptx @@ -0,0 +1,506 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused__fused_rms_norm_view_0 // -- Begin function triton_red_fused__fused_rms_norm_view_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__fused_rms_norm_view_0 +.visible .entry triton_red_fused__fused_rms_norm_view_0( + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_1, + .param .u32 triton_red_fused__fused_rms_norm_view_0_param_2, + .param .u32 triton_red_fused__fused_rms_norm_view_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_5 +) +.reqntid 64 +{ + .reg .pred %p<3>; + .reg .b16 %rs<5>; + .reg .b32 %r<48>; + .reg .b64 %rd<6>; + .loc 1 18 0 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:18:0 + +// %bb.0: + ld.param.b64 %rd4, [triton_red_fused__fused_rms_norm_view_0_param_0]; + ld.param.b64 %rd5, [triton_red_fused__fused_rms_norm_view_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:23:28 + mov.u32 %r5, %ctaid.x; + .loc 1 23 33 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:23:33 + shl.b32 %r6, %r5, 1; + .loc 1 24 44 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:44 + mov.u32 %r7, %tid.x; + and.b32 %r8, %r7, 32; + bfe.u32 %r9, %r7, 5, 1; + and.b32 %r10, %r7, 1; + .loc 1 24 23 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:23 + or.b32 %r11, %r9, %r6; + or.b32 %r12, %r6, %r10; + .loc 1 26 37 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:26:37 + shl.b32 %r13, %r7, 2; + and.b32 %r14, %r13, 124; + .loc 1 29 19 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:29:19 + bfe.s32 %r15, %r5, 30, 1; + shr.u32 %r16, %r15, 27; + add.s32 %r17, %r11, %r16; + shr.u32 %r18, %r17, 5; + .loc 1 28 19 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:28:19 + and.b32 %r19, %r17, 33554400; + sub.s32 %r20, %r11, %r19; + .loc 1 38 45 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:45 + shl.b32 %r21, %r20, 7; + .loc 1 38 41 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:41 + or.b32 %r22, %r21, %r14; + .loc 1 38 50 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:50 + mad.lo.s32 %r23, %r18, 12288, %r22; + .loc 1 38 34 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:34 + mad.wide.s32 %rd1, %r23, 2, %rd4; + .loc 1 38 61 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:61 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r3, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r1, %r3; + mov.u32 %r2, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2; + // end inline asm + mov.b32 {%rs1, %rs2}, %r1; + mov.b32 {%rs3, %rs4}, %r2; + .loc 1 38 115 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115 + cvt.f32.bf16 %r24, %rs1; + cvt.f32.bf16 %r25, %rs2; + cvt.f32.bf16 %r26, %rs3; + cvt.f32.bf16 %r27, %rs4; + .loc 1 40 22 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:40:22 + mul.f32 %r28, %r25, %r25; +$L__tmp1: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + fma.rn.f32 %r29, %r24, %r24, %r28; + fma.rn.f32 %r30, %r26, %r26, %r29; + fma.rn.f32 %r31, %r27, %r27, %r30; +$L__tmp2: + .loc 2 293 36 // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] + shfl.sync.bfly.b32 %r32, %r31, 16, 31, -1; +$L__tmp3: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + add.f32 %r33, %r31, %r32; +$L__tmp4: + .loc 2 293 36 // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] + shfl.sync.bfly.b32 %r34, %r33, 8, 31, -1; +$L__tmp5: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + add.f32 %r35, %r33, %r34; +$L__tmp6: + .loc 2 293 36 // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] + shfl.sync.bfly.b32 %r36, %r35, 4, 31, -1; +$L__tmp7: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + add.f32 %r37, %r35, %r36; +$L__tmp8: + .loc 2 293 36 // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] + shfl.sync.bfly.b32 %r38, %r37, 2, 31, -1; +$L__tmp9: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + add.f32 %r39, %r37, %r38; +$L__tmp10: + .loc 2 293 36 // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] + shfl.sync.bfly.b32 %r40, %r39, 1, 31, -1; +$L__tmp11: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ] + add.f32 %r41, %r39, %r40; +$L__tmp12: + .loc 1 44 28 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:28 + shr.u32 %r42, %r8, 3; + mov.b32 %r43, global_smem; + add.s32 %r44, %r43, %r42; + st.shared.b32 [%r44], %r41; + bar.sync 0; + shl.b32 %r45, %r10, 2; + add.s32 %r46, %r43, %r45; + ld.shared.b32 %r4, [%r46]; + .loc 1 45 25 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:25 + mad.wide.s32 %rd3, %r12, 4, %rd5; + .loc 1 45 36 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:36 + and.b32 %r47, %r7, 62; + setp.eq.b32 %p2, %r47, 0; + // begin inline asm + @%p2 st.global.b32 [ %rd3 + 0 ], { %r4 }; + // end inline asm + .loc 1 45 4 // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:4 + ret; +$L__tmp13: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 339 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x14c DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 119 +.b8 118 +.b8 121 +.b8 116 +.b8 52 +.b8 50 +.b8 55 +.b8 51 +.b8 105 +.b8 117 +.b8 51 +.b8 51 +.b8 109 +.b8 112 +.b8 101 +.b8 101 +.b8 55 +.b8 104 +.b8 98 +.b8 101 +.b8 116 +.b8 53 +.b8 106 +.b8 53 +.b8 101 +.b8 113 +.b8 52 +.b8 52 +.b8 100 +.b8 54 +.b8 102 +.b8 115 +.b8 104 +.b8 103 +.b8 119 +.b8 107 +.b8 121 +.b8 120 +.b8 107 +.b8 110 +.b8 53 +.b8 50 +.b8 103 +.b8 103 +.b8 103 +.b8 107 +.b8 105 +.b8 113 +.b8 104 +.b8 106 +.b8 53 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 119 +.b8 118 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x2a DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 114 +.b8 109 +.b8 115 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x10e:0x48 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x123:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 44 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x13b:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.source b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.source new file mode 100644 index 0000000000000000000000000000000000000000..74292b61b4e3768b12b3e3f2202229378c7a7bde --- /dev/null +++ b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.source @@ -0,0 +1,167 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0) +#loc35 = loc(unknown) +#loc38 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0) +#loc42 = loc("in_ptr0"(#loc)) +#loc43 = loc("out_ptr0"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc74 = loc("input"(#loc33)) +#loc75 = loc("a"(#loc38)) +#loc76 = loc("b"(#loc38)) +module { + tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 8192 : i32 loc(#loc46) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc47) + %xoffset = tt.get_program_id x : i32 loc(#loc48) + %xoffset_2 = arith.constant 2 : i32 loc(#loc49) + %xoffset_3 = arith.constant 2 : i32 loc(#loc49) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc49) + %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc50) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32> -> tensor<2x1xi32> loc(#loc51) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<2x1xi32> loc(#loc52) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<2x1xi32> loc(#loc52) + %xmask = arith.constant true loc(#loc53) + %xmask_8 = arith.constant dense : tensor<2x128xi1> loc(#loc53) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc54) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc55) + %x0 = arith.constant 32 : i32 loc(#loc56) + %x0_10 = arith.constant 32 : i32 loc(#loc56) + %x0_11 = arith.constant dense<32> : tensor<2x1xi32> loc(#loc56) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<2x1xi32> loc(#loc56) + %x1 = arith.constant 32 : i32 loc(#loc57) + %x1_13 = arith.constant 32 : i32 loc(#loc57) + %x1_14 = arith.constant dense<32> : tensor<2x1xi32> loc(#loc57) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<2x1xi32> loc(#loc57) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc58) + %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc58) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c128_i32 = arith.constant 128 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_19 = %_tmp4_16) -> (tensor<2x128xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc60) + %r0_index_20 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc60) + %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc61) + %r0_mask_21 = arith.cmpi slt, %r0_index_20, %r0_mask : tensor<1x128xi32> loc(#loc61) + %tmp0 = arith.constant 128 : i32 loc(#loc62) + %tmp0_22 = arith.constant 128 : i32 loc(#loc62) + %tmp0_23 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc62) + %tmp0_24 = arith.muli %tmp0_23, %x0_12 : tensor<2x1xi32> loc(#loc62) + %tmp0_25 = tt.broadcast %r0_index_20 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc63) + %tmp0_26 = tt.broadcast %tmp0_24 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc63) + %tmp0_27 = arith.addi %tmp0_25, %tmp0_26 : tensor<2x128xi32> loc(#loc63) + %tmp0_28 = arith.constant 12288 : i32 loc(#loc64) + %tmp0_29 = arith.constant 12288 : i32 loc(#loc64) + %tmp0_30 = arith.constant dense<12288> : tensor<2x1xi32> loc(#loc64) + %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<2x1xi32> loc(#loc64) + %tmp0_32 = tt.broadcast %tmp0_31 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc65) + %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<2x128xi32> loc(#loc65) + %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x128x!tt.ptr> loc(#loc66) + %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc66) + %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc67) + %tmp0_37 = tt.broadcast %r0_mask_21 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc67) + %tmp0_38 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc67) + %tmp0_39 = arith.truncf %tmp0_38 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc67) + %tmp0_40 = tt.load %tmp0_35, %tmp0_37, %tmp0_39 evictionPolicy = evict_first : tensor<2x128x!tt.ptr> loc(#loc67) + %tmp0_41 = arith.extf %tmp0_40 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc68) + %tmp2 = arith.mulf %tmp0_41, %tmp0_41 : tensor<2x128xf32> loc(#loc69) + %tmp5 = arith.addf %_tmp4_19, %tmp2 : tensor<2x128xf32> loc(#loc70) + %_tmp4_42 = tt.broadcast %r0_mask_21 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc71) + %_tmp4_43 = arith.select %_tmp4_42, %tmp5, %_tmp4_19 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc71) + scf.yield %_tmp4_43 : tensor<2x128xf32> loc(#loc27) + } loc(#loc59) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S2_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<2x128xf32>) -> tensor<2xf32> loc(#loc72) + %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<2xf32> -> tensor<2x1xf32> loc(#loc73) + %4 = tt.splat %out_ptr0 : !tt.ptr -> tensor<2x1x!tt.ptr> loc(#loc30) + %5 = tt.addptr %4, %xindex_7 : tensor<2x1x!tt.ptr>, tensor<2x1xi32> loc(#loc30) + tt.store %5, %tmp4_18 : tensor<2x1x!tt.ptr> loc(#loc31) + tt.return loc(#loc32) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S2_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<2x128xf32> loc("input"(#loc33))) -> tensor<2xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc34) + tt.reduce.return %2 : f32 loc(#loc34) + }) : (tensor<2x128xf32>) -> tensor<2xf32> loc(#loc34) + tt.return %0 : tensor<2xf32> loc(#loc36) + ^bb1: // no predecessors + %1 = ub.poison : tensor<2xf32> loc(#loc37) + tt.return %1 : tensor<2xf32> loc(#loc37) + } loc(#loc33) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc38)), %b: f32 loc("b"(#loc38))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc39) + tt.return %0 : f32 loc(#loc40) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc41) + tt.return %1 : f32 loc(#loc41) + } loc(#loc38) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":25:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":30:43) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:8) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11) +#loc37 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4) +#loc39 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc40 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11) +#loc41 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4) +#loc46 = loc("xnumel"(#loc1)) +#loc47 = loc("r0_numel"(#loc2)) +#loc48 = loc("xoffset"(#loc3)) +#loc49 = loc("xoffset"(#loc4)) +#loc50 = loc("xindex"(#loc5)) +#loc51 = loc("xindex"(#loc6)) +#loc52 = loc("xindex"(#loc7)) +#loc53 = loc("xmask"(#loc8)) +#loc54 = loc("r0_base"(#loc9)) +#loc55 = loc("r0_base"(#loc10)) +#loc56 = loc("x0"(#loc11)) +#loc57 = loc("x1"(#loc12)) +#loc58 = loc("_tmp4"(#loc13)) +#loc59 = loc("_tmp4"(#loc14)) +#loc60 = loc("r0_index"(#loc15)) +#loc61 = loc("r0_mask"(#loc16)) +#loc62 = loc("tmp0"(#loc17)) +#loc63 = loc("tmp0"(#loc18)) +#loc64 = loc("tmp0"(#loc19)) +#loc65 = loc("tmp0"(#loc20)) +#loc66 = loc("tmp0"(#loc21)) +#loc67 = loc("tmp0"(#loc22)) +#loc68 = loc("tmp0"(#loc23)) +#loc69 = loc("tmp2"(#loc24)) +#loc70 = loc("tmp5"(#loc25)) +#loc71 = loc("_tmp4"(#loc26)) +#loc72 = loc("tmp4"(#loc28)) +#loc73 = loc("tmp4"(#loc29)) diff --git a/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.ttgir b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..96d664b0b0bc11ab1284ba1b2348b8a772f48057 --- /dev/null +++ b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.ttgir @@ -0,0 +1,108 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [1, 2], order = [0, 1]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0) +#loc1 = loc(unknown) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25) +#loc27 = loc("in_ptr0"(#loc)) +#loc28 = loc("out_ptr0"(#loc)) +#loc29 = loc("xnumel"(#loc)) +#loc30 = loc("r0_numel"(#loc)) +#loc49 = loc("tmp4"(#loc21)) +#loc52 = loc(callsite(#loc1 at #loc49)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<2x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<12288> : tensor<2x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<32> : tensor<2x1xi32, #blocked> loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<2x128xbf16, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<2x128xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc31) + %xoffset_5 = arith.muli %xoffset, %c2_i32 : i32 loc(#loc32) + %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc33) + %xindex_6 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc33) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<2x1xi32, #blocked> loc(#loc33) + %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<2x1xi32, #blocked1> loc(#loc33) + %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<2x1xi32, #blocked> loc(#loc34) + %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<2x1xi32, #blocked1> loc(#loc34) + %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<2x1xi32, #blocked> loc(#loc34) + %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<2x1xi32, #blocked1> loc(#loc34) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc35) + %r0_base_13 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc35) + %x0 = arith.remsi %xindex_11, %cst_2 : tensor<2x1xi32, #blocked> loc(#loc36) + %x1 = arith.divsi %xindex_11, %cst_2 : tensor<2x1xi32, #blocked> loc(#loc37) + %r0_mask = arith.cmpi slt, %r0_base_13, %cst : tensor<1x128xi32, #blocked> loc(#loc38) + %tmp0 = arith.muli %x0, %cst_0 : tensor<2x1xi32, #blocked> loc(#loc39) + %tmp0_14 = tt.broadcast %r0_base_13 : tensor<1x128xi32, #blocked> -> tensor<2x128xi32, #blocked> loc(#loc40) + %tmp0_15 = tt.broadcast %tmp0 : tensor<2x1xi32, #blocked> -> tensor<2x128xi32, #blocked> loc(#loc40) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<2x128xi32, #blocked> loc(#loc40) + %tmp0_17 = arith.muli %x1, %cst_1 : tensor<2x1xi32, #blocked> loc(#loc41) + %tmp0_18 = tt.broadcast %tmp0_17 : tensor<2x1xi32, #blocked> -> tensor<2x128xi32, #blocked> loc(#loc42) + %tmp0_19 = arith.addi %tmp0_16, %tmp0_18 : tensor<2x128xi32, #blocked> loc(#loc42) + %tmp0_20 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x128x!tt.ptr, #blocked> loc(#loc43) + %tmp0_21 = tt.addptr %tmp0_20, %tmp0_19 : tensor<2x128x!tt.ptr, #blocked>, tensor<2x128xi32, #blocked> loc(#loc43) + %tmp0_22 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked> -> tensor<2x128xi1, #blocked> loc(#loc44) + %tmp0_23 = tt.load %tmp0_21, %tmp0_22, %cst_3 evictionPolicy = evict_first : tensor<2x128x!tt.ptr, #blocked> loc(#loc44) + %tmp0_24 = arith.extf %tmp0_23 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked> loc(#loc45) + %tmp2 = arith.mulf %tmp0_24, %tmp0_24 : tensor<2x128xf32, #blocked> loc(#loc46) + %tmp5 = arith.addf %tmp2, %cst_4 : tensor<2x128xf32, #blocked> loc(#loc47) + %_tmp4 = arith.select %tmp0_22, %tmp5, %cst_4 : tensor<2x128xi1, #blocked>, tensor<2x128xf32, #blocked> loc(#loc48) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_27: f32 loc(callsite(#loc1 at #loc49)), %tmp4_28: f32 loc(callsite(#loc1 at #loc49))): + %tmp4_29 = arith.addf %tmp4_27, %tmp4_28 : f32 loc(#loc53) + tt.reduce.return %tmp4_29 : f32 loc(#loc51) + }) : (tensor<2x128xf32, #blocked>) -> tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc51) + %tmp4_25 = ttg.convert_layout %tmp4 : tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc50) + %tmp4_26 = tt.expand_dims %tmp4_25 {axis = 1 : i32} : tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<2x1xf32, #blocked1> loc(#loc50) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<2x1x!tt.ptr, #blocked1> loc(#loc24) + %1 = tt.addptr %0, %xindex_12 : tensor<2x1x!tt.ptr, #blocked1>, tensor<2x1xi32, #blocked1> loc(#loc24) + tt.store %1, %tmp4_26 : tensor<2x1x!tt.ptr, #blocked1> loc(#loc25) + tt.return loc(#loc26) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4) +#loc31 = loc("xoffset"(#loc2)) +#loc32 = loc("xoffset"(#loc3)) +#loc33 = loc("xindex"(#loc4)) +#loc34 = loc("xindex"(#loc5)) +#loc35 = loc("r0_base"(#loc6)) +#loc36 = loc("x0"(#loc7)) +#loc37 = loc("x1"(#loc8)) +#loc38 = loc("r0_mask"(#loc9)) +#loc39 = loc("tmp0"(#loc10)) +#loc40 = loc("tmp0"(#loc11)) +#loc41 = loc("tmp0"(#loc12)) +#loc42 = loc("tmp0"(#loc13)) +#loc43 = loc("tmp0"(#loc14)) +#loc44 = loc("tmp0"(#loc15)) +#loc45 = loc("tmp0"(#loc16)) +#loc46 = loc("tmp2"(#loc17)) +#loc47 = loc("tmp5"(#loc18)) +#loc48 = loc("_tmp4"(#loc19)) +#loc50 = loc("tmp4"(#loc23)) +#loc51 = loc(callsite(#loc20 at #loc49)) +#loc53 = loc(callsite(#loc22 at #loc51)) diff --git a/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.ttir b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..df05c30e88f2bca31a6d1a5cd7292038c85559b0 --- /dev/null +++ b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.ttir @@ -0,0 +1,105 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0) +#loc2 = loc(unknown) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25) +#loc29 = loc("in_ptr0"(#loc)) +#loc30 = loc("out_ptr0"(#loc)) +#loc31 = loc("xnumel"(#loc)) +#loc32 = loc("r0_numel"(#loc)) +#loc53 = loc("tmp4"(#loc23)) +#loc56 = loc(callsite(#loc2 at #loc53)) +module { + tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %tmp0 = arith.constant dense<0.000000e+00> : tensor<2x128xbf16> loc(#loc33) + %cst = arith.constant dense<12288> : tensor<2x1xi32> loc(#loc2) + %cst_0 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc2) + %cst_1 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc2) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc2) + %cst_3 = arith.constant dense<32> : tensor<2x1xi32> loc(#loc2) + %c2_i32 = arith.constant 2 : i32 loc(#loc2) + %xoffset = tt.get_program_id x : i32 loc(#loc34) + %xoffset_4 = arith.muli %xoffset, %c2_i32 : i32 loc(#loc35) + %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc36) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32> -> tensor<2x1xi32> loc(#loc37) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<2x1xi32> loc(#loc38) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<2x1xi32> loc(#loc38) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc39) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc40) + %x0 = arith.remsi %xindex_7, %cst_3 : tensor<2x1xi32> loc(#loc41) + %x1 = arith.divsi %xindex_7, %cst_3 : tensor<2x1xi32> loc(#loc42) + %r0_mask = arith.cmpi slt, %r0_base_8, %cst_1 : tensor<1x128xi32> loc(#loc43) + %tmp0_9 = arith.muli %x0, %cst_0 : tensor<2x1xi32> loc(#loc44) + %tmp0_10 = tt.broadcast %r0_base_8 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc45) + %tmp0_11 = tt.broadcast %tmp0_9 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc45) + %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<2x128xi32> loc(#loc45) + %tmp0_13 = arith.muli %x1, %cst : tensor<2x1xi32> loc(#loc46) + %tmp0_14 = tt.broadcast %tmp0_13 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc47) + %tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<2x128xi32> loc(#loc47) + %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x128x!tt.ptr> loc(#loc48) + %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<2x128x!tt.ptr>, tensor<2x128xi32> loc(#loc48) + %tmp0_18 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc33) + %tmp0_19 = tt.load %tmp0_17, %tmp0_18, %tmp0 evictionPolicy = evict_first : tensor<2x128x!tt.ptr> loc(#loc33) + %tmp0_20 = arith.extf %tmp0_19 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc49) + %tmp2 = arith.mulf %tmp0_20, %tmp0_20 : tensor<2x128xf32> loc(#loc50) + %tmp5 = arith.addf %tmp2, %cst_2 : tensor<2x128xf32> loc(#loc51) + %_tmp4 = arith.select %tmp0_18, %tmp5, %cst_2 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc52) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_22: f32 loc(callsite(#loc2 at #loc53)), %tmp4_23: f32 loc(callsite(#loc2 at #loc53))): + %tmp4_24 = arith.addf %tmp4_22, %tmp4_23 : f32 loc(#loc57) + tt.reduce.return %tmp4_24 : f32 loc(#loc55) + }) : (tensor<2x128xf32>) -> tensor<2xf32> loc(#loc55) + %tmp4_21 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<2xf32> -> tensor<2x1xf32> loc(#loc54) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<2x1x!tt.ptr> loc(#loc26) + %1 = tt.addptr %0, %xindex_7 : tensor<2x1x!tt.ptr>, tensor<2x1xi32> loc(#loc26) + tt.store %1, %tmp4_21 : tensor<2x1x!tt.ptr> loc(#loc27) + tt.return loc(#loc28) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:27) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4) +#loc33 = loc("tmp0"(#loc1)) +#loc34 = loc("xoffset"(#loc3)) +#loc35 = loc("xoffset"(#loc4)) +#loc36 = loc("xindex"(#loc5)) +#loc37 = loc("xindex"(#loc6)) +#loc38 = loc("xindex"(#loc7)) +#loc39 = loc("r0_base"(#loc8)) +#loc40 = loc("r0_base"(#loc9)) +#loc41 = loc("x0"(#loc10)) +#loc42 = loc("x1"(#loc11)) +#loc43 = loc("r0_mask"(#loc12)) +#loc44 = loc("tmp0"(#loc13)) +#loc45 = loc("tmp0"(#loc14)) +#loc46 = loc("tmp0"(#loc15)) +#loc47 = loc("tmp0"(#loc16)) +#loc48 = loc("tmp0"(#loc17)) +#loc49 = loc("tmp0"(#loc18)) +#loc50 = loc("tmp2"(#loc19)) +#loc51 = loc("tmp5"(#loc20)) +#loc52 = loc("_tmp4"(#loc21)) +#loc54 = loc("tmp4"(#loc25)) +#loc55 = loc(callsite(#loc22 at #loc53)) +#loc57 = loc(callsite(#loc24 at #loc55)) diff --git a/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/__grp__triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/__grp__triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ced2e448a5053bde918d3e1dc9318c6d4465f51d --- /dev/null +++ b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/__grp__triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.source", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttir", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttgir", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.llir", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ptx", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.cubin", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json"}} \ No newline at end of file diff --git a/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.cubin b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..a88c26affae58e1efc7c2aeb3f00ac3c39ea3335 Binary files /dev/null and b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.cubin differ diff --git a/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b6c3904719b57b210bf4fca250b0fbe6f4b17003 --- /dev/null +++ b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json @@ -0,0 +1 @@ +{"hash": "4e0d42a901fc486d8a262b969a5ef7bb72a126c858c100717d2f698f70b5be1e", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1"} \ No newline at end of file diff --git a/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.llir b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..4e4ada2b9fffdf4df1a5d349c9dde847a043dce3 --- /dev/null +++ b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.llir @@ -0,0 +1,78 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = shl i32 %7, 9, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = shl nuw nsw i32 %9, 1, !dbg !9 + %11 = and i32 %10, 510, !dbg !9 + %12 = or disjoint i32 %11, %8, !dbg !10 + %13 = or i32 %10, %8, !dbg !10 + %14 = or disjoint i32 %13, 1, !dbg !10 + %15 = sdiv i32 %12, 128, !dbg !11 + %16 = mul i32 %15, 128, !dbg !12 + %.decomposed = sub i32 %12, %16, !dbg !12 + %17 = srem i32 %14, 128, !dbg !12 + %18 = srem i32 %15, 2304, !dbg !13 + %19 = sdiv i32 %12, 294912, !dbg !14 + %20 = shl nsw i32 %19, 7, !dbg !15 + %21 = add nsw i32 %20, %.decomposed, !dbg !16 + %22 = add nsw i32 %20, %17, !dbg !16 + %23 = sext i32 %18 to i64, !dbg !17 + %24 = mul i64 %2, %23, !dbg !17 + %25 = sext i32 %21 to i64, !dbg !18 + %26 = sext i32 %22 to i64, !dbg !18 + %27 = getelementptr bfloat, ptr addrspace(1) %0, i64 %24, !dbg !19 + %28 = getelementptr bfloat, ptr addrspace(1) %27, i64 %25, !dbg !19 + %29 = getelementptr bfloat, ptr addrspace(1) %27, i64 %26, !dbg !19 + %30 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09ld.global.b16 { $0 }, [ $1 + 0 ];", "=c,l"(ptr addrspace(1) %28) #2, !dbg !20 + %31 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09ld.global.b16 { $0 }, [ $1 + 0 ];", "=c,l"(ptr addrspace(1) %29) #2, !dbg !20 + %32 = sext i32 %12 to i64, !dbg !21 + %33 = getelementptr bfloat, ptr addrspace(1) %1, i64 %32, !dbg !21 + %34 = insertelement <2 x i16> poison, i16 %30, i64 0, !dbg !22 + %35 = insertelement <2 x i16> %34, i16 %31, i64 1, !dbg !22 + %36 = bitcast <2 x i16> %35 to i32, !dbg !22 + tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %36, ptr addrspace(1) %33) #2, !dbg !22 + ret void, !dbg !23 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1", linkageName: "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 21, scope: !4) +!12 = !DILocation(line: 23, column: 19, scope: !4) +!13 = !DILocation(line: 24, column: 28, scope: !4) +!14 = !DILocation(line: 25, column: 19, scope: !4) +!15 = !DILocation(line: 27, column: 39, scope: !4) +!16 = !DILocation(line: 27, column: 35, scope: !4) +!17 = !DILocation(line: 27, column: 48, scope: !4) +!18 = !DILocation(line: 27, column: 44, scope: !4) +!19 = !DILocation(line: 27, column: 30, scope: !4) +!20 = !DILocation(line: 27, column: 53, scope: !4) +!21 = !DILocation(line: 28, column: 25, scope: !4) +!22 = !DILocation(line: 28, column: 36, scope: !4) +!23 = !DILocation(line: 28, column: 4, scope: !4) diff --git a/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ptx b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..91999aebea37c8e6cc2bbfa06a376383ef0032bd --- /dev/null +++ b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ptx @@ -0,0 +1,347 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1 // -- Begin function triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1 + // @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1 +.visible .entry triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1( + .param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_1, + .param .u64 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_2, + .param .u32 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_4, + .param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_5 +) +.reqntid 256 +{ + .reg .b16 %rs<3>; + .reg .b32 %r<32>; + .reg .b64 %rd<11>; + .loc 1 18 0 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:18:0 + +// %bb.0: + ld.param.b64 %rd4, [triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_0]; + ld.param.b64 %rd5, [triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_1]; +$L__tmp0: + .loc 1 20 28 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:20:28 + mov.u32 %r2, %ctaid.x; + .loc 1 20 33 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:20:33 + shl.b32 %r3, %r2, 9; + ld.param.b64 %rd6, [triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_2]; + .loc 1 21 36 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:21:36 + mov.u32 %r4, %tid.x; + shl.b32 %r5, %r4, 1; + and.b32 %r6, %r5, 510; + .loc 1 21 23 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:21:23 + or.b32 %r7, %r6, %r3; + or.b32 %r8, %r5, %r3; + or.b32 %r9, %r8, 1; + .loc 1 24 21 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:24:21 + bfe.s32 %r10, %r2, 22, 1; + .loc 1 23 19 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:23:19 + shr.u32 %r11, %r10, 25; + .loc 1 24 21 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:24:21 + add.s32 %r12, %r7, %r11; + shr.s32 %r13, %r12, 7; + .loc 1 23 19 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:23:19 + and.b32 %r14, %r12, -128; + sub.s32 %r15, %r7, %r14; + add.s32 %r16, %r9, %r11; + and.b32 %r17, %r16, -128; + sub.s32 %r18, %r9, %r17; + .loc 1 24 28 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:24:28 + mul.hi.s32 %r19, %r13, 954437177; + shr.u32 %r20, %r19, 31; + shr.s32 %r21, %r19, 9; + add.s32 %r22, %r21, %r20; + mul.lo.s32 %r23, %r22, 2304; + sub.s32 %r24, %r13, %r23; + .loc 1 25 19 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:25:19 + mul.hi.s32 %r25, %r7, 954437177; + shr.u32 %r26, %r25, 31; + shr.s32 %r27, %r25, 16; + add.s32 %r28, %r27, %r26; + .loc 1 27 39 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:27:39 + shl.b32 %r29, %r28, 7; + .loc 1 27 35 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:27:35 + add.s32 %r30, %r29, %r15; + add.s32 %r31, %r29, %r18; + .loc 1 27 48 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:27:48 + cvt.s64.s32 %rd7, %r24; + mul.lo.s64 %rd8, %rd6, %rd7; + .loc 1 27 30 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:27:30 + shl.b64 %rd9, %rd8, 1; + add.s64 %rd10, %rd4, %rd9; + mad.wide.s32 %rd1, %r30, 2, %rd10; + mad.wide.s32 %rd2, %r31, 2, %rd10; + .loc 1 27 53 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:27:53 + // begin inline asm + mov.u16 %rs1, 0x0; + ld.global.b16 { %rs1 }, [ %rd1 + 0 ]; + // end inline asm + // begin inline asm + mov.u16 %rs2, 0x0; + ld.global.b16 { %rs2 }, [ %rd2 + 0 ]; + // end inline asm + .loc 1 28 25 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:28:25 + mad.wide.s32 %rd3, %r7, 2, %rd5; + .loc 1 28 36 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:28:36 + mov.b32 %r1, {%rs1, %rs2}; + // begin inline asm + st.global.b32 [ %rd3 + 0 ], { %r1 }; + // end inline asm + .loc 1 28 4 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:28:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 51 +.b8 118 +.b8 106 +.b8 105 +.b8 108 +.b8 118 +.b8 99 +.b8 121 +.b8 55 +.b8 115 +.b8 100 +.b8 113 +.b8 99 +.b8 97 +.b8 120 +.b8 102 +.b8 115 +.b8 112 +.b8 102 +.b8 102 +.b8 97 +.b8 100 +.b8 98 +.b8 115 +.b8 114 +.b8 121 +.b8 51 +.b8 115 +.b8 113 +.b8 109 +.b8 52 +.b8 106 +.b8 55 +.b8 113 +.b8 112 +.b8 54 +.b8 117 +.b8 51 +.b8 116 +.b8 117 +.b8 115 +.b8 114 +.b8 54 +.b8 112 +.b8 51 +.b8 52 +.b8 115 +.b8 98 +.b8 105 +.b8 97 +.b8 114 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 51 +.b8 118 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.source b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.source new file mode 100644 index 0000000000000000000000000000000000000000..ee6869b52e7acd12652977b4b7c9ec9bbff8e6ee --- /dev/null +++ b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.source @@ -0,0 +1,91 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":18:0) +#loc21 = loc("in_ptr0"(#loc)) +#loc22 = loc("out_ptr0"(#loc)) +#loc23 = loc("ks0"(#loc)) +#loc24 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 9437184 : i32 loc(#loc25) + %xoffset = tt.get_program_id x : i32 loc(#loc26) + %xoffset_1 = arith.constant 512 : i32 loc(#loc27) + %xoffset_2 = arith.constant 512 : i32 loc(#loc27) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc27) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc28) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc29) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc29) + %xmask = arith.constant true loc(#loc30) + %xmask_6 = arith.constant dense : tensor<512xi1> loc(#loc30) + %x0 = arith.constant 128 : i32 loc(#loc31) + %x0_7 = arith.constant 128 : i32 loc(#loc31) + %x0_8 = arith.constant dense<128> : tensor<512xi32> loc(#loc31) + %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<512xi32> loc(#loc31) + %x1 = arith.constant 128 : i32 loc(#loc32) + %x1_10 = arith.constant 128 : i32 loc(#loc32) + %x1_11 = arith.constant dense<128> : tensor<512xi32> loc(#loc32) + %x1_12 = arith.divsi %xindex_5, %x1_11 : tensor<512xi32> loc(#loc32) + %x1_13 = arith.constant 2304 : i32 loc(#loc33) + %x1_14 = arith.constant 2304 : i32 loc(#loc33) + %x1_15 = arith.constant dense<2304> : tensor<512xi32> loc(#loc33) + %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<512xi32> loc(#loc33) + %x2 = arith.constant 294912 : i32 loc(#loc34) + %x2_17 = arith.constant 294912 : i32 loc(#loc34) + %x2_18 = arith.constant dense<294912> : tensor<512xi32> loc(#loc34) + %x2_19 = arith.divsi %xindex_5, %x2_18 : tensor<512xi32> loc(#loc34) + %tmp0 = arith.constant 128 : i32 loc(#loc35) + %tmp0_20 = arith.constant 128 : i32 loc(#loc35) + %tmp0_21 = arith.constant dense<128> : tensor<512xi32> loc(#loc35) + %tmp0_22 = arith.muli %tmp0_21, %x2_19 : tensor<512xi32> loc(#loc35) + %tmp0_23 = arith.addi %x0_9, %tmp0_22 : tensor<512xi32> loc(#loc36) + %tmp0_24 = arith.extsi %x1_16 : tensor<512xi32> to tensor<512xi64> loc(#loc37) + %tmp0_25 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc37) + %tmp0_26 = arith.muli %tmp0_25, %tmp0_24 : tensor<512xi64> loc(#loc37) + %tmp0_27 = arith.extsi %tmp0_23 : tensor<512xi32> to tensor<512xi64> loc(#loc38) + %tmp0_28 = arith.addi %tmp0_27, %tmp0_26 : tensor<512xi64> loc(#loc38) + %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc39) + %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc39) + %tmp0_31 = tt.load %tmp0_30 : tensor<512x!tt.ptr> loc(#loc40) + %tmp0_32 = arith.extf %tmp0_31 : tensor<512xbf16> to tensor<512xf32> loc(#loc41) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc18) + %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc18) + %2 = arith.truncf %tmp0_32 : tensor<512xf32> to tensor<512xbf16> loc(#loc19) + tt.store %1, %2 : tensor<512x!tt.ptr> loc(#loc19) + tt.return loc(#loc20) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":22:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":23:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":24:21) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":24:28) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":25:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:39) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:35) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:48) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:44) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:30) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:53) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:62) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:25) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:36) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:4) +#loc25 = loc("xnumel"(#loc1)) +#loc26 = loc("xoffset"(#loc2)) +#loc27 = loc("xoffset"(#loc3)) +#loc28 = loc("xindex"(#loc4)) +#loc29 = loc("xindex"(#loc5)) +#loc30 = loc("xmask"(#loc6)) +#loc31 = loc("x0"(#loc7)) +#loc32 = loc("x1"(#loc8)) +#loc33 = loc("x1"(#loc9)) +#loc34 = loc("x2"(#loc10)) +#loc35 = loc("tmp0"(#loc11)) +#loc36 = loc("tmp0"(#loc12)) +#loc37 = loc("tmp0"(#loc13)) +#loc38 = loc("tmp0"(#loc14)) +#loc39 = loc("tmp0"(#loc15)) +#loc40 = loc("tmp0"(#loc16)) +#loc41 = loc("tmp0"(#loc17)) diff --git a/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttgir b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..f1ea43d06080fe21d8e550b12521c39e21772378 --- /dev/null +++ b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttgir @@ -0,0 +1,69 @@ +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":18:0) +#loc19 = loc("in_ptr0"(#loc)) +#loc20 = loc("out_ptr0"(#loc)) +#loc21 = loc("ks0"(#loc)) +#loc22 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<512xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<2304> : tensor<512xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<294912> : tensor<512xi32, #blocked> loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc23) + %xoffset_2 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc24) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc25) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<512xi32, #blocked> loc(#loc26) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<512xi32, #blocked> loc(#loc26) + %x0 = arith.remsi %xindex_4, %cst : tensor<512xi32, #blocked> loc(#loc27) + %x1 = arith.divsi %xindex_4, %cst : tensor<512xi32, #blocked> loc(#loc28) + %x1_5 = arith.remsi %x1, %cst_0 : tensor<512xi32, #blocked> loc(#loc29) + %x2 = arith.divsi %xindex_4, %cst_1 : tensor<512xi32, #blocked> loc(#loc30) + %tmp0 = arith.muli %x2, %cst : tensor<512xi32, #blocked> loc(#loc31) + %tmp0_6 = arith.addi %x0, %tmp0 : tensor<512xi32, #blocked> loc(#loc32) + %tmp0_7 = arith.extsi %x1_5 : tensor<512xi32, #blocked> to tensor<512xi64, #blocked> loc(#loc33) + %tmp0_8 = tt.splat %ks0 : i64 -> tensor<512xi64, #blocked> loc(#loc33) + %tmp0_9 = arith.muli %tmp0_8, %tmp0_7 : tensor<512xi64, #blocked> loc(#loc33) + %tmp0_10 = arith.extsi %tmp0_6 : tensor<512xi32, #blocked> to tensor<512xi64, #blocked> loc(#loc34) + %tmp0_11 = arith.addi %tmp0_10, %tmp0_9 : tensor<512xi64, #blocked> loc(#loc34) + %tmp0_12 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc35) + %tmp0_13 = tt.addptr %tmp0_12, %tmp0_11 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc35) + %tmp0_14 = tt.load %tmp0_13 : tensor<512x!tt.ptr, #blocked> loc(#loc36) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc16) + %1 = tt.addptr %0, %xindex_4 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc16) + tt.store %1, %tmp0_14 : tensor<512x!tt.ptr, #blocked> loc(#loc17) + tt.return loc(#loc18) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":23:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":24:21) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":24:28) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":25:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:39) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:48) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:44) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:53) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:25) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:36) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:4) +#loc23 = loc("xoffset"(#loc2)) +#loc24 = loc("xoffset"(#loc3)) +#loc25 = loc("xindex"(#loc4)) +#loc26 = loc("xindex"(#loc5)) +#loc27 = loc("x0"(#loc6)) +#loc28 = loc("x1"(#loc7)) +#loc29 = loc("x1"(#loc8)) +#loc30 = loc("x2"(#loc9)) +#loc31 = loc("tmp0"(#loc10)) +#loc32 = loc("tmp0"(#loc11)) +#loc33 = loc("tmp0"(#loc12)) +#loc34 = loc("tmp0"(#loc13)) +#loc35 = loc("tmp0"(#loc14)) +#loc36 = loc("tmp0"(#loc15)) diff --git a/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttir b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..5d28d519f32fcd08b0182402e63035db0aec1a21 --- /dev/null +++ b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttir @@ -0,0 +1,68 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":18:0) +#loc19 = loc("in_ptr0"(#loc)) +#loc20 = loc("out_ptr0"(#loc)) +#loc21 = loc("ks0"(#loc)) +#loc22 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %x2 = arith.constant dense<294912> : tensor<512xi32> loc(#loc23) + %x1 = arith.constant dense<2304> : tensor<512xi32> loc(#loc24) + %cst = arith.constant dense<128> : tensor<512xi32> loc(#loc3) + %c512_i32 = arith.constant 512 : i32 loc(#loc3) + %xoffset = tt.get_program_id x : i32 loc(#loc25) + %xoffset_0 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc26) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc27) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<512xi32> loc(#loc28) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<512xi32> loc(#loc28) + %x0 = arith.remsi %xindex_2, %cst : tensor<512xi32> loc(#loc29) + %x1_3 = arith.divsi %xindex_2, %cst : tensor<512xi32> loc(#loc30) + %x1_4 = arith.remsi %x1_3, %x1 : tensor<512xi32> loc(#loc24) + %x2_5 = arith.divsi %xindex_2, %x2 : tensor<512xi32> loc(#loc23) + %tmp0 = arith.muli %x2_5, %cst : tensor<512xi32> loc(#loc31) + %tmp0_6 = arith.addi %x0, %tmp0 : tensor<512xi32> loc(#loc32) + %tmp0_7 = arith.extsi %x1_4 : tensor<512xi32> to tensor<512xi64> loc(#loc33) + %tmp0_8 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc33) + %tmp0_9 = arith.muli %tmp0_8, %tmp0_7 : tensor<512xi64> loc(#loc33) + %tmp0_10 = arith.extsi %tmp0_6 : tensor<512xi32> to tensor<512xi64> loc(#loc34) + %tmp0_11 = arith.addi %tmp0_10, %tmp0_9 : tensor<512xi64> loc(#loc34) + %tmp0_12 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc35) + %tmp0_13 = tt.addptr %tmp0_12, %tmp0_11 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc35) + %tmp0_14 = tt.load %tmp0_13 : tensor<512x!tt.ptr> loc(#loc36) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc16) + %1 = tt.addptr %0, %xindex_2 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc16) + tt.store %1, %tmp0_14 : tensor<512x!tt.ptr> loc(#loc17) + tt.return loc(#loc18) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":25:19) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":24:28) +#loc3 = loc(unknown) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":20:28) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":20:33) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":21:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":21:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":23:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":24:21) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:39) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:48) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:44) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:53) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:25) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:36) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:4) +#loc23 = loc("x2"(#loc1)) +#loc24 = loc("x1"(#loc2)) +#loc25 = loc("xoffset"(#loc4)) +#loc26 = loc("xoffset"(#loc5)) +#loc27 = loc("xindex"(#loc6)) +#loc28 = loc("xindex"(#loc7)) +#loc29 = loc("x0"(#loc8)) +#loc30 = loc("x1"(#loc9)) +#loc31 = loc("tmp0"(#loc10)) +#loc32 = loc("tmp0"(#loc11)) +#loc33 = loc("tmp0"(#loc12)) +#loc34 = loc("tmp0"(#loc13)) +#loc35 = loc("tmp0"(#loc14)) +#loc36 = loc("tmp0"(#loc15)) diff --git a/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7a504b8d523622799a4e500e32ee3409f7634151 --- /dev/null +++ b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused__fused_rms_norm_cat_view_2.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.source", "triton_poi_fused__fused_rms_norm_cat_view_2.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttir", "triton_poi_fused__fused_rms_norm_cat_view_2.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir", "triton_poi_fused__fused_rms_norm_cat_view_2.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.llir", "triton_poi_fused__fused_rms_norm_cat_view_2.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ptx", "triton_poi_fused__fused_rms_norm_cat_view_2.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.cubin", "triton_poi_fused__fused_rms_norm_cat_view_2.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.json"}} \ No newline at end of file diff --git a/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.cubin b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..4bbfb628bbda9fc4f510def2e7c473e8ff845e28 Binary files /dev/null and b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.cubin differ diff --git a/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.json b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4d400ea1b75f80f42a8d60372e019c868d172932 --- /dev/null +++ b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.json @@ -0,0 +1 @@ +{"hash": "57bf851d5f90e2c4d3f1b9eb74cf4a753207711de6b724a474200b48c8f12649", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 4096, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__fused_rms_norm_cat_view_2"} \ No newline at end of file diff --git a/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.llir b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..485e85db49eb1197102cae7b9b2285d2d718c6ba --- /dev/null +++ b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.llir @@ -0,0 +1,770 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused__fused_rms_norm_cat_view_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !dbg !8 + %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z(), !dbg !9 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y(), !dbg !10 + %15 = mul nuw i32 %13, %14, !dbg !11 + %16 = add nuw i32 %15, %12, !dbg !12 + %17 = shl i32 %16, 5, !dbg !13 + %18 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !14 + %19 = lshr i32 %18, 2, !dbg !14 + %20 = and i32 %19, 31, !dbg !14 + %21 = and i32 %18, 7, !dbg !14 + %22 = shl nuw nsw i32 %21, 2, !dbg !14 + %23 = or disjoint i32 %17, %20, !dbg !15 + %24 = or disjoint i32 %17, %22, !dbg !15 + %25 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !16 + %26 = shl i32 %25, 5, !dbg !17 + %27 = shl nuw nsw i32 %18, 3, !dbg !18 + %28 = and i32 %27, 24, !dbg !18 + %29 = lshr i32 %18, 3, !dbg !18 + %30 = and i32 %29, 15, !dbg !18 + %31 = or disjoint i32 %28, %26, !dbg !19 + %32 = or disjoint i32 %30, %26, !dbg !19 + %33 = icmp slt i32 %31, 128, !dbg !20 + %34 = icmp slt i32 %32, 128, !dbg !20 + %35 = sdiv i32 %23, 32, !dbg !21 + %36 = sdiv i32 %24, 32, !dbg !21 + %37 = mul i32 %35, 32, !dbg !22 + %.decomposed = sub i32 %23, %37, !dbg !22 + %38 = mul i32 %36, 32, !dbg !22 + %.decomposed53 = sub i32 %24, %38, !dbg !22 + %39 = icmp slt i32 %23, 8192, !dbg !23 + %40 = icmp slt i32 %24, 8192, !dbg !23 + %41 = shl nsw i32 %.decomposed, 7, !dbg !24 + %42 = add i32 %41, %31, !dbg !25 + %43 = mul i32 %35, 12288, !dbg !26 + %44 = add i32 %42, %43, !dbg !27 + %45 = sext i32 %44 to i64, !dbg !28 + %46 = getelementptr bfloat, ptr addrspace(1) %0, i64 %45, !dbg !28 + %47 = and i1 %33, %39, !dbg !29 + %48 = and i1 %34, %40, !dbg !29 + %49 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !30 + %50 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %46, i64 %49, i1 %47) #5, !dbg !30 + %51 = extractvalue { i32, i32, i32, i32 } %50, 0, !dbg !30 + %52 = extractvalue { i32, i32, i32, i32 } %50, 1, !dbg !30 + %53 = extractvalue { i32, i32, i32, i32 } %50, 2, !dbg !30 + %54 = extractvalue { i32, i32, i32, i32 } %50, 3, !dbg !30 + %55 = insertelement <2 x i32> poison, i32 %51, i64 0, !dbg !30 + %56 = insertelement <2 x i32> %55, i32 %53, i64 1, !dbg !30 + %57 = lshr <2 x i32> %56, splat (i32 16), !dbg !30 + %58 = trunc nuw <2 x i32> %57 to <2 x i16>, !dbg !30 + %59 = insertelement <2 x i32> poison, i32 %52, i64 0, !dbg !30 + %60 = insertelement <2 x i32> %59, i32 %54, i64 1, !dbg !30 + %61 = lshr <2 x i32> %60, splat (i32 16), !dbg !30 + %62 = trunc nuw <2 x i32> %61 to <2 x i16>, !dbg !30 + %63 = shl nuw nsw i32 %18, 5, !dbg !31 + %64 = and i32 %63, 480, !dbg !31 + %65 = and i32 %18, 12, !dbg !31 + %66 = shl nuw nsw i32 %65, 1, !dbg !31 + %67 = and i32 %18, 112, !dbg !31 + %68 = lshr exact i32 %67, 2, !dbg !31 + %69 = or disjoint i32 %64, %66, !dbg !31 + %70 = xor i32 %69, %68, !dbg !31 + %71 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %70, !dbg !31 + %72 = trunc i32 %51 to i16, !dbg !31 + %73 = trunc i32 %53 to i16, !dbg !31 + %74 = insertelement <2 x i16> poison, i16 %72, i64 0, !dbg !31 + %75 = insertelement <2 x i16> %74, i16 %73, i64 1, !dbg !31 + store <2 x i16> %75, ptr addrspace(3) %71, align 4, !dbg !31 + %76 = xor i32 %70, 544, !dbg !31 + %77 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %76, !dbg !31 + store <2 x i16> %58, ptr addrspace(3) %77, align 4, !dbg !31 + %78 = xor i32 %70, 1088, !dbg !31 + %79 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %78, !dbg !31 + %80 = trunc i32 %52 to i16, !dbg !31 + %81 = trunc i32 %54 to i16, !dbg !31 + %82 = insertelement <2 x i16> poison, i16 %80, i64 0, !dbg !31 + %83 = insertelement <2 x i16> %82, i16 %81, i64 1, !dbg !31 + store <2 x i16> %83, ptr addrspace(3) %79, align 4, !dbg !31 + %84 = xor i32 %70, 1632, !dbg !31 + %85 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %84, !dbg !31 + store <2 x i16> %62, ptr addrspace(3) %85, align 4, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !31 + %86 = shl nuw nsw i32 %18, 6, !dbg !31 + %87 = and i32 %86, 1536, !dbg !31 + %88 = shl nuw nsw i32 %18, 2, !dbg !31 + %89 = and i32 %88, 124, !dbg !31 + %90 = and i32 %18, 32, !dbg !31 + %91 = lshr exact i32 %90, 4, !dbg !31 + %92 = and i32 %18, 64, !dbg !31 + %93 = lshr exact i32 %92, 1, !dbg !31 + %94 = or disjoint i32 %87, %89, !dbg !31 + %95 = xor i32 %94, %93, !dbg !31 + %96 = or disjoint i32 %95, %91, !dbg !31 + %97 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %96, !dbg !31 + %98 = load bfloat, ptr addrspace(3) %97, align 2, !dbg !31 + %99 = xor i32 %96, 136, !dbg !31 + %100 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %99, !dbg !31 + %101 = load bfloat, ptr addrspace(3) %100, align 2, !dbg !31 + %102 = xor i32 %96, 272, !dbg !31 + %103 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %102, !dbg !31 + %104 = load bfloat, ptr addrspace(3) %103, align 2, !dbg !31 + %105 = xor i32 %96, 408, !dbg !31 + %106 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %105, !dbg !31 + %107 = load bfloat, ptr addrspace(3) %106, align 2, !dbg !31 + %108 = xor i32 %96, 64, !dbg !31 + %109 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %108, !dbg !31 + %110 = load bfloat, ptr addrspace(3) %109, align 2, !dbg !31 + %111 = xor i32 %96, 200, !dbg !31 + %112 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %111, !dbg !31 + %113 = load bfloat, ptr addrspace(3) %112, align 2, !dbg !31 + %114 = xor i32 %96, 336, !dbg !31 + %115 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %114, !dbg !31 + %116 = load bfloat, ptr addrspace(3) %115, align 2, !dbg !31 + %117 = xor i32 %96, 472, !dbg !31 + %118 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %117, !dbg !31 + %119 = load bfloat, ptr addrspace(3) %118, align 2, !dbg !31 + %120 = fpext bfloat %98 to float, !dbg !31 + %121 = fpext bfloat %101 to float, !dbg !31 + %122 = fpext bfloat %104 to float, !dbg !31 + %123 = fpext bfloat %107 to float, !dbg !31 + %124 = fpext bfloat %110 to float, !dbg !31 + %125 = fpext bfloat %113 to float, !dbg !31 + %126 = fpext bfloat %116 to float, !dbg !31 + %127 = fpext bfloat %119 to float, !dbg !31 + %128 = sext i32 %24 to i64, !dbg !32 + %129 = getelementptr float, ptr addrspace(1) %1, i64 %128, !dbg !32 + %130 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !33 + %131 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %129, i64 %130, i1 %48) #5, !dbg !33 + %132 = extractvalue { i32, i32, i32, i32 } %131, 0, !dbg !33 + %133 = extractvalue { i32, i32, i32, i32 } %131, 1, !dbg !33 + %134 = extractvalue { i32, i32, i32, i32 } %131, 2, !dbg !33 + %135 = extractvalue { i32, i32, i32, i32 } %131, 3, !dbg !33 + %136 = bitcast i32 %132 to float, !dbg !33 + %137 = bitcast i32 %133 to float, !dbg !33 + %138 = bitcast i32 %134 to float, !dbg !33 + %139 = bitcast i32 %135 to float, !dbg !33 + %140 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !33 + %141 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %129, i64 %140, i1 %48) #5, !dbg !33 + %142 = extractvalue { i32, i32, i32, i32 } %141, 0, !dbg !33 + %143 = extractvalue { i32, i32, i32, i32 } %141, 1, !dbg !33 + %144 = extractvalue { i32, i32, i32, i32 } %141, 2, !dbg !33 + %145 = extractvalue { i32, i32, i32, i32 } %141, 3, !dbg !33 + %146 = bitcast i32 %142 to float, !dbg !33 + %147 = bitcast i32 %143 to float, !dbg !33 + %148 = bitcast i32 %144 to float, !dbg !33 + %149 = bitcast i32 %145 to float, !dbg !33 + %150 = tail call float @llvm.nvvm.div.full(float %136, float 1.280000e+02), !dbg !34 + %151 = tail call float @llvm.nvvm.div.full(float %137, float 1.280000e+02), !dbg !34 + %152 = tail call float @llvm.nvvm.div.full(float %138, float 1.280000e+02), !dbg !34 + %153 = tail call float @llvm.nvvm.div.full(float %139, float 1.280000e+02), !dbg !34 + %154 = tail call float @llvm.nvvm.div.full(float %146, float 1.280000e+02), !dbg !34 + %155 = tail call float @llvm.nvvm.div.full(float %147, float 1.280000e+02), !dbg !34 + %156 = tail call float @llvm.nvvm.div.full(float %148, float 1.280000e+02), !dbg !34 + %157 = tail call float @llvm.nvvm.div.full(float %149, float 1.280000e+02), !dbg !34 + %158 = fadd float %150, 0x3EB0C6F7A0000000, !dbg !35 + %159 = fadd float %151, 0x3EB0C6F7A0000000, !dbg !35 + %160 = fadd float %152, 0x3EB0C6F7A0000000, !dbg !35 + %161 = fadd float %153, 0x3EB0C6F7A0000000, !dbg !35 + %162 = fadd float %154, 0x3EB0C6F7A0000000, !dbg !35 + %163 = fadd float %155, 0x3EB0C6F7A0000000, !dbg !35 + %164 = fadd float %156, 0x3EB0C6F7A0000000, !dbg !35 + %165 = fadd float %157, 0x3EB0C6F7A0000000, !dbg !35 + %166 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36 + %.not.i = icmp eq i32 %166, 0, !dbg !36 + br i1 %.not.i, label %169, label %167, !dbg !36 + +167: ; preds = %11 + %168 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %158), !dbg !36 + br label %__nv_rsqrtf.exit, !dbg !36 + +169: ; preds = %11 + %170 = tail call float @llvm.nvvm.rsqrt.approx.f(float %158), !dbg !36 + br label %__nv_rsqrtf.exit, !dbg !36 + +__nv_rsqrtf.exit: ; preds = %167, %169 + %.0.i = phi float [ %168, %167 ], [ %170, %169 ], !dbg !36 + %171 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36 + %.not.i8 = icmp eq i32 %171, 0, !dbg !36 + br i1 %.not.i8, label %174, label %172, !dbg !36 + +172: ; preds = %__nv_rsqrtf.exit + %173 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %159), !dbg !36 + br label %__nv_rsqrtf.exit10, !dbg !36 + +174: ; preds = %__nv_rsqrtf.exit + %175 = tail call float @llvm.nvvm.rsqrt.approx.f(float %159), !dbg !36 + br label %__nv_rsqrtf.exit10, !dbg !36 + +__nv_rsqrtf.exit10: ; preds = %172, %174 + %.0.i9 = phi float [ %173, %172 ], [ %175, %174 ], !dbg !36 + %176 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36 + %.not.i11 = icmp eq i32 %176, 0, !dbg !36 + br i1 %.not.i11, label %179, label %177, !dbg !36 + +177: ; preds = %__nv_rsqrtf.exit10 + %178 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %160), !dbg !36 + br label %__nv_rsqrtf.exit13, !dbg !36 + +179: ; preds = %__nv_rsqrtf.exit10 + %180 = tail call float @llvm.nvvm.rsqrt.approx.f(float %160), !dbg !36 + br label %__nv_rsqrtf.exit13, !dbg !36 + +__nv_rsqrtf.exit13: ; preds = %177, %179 + %.0.i12 = phi float [ %178, %177 ], [ %180, %179 ], !dbg !36 + %181 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36 + %.not.i14 = icmp eq i32 %181, 0, !dbg !36 + br i1 %.not.i14, label %184, label %182, !dbg !36 + +182: ; preds = %__nv_rsqrtf.exit13 + %183 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %161), !dbg !36 + br label %__nv_rsqrtf.exit16, !dbg !36 + +184: ; preds = %__nv_rsqrtf.exit13 + %185 = tail call float @llvm.nvvm.rsqrt.approx.f(float %161), !dbg !36 + br label %__nv_rsqrtf.exit16, !dbg !36 + +__nv_rsqrtf.exit16: ; preds = %182, %184 + %.0.i15 = phi float [ %183, %182 ], [ %185, %184 ], !dbg !36 + %186 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36 + %.not.i17 = icmp eq i32 %186, 0, !dbg !36 + br i1 %.not.i17, label %189, label %187, !dbg !36 + +187: ; preds = %__nv_rsqrtf.exit16 + %188 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %162), !dbg !36 + br label %__nv_rsqrtf.exit19, !dbg !36 + +189: ; preds = %__nv_rsqrtf.exit16 + %190 = tail call float @llvm.nvvm.rsqrt.approx.f(float %162), !dbg !36 + br label %__nv_rsqrtf.exit19, !dbg !36 + +__nv_rsqrtf.exit19: ; preds = %187, %189 + %.0.i18 = phi float [ %188, %187 ], [ %190, %189 ], !dbg !36 + %191 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36 + %.not.i20 = icmp eq i32 %191, 0, !dbg !36 + br i1 %.not.i20, label %194, label %192, !dbg !36 + +192: ; preds = %__nv_rsqrtf.exit19 + %193 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %163), !dbg !36 + br label %__nv_rsqrtf.exit22, !dbg !36 + +194: ; preds = %__nv_rsqrtf.exit19 + %195 = tail call float @llvm.nvvm.rsqrt.approx.f(float %163), !dbg !36 + br label %__nv_rsqrtf.exit22, !dbg !36 + +__nv_rsqrtf.exit22: ; preds = %192, %194 + %.0.i21 = phi float [ %193, %192 ], [ %195, %194 ], !dbg !36 + %196 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36 + %.not.i23 = icmp eq i32 %196, 0, !dbg !36 + br i1 %.not.i23, label %199, label %197, !dbg !36 + +197: ; preds = %__nv_rsqrtf.exit22 + %198 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %164), !dbg !36 + br label %__nv_rsqrtf.exit25, !dbg !36 + +199: ; preds = %__nv_rsqrtf.exit22 + %200 = tail call float @llvm.nvvm.rsqrt.approx.f(float %164), !dbg !36 + br label %__nv_rsqrtf.exit25, !dbg !36 + +__nv_rsqrtf.exit25: ; preds = %197, %199 + %.0.i24 = phi float [ %198, %197 ], [ %200, %199 ], !dbg !36 + %201 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36 + %.not.i26 = icmp eq i32 %201, 0, !dbg !36 + br i1 %.not.i26, label %204, label %202, !dbg !36 + +202: ; preds = %__nv_rsqrtf.exit25 + %203 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %165), !dbg !36 + br label %__nv_rsqrtf.exit28, !dbg !36 + +204: ; preds = %__nv_rsqrtf.exit25 + %205 = tail call float @llvm.nvvm.rsqrt.approx.f(float %165), !dbg !36 + br label %__nv_rsqrtf.exit28, !dbg !36 + +__nv_rsqrtf.exit28: ; preds = %202, %204 + %.0.i27 = phi float [ %203, %202 ], [ %205, %204 ], !dbg !36 + %206 = fmul float %.0.i, %120, !dbg !37 + %207 = fmul float %.0.i9, %121, !dbg !37 + %208 = fmul float %.0.i12, %122, !dbg !37 + %209 = fmul float %.0.i15, %123, !dbg !37 + %210 = fmul float %.0.i18, %124, !dbg !37 + %211 = fmul float %.0.i21, %125, !dbg !37 + %212 = fmul float %.0.i24, %126, !dbg !37 + %213 = fmul float %.0.i27, %127, !dbg !37 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37 + %214 = shl nuw nsw i32 %21, 3, !dbg !37 + %215 = and i32 %18, 8, !dbg !37 + %216 = icmp eq i32 %215, 0, !dbg !37 + %217 = select i1 %216, i32 0, i32 1088, !dbg !37 + %218 = and i32 %18, 16, !dbg !37 + %219 = icmp eq i32 %218, 0, !dbg !37 + %220 = select i1 %219, i32 0, i32 2052, !dbg !37 + %221 = shl nuw nsw i32 %90, 2, !dbg !37 + %222 = or disjoint i32 %220, %221, !dbg !37 + %223 = or disjoint i32 %217, %214, !dbg !37 + %224 = xor i32 %223, %92, !dbg !37 + %225 = or disjoint i32 %224, %222, !dbg !37 + %226 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %225, !dbg !37 + store float %206, ptr addrspace(3) %226, align 4, !dbg !37 + %227 = xor i32 %225, 272, !dbg !37 + %228 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %227, !dbg !37 + store float %207, ptr addrspace(3) %228, align 4, !dbg !37 + %229 = xor i32 %225, 544, !dbg !37 + %230 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %229, !dbg !37 + store float %208, ptr addrspace(3) %230, align 4, !dbg !37 + %231 = xor i32 %225, 816, !dbg !37 + %232 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %231, !dbg !37 + store float %209, ptr addrspace(3) %232, align 4, !dbg !37 + %233 = xor i32 %225, 4, !dbg !37 + %234 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %233, !dbg !37 + store float %210, ptr addrspace(3) %234, align 4, !dbg !37 + %235 = xor i32 %225, 276, !dbg !37 + %236 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %235, !dbg !37 + store float %211, ptr addrspace(3) %236, align 4, !dbg !37 + %237 = xor i32 %225, 548, !dbg !37 + %238 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %237, !dbg !37 + store float %212, ptr addrspace(3) %238, align 4, !dbg !37 + %239 = xor i32 %225, 820, !dbg !37 + %240 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %239, !dbg !37 + store float %213, ptr addrspace(3) %240, align 4, !dbg !37 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37 + %241 = and i32 %86, 832, !dbg !37 + %242 = shl nuw nsw i32 %65, 2, !dbg !37 + %243 = lshr exact i32 %67, 1, !dbg !37 + %244 = shl nuw nsw i32 %18, 1, !dbg !37 + %245 = and i32 %244, 4, !dbg !37 + %246 = or disjoint i32 %241, %242, !dbg !37 + %247 = xor i32 %246, %243, !dbg !37 + %248 = or disjoint i32 %247, %245, !dbg !37 + %249 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %248, !dbg !37 + %250 = load float, ptr addrspace(3) %249, align 4, !dbg !37 + %251 = getelementptr inbounds nuw i8, ptr addrspace(3) %249, i32 128, !dbg !37 + %252 = load float, ptr addrspace(3) %251, align 4, !dbg !37 + %253 = xor i32 %248, 1088, !dbg !37 + %254 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %253, !dbg !37 + %255 = load float, ptr addrspace(3) %254, align 4, !dbg !37 + %256 = getelementptr inbounds nuw i8, ptr addrspace(3) %254, i32 128, !dbg !37 + %257 = load float, ptr addrspace(3) %256, align 4, !dbg !37 + %258 = xor i32 %248, 2052, !dbg !37 + %259 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %258, !dbg !37 + %260 = load float, ptr addrspace(3) %259, align 4, !dbg !37 + %261 = getelementptr inbounds nuw i8, ptr addrspace(3) %259, i32 128, !dbg !37 + %262 = load float, ptr addrspace(3) %261, align 4, !dbg !37 + %263 = xor i32 %248, 3140, !dbg !37 + %264 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %263, !dbg !37 + %265 = load float, ptr addrspace(3) %264, align 4, !dbg !37 + %266 = getelementptr inbounds nuw i8, ptr addrspace(3) %264, i32 128, !dbg !37 + %267 = load float, ptr addrspace(3) %266, align 4, !dbg !37 + %268 = sext i32 %31 to i64, !dbg !38 + %269 = getelementptr bfloat, ptr addrspace(1) %2, i64 %268, !dbg !38 + %270 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !39 + %271 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %269, i64 %270, i1 %47) #5, !dbg !39 + %272 = add i32 %44, -3145728, !dbg !40 + %273 = sext i32 %272 to i64, !dbg !41 + %274 = getelementptr bfloat, ptr addrspace(1) %3, i64 %273, !dbg !41 + %275 = add i32 %17, -8192, !dbg !42 + %276 = icmp ult i32 %275, 65536, !dbg !42 + %277 = and i1 %33, %276, !dbg !42 + %278 = and i1 %34, %276, !dbg !42 + %279 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !43 + %280 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %274, i64 %279, i1 %277) #5, !dbg !43 + %281 = extractvalue { i32, i32, i32, i32 } %280, 0, !dbg !43 + %282 = extractvalue { i32, i32, i32, i32 } %280, 1, !dbg !43 + %283 = extractvalue { i32, i32, i32, i32 } %280, 2, !dbg !43 + %284 = extractvalue { i32, i32, i32, i32 } %280, 3, !dbg !43 + %285 = insertelement <2 x i32> poison, i32 %281, i64 0, !dbg !43 + %286 = insertelement <2 x i32> %285, i32 %283, i64 1, !dbg !43 + %287 = lshr <2 x i32> %286, splat (i32 16), !dbg !43 + %288 = trunc nuw <2 x i32> %287 to <2 x i16>, !dbg !43 + %289 = insertelement <2 x i32> poison, i32 %282, i64 0, !dbg !43 + %290 = insertelement <2 x i32> %289, i32 %284, i64 1, !dbg !43 + %291 = lshr <2 x i32> %290, splat (i32 16), !dbg !43 + %292 = trunc nuw <2 x i32> %291 to <2 x i16>, !dbg !43 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !44 + %293 = trunc i32 %281 to i16, !dbg !44 + %294 = trunc i32 %283 to i16, !dbg !44 + %295 = insertelement <2 x i16> poison, i16 %293, i64 0, !dbg !44 + %296 = insertelement <2 x i16> %295, i16 %294, i64 1, !dbg !44 + store <2 x i16> %296, ptr addrspace(3) %71, align 4, !dbg !44 + store <2 x i16> %288, ptr addrspace(3) %77, align 4, !dbg !44 + %297 = trunc i32 %282 to i16, !dbg !44 + %298 = trunc i32 %284 to i16, !dbg !44 + %299 = insertelement <2 x i16> poison, i16 %297, i64 0, !dbg !44 + %300 = insertelement <2 x i16> %299, i16 %298, i64 1, !dbg !44 + store <2 x i16> %300, ptr addrspace(3) %79, align 4, !dbg !44 + store <2 x i16> %292, ptr addrspace(3) %85, align 4, !dbg !44 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !44 + %301 = load bfloat, ptr addrspace(3) %97, align 2, !dbg !44 + %302 = load bfloat, ptr addrspace(3) %100, align 2, !dbg !44 + %303 = load bfloat, ptr addrspace(3) %103, align 2, !dbg !44 + %304 = load bfloat, ptr addrspace(3) %106, align 2, !dbg !44 + %305 = load bfloat, ptr addrspace(3) %109, align 2, !dbg !44 + %306 = load bfloat, ptr addrspace(3) %112, align 2, !dbg !44 + %307 = load bfloat, ptr addrspace(3) %115, align 2, !dbg !44 + %308 = load bfloat, ptr addrspace(3) %118, align 2, !dbg !44 + %309 = shl nsw i32 %36, 5, !dbg !45 + %310 = add nsw i32 %.decomposed53, -8192, !dbg !45 + %311 = add i32 %310, %309, !dbg !46 + %312 = sext i32 %311 to i64, !dbg !47 + %313 = getelementptr float, ptr addrspace(1) %4, i64 %312, !dbg !47 + %314 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !48 + %315 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %313, i64 %314, i1 %278) #5, !dbg !48 + %316 = extractvalue { i32, i32, i32, i32 } %315, 0, !dbg !48 + %317 = extractvalue { i32, i32, i32, i32 } %315, 1, !dbg !48 + %318 = extractvalue { i32, i32, i32, i32 } %315, 2, !dbg !48 + %319 = extractvalue { i32, i32, i32, i32 } %315, 3, !dbg !48 + %320 = bitcast i32 %316 to float, !dbg !48 + %321 = bitcast i32 %317 to float, !dbg !48 + %322 = bitcast i32 %318 to float, !dbg !48 + %323 = bitcast i32 %319 to float, !dbg !48 + %324 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !48 + %325 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %313, i64 %324, i1 %278) #5, !dbg !48 + %326 = extractvalue { i32, i32, i32, i32 } %325, 0, !dbg !48 + %327 = extractvalue { i32, i32, i32, i32 } %325, 1, !dbg !48 + %328 = extractvalue { i32, i32, i32, i32 } %325, 2, !dbg !48 + %329 = extractvalue { i32, i32, i32, i32 } %325, 3, !dbg !48 + %330 = bitcast i32 %326 to float, !dbg !48 + %331 = bitcast i32 %327 to float, !dbg !48 + %332 = bitcast i32 %328 to float, !dbg !48 + %333 = bitcast i32 %329 to float, !dbg !48 + %334 = tail call float @llvm.nvvm.div.full(float %320, float 1.280000e+02), !dbg !49 + %335 = tail call float @llvm.nvvm.div.full(float %321, float 1.280000e+02), !dbg !49 + %336 = tail call float @llvm.nvvm.div.full(float %322, float 1.280000e+02), !dbg !49 + %337 = tail call float @llvm.nvvm.div.full(float %323, float 1.280000e+02), !dbg !49 + %338 = tail call float @llvm.nvvm.div.full(float %330, float 1.280000e+02), !dbg !49 + %339 = tail call float @llvm.nvvm.div.full(float %331, float 1.280000e+02), !dbg !49 + %340 = tail call float @llvm.nvvm.div.full(float %332, float 1.280000e+02), !dbg !49 + %341 = tail call float @llvm.nvvm.div.full(float %333, float 1.280000e+02), !dbg !49 + %342 = fadd float %334, 0x3EB0C6F7A0000000, !dbg !50 + %343 = fadd float %335, 0x3EB0C6F7A0000000, !dbg !50 + %344 = fadd float %336, 0x3EB0C6F7A0000000, !dbg !50 + %345 = fadd float %337, 0x3EB0C6F7A0000000, !dbg !50 + %346 = fadd float %338, 0x3EB0C6F7A0000000, !dbg !50 + %347 = fadd float %339, 0x3EB0C6F7A0000000, !dbg !50 + %348 = fadd float %340, 0x3EB0C6F7A0000000, !dbg !50 + %349 = fadd float %341, 0x3EB0C6F7A0000000, !dbg !50 + %350 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51 + %.not.i29 = icmp eq i32 %350, 0, !dbg !51 + br i1 %.not.i29, label %353, label %351, !dbg !51 + +351: ; preds = %__nv_rsqrtf.exit28 + %352 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %342), !dbg !51 + br label %__nv_rsqrtf.exit31, !dbg !51 + +353: ; preds = %__nv_rsqrtf.exit28 + %354 = tail call float @llvm.nvvm.rsqrt.approx.f(float %342), !dbg !51 + br label %__nv_rsqrtf.exit31, !dbg !51 + +__nv_rsqrtf.exit31: ; preds = %351, %353 + %.0.i30 = phi float [ %352, %351 ], [ %354, %353 ], !dbg !51 + %355 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51 + %.not.i32 = icmp eq i32 %355, 0, !dbg !51 + br i1 %.not.i32, label %358, label %356, !dbg !51 + +356: ; preds = %__nv_rsqrtf.exit31 + %357 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %343), !dbg !51 + br label %__nv_rsqrtf.exit34, !dbg !51 + +358: ; preds = %__nv_rsqrtf.exit31 + %359 = tail call float @llvm.nvvm.rsqrt.approx.f(float %343), !dbg !51 + br label %__nv_rsqrtf.exit34, !dbg !51 + +__nv_rsqrtf.exit34: ; preds = %356, %358 + %.0.i33 = phi float [ %357, %356 ], [ %359, %358 ], !dbg !51 + %360 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51 + %.not.i35 = icmp eq i32 %360, 0, !dbg !51 + br i1 %.not.i35, label %363, label %361, !dbg !51 + +361: ; preds = %__nv_rsqrtf.exit34 + %362 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %344), !dbg !51 + br label %__nv_rsqrtf.exit37, !dbg !51 + +363: ; preds = %__nv_rsqrtf.exit34 + %364 = tail call float @llvm.nvvm.rsqrt.approx.f(float %344), !dbg !51 + br label %__nv_rsqrtf.exit37, !dbg !51 + +__nv_rsqrtf.exit37: ; preds = %361, %363 + %.0.i36 = phi float [ %362, %361 ], [ %364, %363 ], !dbg !51 + %365 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51 + %.not.i38 = icmp eq i32 %365, 0, !dbg !51 + br i1 %.not.i38, label %368, label %366, !dbg !51 + +366: ; preds = %__nv_rsqrtf.exit37 + %367 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %345), !dbg !51 + br label %__nv_rsqrtf.exit40, !dbg !51 + +368: ; preds = %__nv_rsqrtf.exit37 + %369 = tail call float @llvm.nvvm.rsqrt.approx.f(float %345), !dbg !51 + br label %__nv_rsqrtf.exit40, !dbg !51 + +__nv_rsqrtf.exit40: ; preds = %366, %368 + %.0.i39 = phi float [ %367, %366 ], [ %369, %368 ], !dbg !51 + %370 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51 + %.not.i41 = icmp eq i32 %370, 0, !dbg !51 + br i1 %.not.i41, label %373, label %371, !dbg !51 + +371: ; preds = %__nv_rsqrtf.exit40 + %372 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %346), !dbg !51 + br label %__nv_rsqrtf.exit43, !dbg !51 + +373: ; preds = %__nv_rsqrtf.exit40 + %374 = tail call float @llvm.nvvm.rsqrt.approx.f(float %346), !dbg !51 + br label %__nv_rsqrtf.exit43, !dbg !51 + +__nv_rsqrtf.exit43: ; preds = %371, %373 + %.0.i42 = phi float [ %372, %371 ], [ %374, %373 ], !dbg !51 + %375 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51 + %.not.i44 = icmp eq i32 %375, 0, !dbg !51 + br i1 %.not.i44, label %378, label %376, !dbg !51 + +376: ; preds = %__nv_rsqrtf.exit43 + %377 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %347), !dbg !51 + br label %__nv_rsqrtf.exit46, !dbg !51 + +378: ; preds = %__nv_rsqrtf.exit43 + %379 = tail call float @llvm.nvvm.rsqrt.approx.f(float %347), !dbg !51 + br label %__nv_rsqrtf.exit46, !dbg !51 + +__nv_rsqrtf.exit46: ; preds = %376, %378 + %.0.i45 = phi float [ %377, %376 ], [ %379, %378 ], !dbg !51 + %380 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51 + %.not.i47 = icmp eq i32 %380, 0, !dbg !51 + br i1 %.not.i47, label %383, label %381, !dbg !51 + +381: ; preds = %__nv_rsqrtf.exit46 + %382 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %348), !dbg !51 + br label %__nv_rsqrtf.exit49, !dbg !51 + +383: ; preds = %__nv_rsqrtf.exit46 + %384 = tail call float @llvm.nvvm.rsqrt.approx.f(float %348), !dbg !51 + br label %__nv_rsqrtf.exit49, !dbg !51 + +__nv_rsqrtf.exit49: ; preds = %381, %383 + %.0.i48 = phi float [ %382, %381 ], [ %384, %383 ], !dbg !51 + %385 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51 + %.not.i50 = icmp eq i32 %385, 0, !dbg !51 + br i1 %.not.i50, label %388, label %386, !dbg !51 + +386: ; preds = %__nv_rsqrtf.exit49 + %387 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %349), !dbg !51 + br label %__nv_rsqrtf.exit52, !dbg !51 + +388: ; preds = %__nv_rsqrtf.exit49 + %389 = tail call float @llvm.nvvm.rsqrt.approx.f(float %349), !dbg !51 + br label %__nv_rsqrtf.exit52, !dbg !51 + +__nv_rsqrtf.exit52: ; preds = %386, %388 + %.0.i51 = phi float [ %387, %386 ], [ %389, %388 ], !dbg !51 + %390 = fpext bfloat %308 to float, !dbg !44 + %391 = fpext bfloat %307 to float, !dbg !44 + %392 = fpext bfloat %306 to float, !dbg !44 + %393 = fpext bfloat %305 to float, !dbg !44 + %394 = fpext bfloat %304 to float, !dbg !44 + %395 = fpext bfloat %303 to float, !dbg !44 + %396 = fpext bfloat %302 to float, !dbg !44 + %397 = fpext bfloat %301 to float, !dbg !44 + %398 = extractvalue { i32, i32, i32, i32 } %271, 3, !dbg !39 + %399 = bitcast i32 %398 to <2 x bfloat>, !dbg !39 + %400 = extractvalue { i32, i32, i32, i32 } %271, 2, !dbg !39 + %401 = bitcast i32 %400 to <2 x bfloat>, !dbg !39 + %402 = extractvalue { i32, i32, i32, i32 } %271, 1, !dbg !39 + %403 = bitcast i32 %402 to <2 x bfloat>, !dbg !39 + %404 = extractvalue { i32, i32, i32, i32 } %271, 0, !dbg !39 + %405 = bitcast i32 %404 to <2 x bfloat>, !dbg !39 + %406 = icmp slt i32 %23, 73728, !dbg !52 + %407 = fmul float %.0.i30, %397, !dbg !53 + %408 = fmul float %.0.i33, %396, !dbg !53 + %409 = fmul float %.0.i36, %395, !dbg !53 + %410 = fmul float %.0.i39, %394, !dbg !53 + %411 = fmul float %.0.i42, %393, !dbg !53 + %412 = fmul float %.0.i45, %392, !dbg !53 + %413 = fmul float %.0.i48, %391, !dbg !53 + %414 = fmul float %.0.i51, %390, !dbg !53 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53 + store float %407, ptr addrspace(3) %226, align 4, !dbg !53 + store float %408, ptr addrspace(3) %228, align 4, !dbg !53 + store float %409, ptr addrspace(3) %230, align 4, !dbg !53 + store float %410, ptr addrspace(3) %232, align 4, !dbg !53 + store float %411, ptr addrspace(3) %234, align 4, !dbg !53 + store float %412, ptr addrspace(3) %236, align 4, !dbg !53 + store float %413, ptr addrspace(3) %238, align 4, !dbg !53 + store float %414, ptr addrspace(3) %240, align 4, !dbg !53 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53 + %415 = load float, ptr addrspace(3) %249, align 4, !dbg !53 + %416 = load float, ptr addrspace(3) %251, align 4, !dbg !53 + %417 = load float, ptr addrspace(3) %254, align 4, !dbg !53 + %418 = load float, ptr addrspace(3) %256, align 4, !dbg !53 + %419 = load float, ptr addrspace(3) %259, align 4, !dbg !53 + %420 = load float, ptr addrspace(3) %261, align 4, !dbg !53 + %421 = load float, ptr addrspace(3) %264, align 4, !dbg !53 + %422 = load float, ptr addrspace(3) %266, align 4, !dbg !53 + %423 = getelementptr bfloat, ptr addrspace(1) %5, i64 %268, !dbg !54 + %424 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !55 + %425 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %423, i64 %424, i1 %277) #5, !dbg !55 + %426 = extractvalue { i32, i32, i32, i32 } %425, 0, !dbg !55 + %427 = bitcast i32 %426 to <2 x bfloat>, !dbg !55 + %428 = extractvalue { i32, i32, i32, i32 } %425, 1, !dbg !55 + %429 = bitcast i32 %428 to <2 x bfloat>, !dbg !55 + %430 = extractvalue { i32, i32, i32, i32 } %425, 2, !dbg !55 + %431 = bitcast i32 %430 to <2 x bfloat>, !dbg !55 + %432 = extractvalue { i32, i32, i32, i32 } %425, 3, !dbg !55 + %433 = bitcast i32 %432 to <2 x bfloat>, !dbg !55 + %434 = shl i32 %23, 7, !dbg !56 + %435 = add i32 %434, %31, !dbg !57 + %436 = sext i32 %435 to i64, !dbg !58 + %437 = getelementptr bfloat, ptr addrspace(1) %6, i64 %436, !dbg !58 + %438 = and i1 %33, %406, !dbg !59 + %439 = fpext <2 x bfloat> %405 to <2 x float>, !dbg !60 + %440 = insertelement <2 x float> poison, float %250, i64 0, !dbg !61 + %441 = insertelement <2 x float> %440, float %255, i64 1, !dbg !61 + %442 = fmul <2 x float> %441, %439, !dbg !61 + %443 = fpext <2 x bfloat> %427 to <2 x float>, !dbg !62 + %444 = insertelement <2 x float> poison, float %415, i64 0, !dbg !63 + %445 = insertelement <2 x float> %444, float %417, i64 1, !dbg !63 + %446 = fmul <2 x float> %445, %443, !dbg !63 + %447 = insertelement <2 x i1> poison, i1 %39, i64 0, !dbg !64 + %448 = shufflevector <2 x i1> %447, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !64 + %449 = select <2 x i1> %448, <2 x float> %442, <2 x float> %446, !dbg !64 + %450 = fptrunc <2 x float> %449 to <2 x bfloat>, !dbg !65 + %451 = fpext <2 x bfloat> %403 to <2 x float>, !dbg !60 + %452 = insertelement <2 x float> poison, float %260, i64 0, !dbg !61 + %453 = insertelement <2 x float> %452, float %265, i64 1, !dbg !61 + %454 = fmul <2 x float> %453, %451, !dbg !61 + %455 = fpext <2 x bfloat> %429 to <2 x float>, !dbg !62 + %456 = insertelement <2 x float> poison, float %419, i64 0, !dbg !63 + %457 = insertelement <2 x float> %456, float %421, i64 1, !dbg !63 + %458 = fmul <2 x float> %457, %455, !dbg !63 + %459 = select <2 x i1> %448, <2 x float> %454, <2 x float> %458, !dbg !64 + %460 = fptrunc <2 x float> %459 to <2 x bfloat>, !dbg !65 + %461 = fpext <2 x bfloat> %401 to <2 x float>, !dbg !60 + %462 = insertelement <2 x float> poison, float %252, i64 0, !dbg !61 + %463 = insertelement <2 x float> %462, float %257, i64 1, !dbg !61 + %464 = fmul <2 x float> %463, %461, !dbg !61 + %465 = fpext <2 x bfloat> %431 to <2 x float>, !dbg !62 + %466 = insertelement <2 x float> poison, float %416, i64 0, !dbg !63 + %467 = insertelement <2 x float> %466, float %418, i64 1, !dbg !63 + %468 = fmul <2 x float> %467, %465, !dbg !63 + %469 = select <2 x i1> %448, <2 x float> %464, <2 x float> %468, !dbg !64 + %470 = fptrunc <2 x float> %469 to <2 x bfloat>, !dbg !65 + %471 = fpext <2 x bfloat> %399 to <2 x float>, !dbg !60 + %472 = insertelement <2 x float> poison, float %262, i64 0, !dbg !61 + %473 = insertelement <2 x float> %472, float %267, i64 1, !dbg !61 + %474 = fmul <2 x float> %473, %471, !dbg !61 + %475 = fpext <2 x bfloat> %433 to <2 x float>, !dbg !62 + %476 = insertelement <2 x float> poison, float %420, i64 0, !dbg !63 + %477 = insertelement <2 x float> %476, float %422, i64 1, !dbg !63 + %478 = fmul <2 x float> %477, %475, !dbg !63 + %479 = select <2 x i1> %448, <2 x float> %474, <2 x float> %478, !dbg !64 + %480 = fptrunc <2 x float> %479 to <2 x bfloat>, !dbg !65 + %481 = bitcast <2 x bfloat> %450 to i32, !dbg !65 + %482 = bitcast <2 x bfloat> %460 to i32, !dbg !65 + %483 = bitcast <2 x bfloat> %470 to i32, !dbg !65 + %484 = bitcast <2 x bfloat> %480 to i32, !dbg !65 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %481, i32 %482, i32 %483, i32 %484, ptr addrspace(1) %437, i1 %438) #5, !dbg !65 + ret void, !dbg !66 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #3 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_poi_fused__fused_rms_norm_cat_view_2", linkageName: "triton_poi_fused__fused_rms_norm_cat_view_2", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 21, column: 29, scope: !5) +!9 = !DILocation(line: 21, column: 48, scope: !5) +!10 = !DILocation(line: 21, column: 69, scope: !5) +!11 = !DILocation(line: 21, column: 53, scope: !5) +!12 = !DILocation(line: 21, column: 34, scope: !5) +!13 = !DILocation(line: 21, column: 75, scope: !5) +!14 = !DILocation(line: 22, column: 44, scope: !5) +!15 = !DILocation(line: 22, column: 23, scope: !5) +!16 = !DILocation(line: 24, column: 28, scope: !5) +!17 = !DILocation(line: 24, column: 33, scope: !5) +!18 = !DILocation(line: 25, column: 44, scope: !5) +!19 = !DILocation(line: 25, column: 23, scope: !5) +!20 = !DILocation(line: 26, column: 21, scope: !5) +!21 = !DILocation(line: 27, column: 19, scope: !5) +!22 = !DILocation(line: 29, column: 19, scope: !5) +!23 = !DILocation(line: 35, column: 18, scope: !5) +!24 = !DILocation(line: 36, column: 39, scope: !5) +!25 = !DILocation(line: 36, column: 35, scope: !5) +!26 = !DILocation(line: 36, column: 51, scope: !5) +!27 = !DILocation(line: 36, column: 44, scope: !5) +!28 = !DILocation(line: 36, column: 30, scope: !5) +!29 = !DILocation(line: 36, column: 64, scope: !5) +!30 = !DILocation(line: 36, column: 57, scope: !5) +!31 = !DILocation(line: 36, column: 123, scope: !5) +!32 = !DILocation(line: 38, column: 30, scope: !5) +!33 = !DILocation(line: 38, column: 80, scope: !5) +!34 = !DILocation(line: 40, column: 19, scope: !5) +!35 = !DILocation(line: 42, column: 19, scope: !5) +!36 = !DILocation(line: 43, column: 28, scope: !5) +!37 = !DILocation(line: 44, column: 19, scope: !5) +!38 = !DILocation(line: 45, column: 31, scope: !5) +!39 = !DILocation(line: 45, column: 71, scope: !5) +!40 = !DILocation(line: 54, column: 45, scope: !5) +!41 = !DILocation(line: 54, column: 31, scope: !5) +!42 = !DILocation(line: 54, column: 83, scope: !5) +!43 = !DILocation(line: 54, column: 67, scope: !5) +!44 = !DILocation(line: 54, column: 134, scope: !5) +!45 = !DILocation(line: 56, column: 56, scope: !5) +!46 = !DILocation(line: 56, column: 52, scope: !5) +!47 = !DILocation(line: 56, column: 31, scope: !5) +!48 = !DILocation(line: 56, column: 90, scope: !5) +!49 = !DILocation(line: 58, column: 21, scope: !5) +!50 = !DILocation(line: 60, column: 20, scope: !5) +!51 = !DILocation(line: 61, column: 28, scope: !5) +!52 = !DILocation(line: 23, column: 21, scope: !5) +!53 = !DILocation(line: 62, column: 20, scope: !5) +!54 = !DILocation(line: 63, column: 31, scope: !5) +!55 = !DILocation(line: 63, column: 71, scope: !5) +!56 = !DILocation(line: 70, column: 34, scope: !5) +!57 = !DILocation(line: 70, column: 30, scope: !5) +!58 = !DILocation(line: 70, column: 25, scope: !5) +!59 = !DILocation(line: 70, column: 54, scope: !5) +!60 = !DILocation(line: 45, column: 137, scope: !5) +!61 = !DILocation(line: 47, column: 20, scope: !5) +!62 = !DILocation(line: 63, column: 138, scope: !5) +!63 = !DILocation(line: 65, column: 20, scope: !5) +!64 = !DILocation(line: 0, scope: !5) +!65 = !DILocation(line: 70, column: 46, scope: !5) +!66 = !DILocation(line: 70, column: 4, scope: !5) diff --git a/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ptx b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..10806715d04c6a3d4436851dc528408993296919 --- /dev/null +++ b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ptx @@ -0,0 +1,796 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused__fused_rms_norm_cat_view_2 // -- Begin function triton_poi_fused__fused_rms_norm_cat_view_2 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_poi_fused__fused_rms_norm_cat_view_2 +.visible .entry triton_poi_fused__fused_rms_norm_cat_view_2( + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_4, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_5, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_6, + .param .u32 triton_poi_fused__fused_rms_norm_cat_view_2_param_7, + .param .u32 triton_poi_fused__fused_rms_norm_cat_view_2_param_8, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_9, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_10 +) +.reqntid 128 +{ + .reg .pred %p<12>; + .reg .b16 %rs<33>; + .reg .b32 %r<297>; + .reg .b64 %rd<24>; + .loc 1 18 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:18:0 + +// %bb.0: // %__nv_rsqrtf.exit + ld.param.b64 %rd16, [triton_poi_fused__fused_rms_norm_cat_view_2_param_0]; + ld.param.b64 %rd17, [triton_poi_fused__fused_rms_norm_cat_view_2_param_1]; +$L__tmp0: + .loc 1 21 29 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:29 + mov.u32 %r38, %ctaid.y; + ld.param.b64 %rd18, [triton_poi_fused__fused_rms_norm_cat_view_2_param_2]; + .loc 1 21 48 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:48 + mov.u32 %r39, %ctaid.z; + ld.param.b64 %rd19, [triton_poi_fused__fused_rms_norm_cat_view_2_param_3]; + .loc 1 21 69 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:69 + mov.u32 %r40, %nctaid.y; + ld.param.b64 %rd20, [triton_poi_fused__fused_rms_norm_cat_view_2_param_4]; + .loc 1 21 34 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:34 + mad.lo.s32 %r41, %r39, %r40, %r38; + ld.param.b64 %rd21, [triton_poi_fused__fused_rms_norm_cat_view_2_param_5]; + .loc 1 21 75 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:75 + shl.b32 %r42, %r41, 5; + ld.param.b64 %rd22, [triton_poi_fused__fused_rms_norm_cat_view_2_param_6]; + .loc 1 22 44 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:44 + mov.u32 %r43, %tid.x; + bfe.u32 %r44, %r43, 2, 5; + and.b32 %r45, %r43, 7; + shl.b32 %r46, %r45, 2; + .loc 1 22 23 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:23 + or.b32 %r47, %r42, %r44; + or.b32 %r48, %r42, %r46; + .loc 1 24 28 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:24:28 + mov.u32 %r49, %ctaid.x; + .loc 1 24 33 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:24:33 + shl.b32 %r50, %r49, 5; + .loc 1 25 44 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:25:44 + shl.b32 %r51, %r43, 3; + and.b32 %r52, %r51, 24; + bfe.u32 %r53, %r43, 3, 4; + .loc 1 25 23 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:25:23 + or.b32 %r54, %r52, %r50; + or.b32 %r55, %r53, %r50; + .loc 1 26 21 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:26:21 + setp.lt.s32 %p6, %r54, 128; + setp.lt.s32 %p7, %r55, 128; + .loc 1 27 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:27:19 + bfe.s32 %r56, %r41, 26, 1; + shr.u32 %r57, %r56, 27; + add.s32 %r58, %r47, %r57; + shr.u32 %r59, %r58, 5; + .loc 1 29 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:29:19 + and.b32 %r60, %r58, 33554400; + sub.s32 %r61, %r47, %r60; + .loc 1 35 18 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:35:18 + setp.lt.s32 %p8, %r47, 8192; + setp.lt.s32 %p9, %r48, 8192; + .loc 1 36 39 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:39 + shl.b32 %r62, %r61, 7; + .loc 1 36 35 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:35 + add.s32 %r63, %r62, %r54; + .loc 1 36 44 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:44 + mad.lo.s32 %r64, %r59, 12288, %r63; + .loc 1 36 30 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:30 + mad.wide.s32 %rd1, %r64, 2, %rd16; + .loc 1 36 64 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:64 + and.pred %p1, %p6, %p8; + and.pred %p2, %p7, %p9; + .loc 1 36 57 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:57 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r5, 0; + // begin inline asm + mov.u32 %r1, %r5; + mov.u32 %r2, %r5; + mov.u32 %r3, %r5; + mov.u32 %r4, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2; + // end inline asm + prmt.b32 %r65, %r1, %r3, 0x7632U; + prmt.b32 %r66, %r2, %r4, 0x7632U; + .loc 1 36 123 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:123 + shl.b32 %r67, %r43, 5; + and.b32 %r68, %r67, 480; + and.b32 %r69, %r43, 12; + shl.b32 %r70, %r69, 1; + and.b32 %r71, %r43, 112; + shr.u32 %r72, %r71, 2; + or.b32 %r73, %r68, %r70; + xor.b32 %r74, %r73, %r72; + mov.b32 %r75, global_smem; + add.s32 %r76, %r75, %r74; + prmt.b32 %r77, %r1, %r3, 0x5410U; + st.shared.b32 [%r76], %r77; + xor.b32 %r78, %r74, 32; + add.s32 %r79, %r75, %r78; + st.shared.b32 [%r79+512], %r65; + xor.b32 %r80, %r74, 64; + add.s32 %r81, %r75, %r80; + prmt.b32 %r82, %r2, %r4, 0x5410U; + st.shared.b32 [%r81+1024], %r82; + xor.b32 %r83, %r74, 96; + add.s32 %r84, %r75, %r83; + st.shared.b32 [%r84+1536], %r66; + bar.sync 0; + shl.b32 %r85, %r43, 6; + and.b32 %r86, %r85, 1536; + shl.b32 %r87, %r43, 2; + and.b32 %r88, %r87, 124; + and.b32 %r89, %r43, 32; + shr.u32 %r90, %r89, 4; + and.b32 %r91, %r43, 64; + shr.u32 %r92, %r91, 1; + or.b32 %r93, %r86, %r88; + xor.b32 %r94, %r93, %r92; + or.b32 %r95, %r94, %r90; + add.s32 %r96, %r75, %r95; + ld.shared.b16 %rs1, [%r96]; + xor.b32 %r97, %r95, 8; + add.s32 %r98, %r75, %r97; + ld.shared.b16 %rs2, [%r98+128]; + xor.b32 %r99, %r95, 16; + add.s32 %r100, %r75, %r99; + ld.shared.b16 %rs3, [%r100+256]; + xor.b32 %r101, %r95, 24; + add.s32 %r102, %r75, %r101; + ld.shared.b16 %rs4, [%r102+384]; + xor.b32 %r103, %r95, 64; + add.s32 %r104, %r75, %r103; + ld.shared.b16 %rs5, [%r104]; + xor.b32 %r105, %r95, 72; + add.s32 %r106, %r75, %r105; + ld.shared.b16 %rs6, [%r106+128]; + xor.b32 %r107, %r95, 80; + add.s32 %r108, %r75, %r107; + ld.shared.b16 %rs7, [%r108+256]; + xor.b32 %r109, %r95, 88; + add.s32 %r110, %r75, %r109; + ld.shared.b16 %rs8, [%r110+384]; + cvt.f32.bf16 %r111, %rs1; + cvt.f32.bf16 %r112, %rs2; + cvt.f32.bf16 %r113, %rs3; + cvt.f32.bf16 %r114, %rs4; + cvt.f32.bf16 %r115, %rs5; + cvt.f32.bf16 %r116, %rs6; + cvt.f32.bf16 %r117, %rs7; + cvt.f32.bf16 %r118, %rs8; + .loc 1 38 30 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:38:30 + mad.wide.s32 %rd3, %r48, 4, %rd17; + .loc 1 38 80 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:38:80 + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r6, %r5; + mov.u32 %r7, %r5; + mov.u32 %r8, %r5; + mov.u32 %r9, %r5; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd3 + 0 ], %rd4; + // end inline asm + // begin inline asm + mov.u64 %rd5, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd5, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r10, %r5; + mov.u32 %r11, %r5; + mov.u32 %r12, %r5; + mov.u32 %r13, %r5; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd3 + 0 ], %rd5; + // end inline asm + mov.b32 %r119, 0f43000000; + .loc 1 40 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:40:19 + div.full.f32 %r120, %r6, %r119; + div.full.f32 %r121, %r7, %r119; + div.full.f32 %r122, %r8, %r119; + div.full.f32 %r123, %r9, %r119; + div.full.f32 %r124, %r10, %r119; + div.full.f32 %r125, %r11, %r119; + div.full.f32 %r126, %r12, %r119; + div.full.f32 %r127, %r13, %r119; + .loc 1 42 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:42:19 + add.f32 %r128, %r120, 0f358637BD; + add.f32 %r129, %r121, 0f358637BD; + add.f32 %r130, %r122, 0f358637BD; + add.f32 %r131, %r123, 0f358637BD; + add.f32 %r132, %r124, 0f358637BD; + add.f32 %r133, %r125, 0f358637BD; + add.f32 %r134, %r126, 0f358637BD; + add.f32 %r135, %r127, 0f358637BD; + .loc 1 43 28 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:43:28 + rsqrt.approx.ftz.f32 %r136, %r128; + rsqrt.approx.ftz.f32 %r137, %r129; + rsqrt.approx.ftz.f32 %r138, %r130; + rsqrt.approx.ftz.f32 %r139, %r131; + rsqrt.approx.ftz.f32 %r140, %r132; + rsqrt.approx.ftz.f32 %r141, %r133; + rsqrt.approx.ftz.f32 %r142, %r134; + rsqrt.approx.ftz.f32 %r143, %r135; + .loc 1 44 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:44:19 + mul.f32 %r144, %r136, %r111; + mul.f32 %r145, %r137, %r112; + mul.f32 %r146, %r138, %r113; + mul.f32 %r147, %r139, %r114; + mul.f32 %r148, %r140, %r115; + mul.f32 %r149, %r141, %r116; + mul.f32 %r150, %r142, %r117; + mul.f32 %r151, %r143, %r118; + bar.sync 0; + shl.b32 %r152, %r45, 3; + bfe.s32 %r153, %r43, 3, 1; + and.b32 %r154, %r153, 1088; + bfe.s32 %r155, %r43, 4, 1; + and.b32 %r156, %r155, 2052; + shl.b32 %r157, %r89, 2; + or.b32 %r158, %r156, %r157; + or.b32 %r159, %r154, %r152; + xor.b32 %r160, %r159, %r91; + or.b32 %r161, %r160, %r158; + add.s32 %r162, %r75, %r161; + st.shared.b32 [%r162], %r144; + xor.b32 %r163, %r161, 16; + add.s32 %r164, %r75, %r163; + st.shared.b32 [%r164+256], %r145; + xor.b32 %r165, %r161, 32; + add.s32 %r166, %r75, %r165; + st.shared.b32 [%r166+512], %r146; + xor.b32 %r167, %r161, 48; + add.s32 %r168, %r75, %r167; + st.shared.b32 [%r168+768], %r147; + xor.b32 %r169, %r161, 4; + add.s32 %r170, %r75, %r169; + st.shared.b32 [%r170], %r148; + xor.b32 %r171, %r161, 20; + add.s32 %r172, %r75, %r171; + st.shared.b32 [%r172+256], %r149; + xor.b32 %r173, %r161, 36; + add.s32 %r174, %r75, %r173; + st.shared.b32 [%r174+512], %r150; + xor.b32 %r175, %r161, 52; + add.s32 %r176, %r75, %r175; + st.shared.b32 [%r176+768], %r151; + bar.sync 0; + and.b32 %r177, %r85, 832; + shl.b32 %r178, %r69, 2; + shr.u32 %r179, %r71, 1; + shl.b32 %r180, %r43, 1; + and.b32 %r181, %r180, 4; + or.b32 %r182, %r177, %r178; + xor.b32 %r183, %r182, %r179; + or.b32 %r184, %r183, %r181; + add.s32 %r185, %r75, %r184; + ld.shared.b32 %r186, [%r185]; + ld.shared.b32 %r187, [%r185+128]; + xor.b32 %r188, %r184, 64; + add.s32 %r189, %r75, %r188; + ld.shared.b32 %r190, [%r189+1024]; + ld.shared.b32 %r191, [%r189+1152]; + xor.b32 %r192, %r184, 4; + add.s32 %r193, %r75, %r192; + ld.shared.b32 %r194, [%r193+2048]; + ld.shared.b32 %r195, [%r193+2176]; + xor.b32 %r196, %r184, 68; + add.s32 %r197, %r75, %r196; + ld.shared.b32 %r198, [%r197+3072]; + ld.shared.b32 %r199, [%r197+3200]; + .loc 1 45 31 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:31 + mul.wide.s32 %rd23, %r54, 2; + add.s64 %rd6, %rd18, %rd23; + .loc 1 45 71 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:71 + // begin inline asm + mov.u64 %rd7, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd7, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r14, %r5; + mov.u32 %r15, %r5; + mov.u32 %r16, %r5; + mov.u32 %r17, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd6 + 0 ], %rd7; + // end inline asm + .loc 1 54 45 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:45 + add.s32 %r200, %r64, -3145728; + .loc 1 54 31 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:31 + mad.wide.s32 %rd8, %r200, 2, %rd19; + .loc 1 54 83 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:83 + add.s32 %r201, %r42, -8192; + setp.lt.u32 %p10, %r201, 65536; + and.pred %p3, %p6, %p10; + and.pred %p4, %p7, %p10; + .loc 1 54 67 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:67 + // begin inline asm + mov.u64 %rd9, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd9, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r18, %r5; + mov.u32 %r19, %r5; + mov.u32 %r20, %r5; + mov.u32 %r21, %r5; + @%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd8 + 0 ], %rd9; + // end inline asm + prmt.b32 %r202, %r18, %r20, 0x7632U; + prmt.b32 %r203, %r19, %r21, 0x7632U; + .loc 1 54 134 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134 + bar.sync 0; + prmt.b32 %r204, %r18, %r20, 0x5410U; + st.shared.b32 [%r76], %r204; + st.shared.b32 [%r79+512], %r202; + prmt.b32 %r205, %r19, %r21, 0x5410U; + st.shared.b32 [%r81+1024], %r205; + st.shared.b32 [%r84+1536], %r203; + bar.sync 0; + ld.shared.b16 %rs9, [%r96]; + ld.shared.b16 %rs10, [%r98+128]; + ld.shared.b16 %rs11, [%r100+256]; + ld.shared.b16 %rs12, [%r102+384]; + ld.shared.b16 %rs13, [%r104]; + ld.shared.b16 %rs14, [%r106+128]; + ld.shared.b16 %rs15, [%r108+256]; + ld.shared.b16 %rs16, [%r110+384]; + .loc 1 56 52 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:52 + add.s32 %r206, %r48, -8192; + .loc 1 56 31 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:31 + mad.wide.s32 %rd10, %r206, 4, %rd20; + .loc 1 56 90 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:90 + // begin inline asm + mov.u64 %rd11, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd11, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r22, %r5; + mov.u32 %r23, %r5; + mov.u32 %r24, %r5; + mov.u32 %r25, %r5; + @%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r22, %r23, %r24, %r25 }, [ %rd10 + 0 ], %rd11; + // end inline asm + // begin inline asm + mov.u64 %rd12, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd12, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r26, %r5; + mov.u32 %r27, %r5; + mov.u32 %r28, %r5; + mov.u32 %r29, %r5; + @%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd10 + 0 ], %rd12; + // end inline asm + .loc 1 58 21 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:58:21 + div.full.f32 %r207, %r22, %r119; + div.full.f32 %r208, %r23, %r119; + div.full.f32 %r209, %r24, %r119; + div.full.f32 %r210, %r25, %r119; + div.full.f32 %r211, %r26, %r119; + div.full.f32 %r212, %r27, %r119; + div.full.f32 %r213, %r28, %r119; + div.full.f32 %r214, %r29, %r119; + .loc 1 60 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:60:20 + add.f32 %r215, %r207, 0f358637BD; + add.f32 %r216, %r208, 0f358637BD; + add.f32 %r217, %r209, 0f358637BD; + add.f32 %r218, %r210, 0f358637BD; + add.f32 %r219, %r211, 0f358637BD; + add.f32 %r220, %r212, 0f358637BD; + add.f32 %r221, %r213, 0f358637BD; + add.f32 %r222, %r214, 0f358637BD; + .loc 1 61 28 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:61:28 + rsqrt.approx.ftz.f32 %r223, %r215; + rsqrt.approx.ftz.f32 %r224, %r216; + rsqrt.approx.ftz.f32 %r225, %r217; + rsqrt.approx.ftz.f32 %r226, %r218; + rsqrt.approx.ftz.f32 %r227, %r219; + rsqrt.approx.ftz.f32 %r228, %r220; + rsqrt.approx.ftz.f32 %r229, %r221; + rsqrt.approx.ftz.f32 %r230, %r222; + .loc 1 54 134 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134 + cvt.f32.bf16 %r231, %rs16; + cvt.f32.bf16 %r232, %rs15; + cvt.f32.bf16 %r233, %rs14; + cvt.f32.bf16 %r234, %rs13; + cvt.f32.bf16 %r235, %rs12; + cvt.f32.bf16 %r236, %rs11; + cvt.f32.bf16 %r237, %rs10; + cvt.f32.bf16 %r238, %rs9; + .loc 1 23 21 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:23:21 + setp.lt.s32 %p11, %r47, 73728; + .loc 1 62 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:62:20 + mul.f32 %r239, %r223, %r238; + mul.f32 %r240, %r224, %r237; + mul.f32 %r241, %r225, %r236; + mul.f32 %r242, %r226, %r235; + mul.f32 %r243, %r227, %r234; + mul.f32 %r244, %r228, %r233; + mul.f32 %r245, %r229, %r232; + mul.f32 %r246, %r230, %r231; + bar.sync 0; + st.shared.b32 [%r162], %r239; + st.shared.b32 [%r164+256], %r240; + st.shared.b32 [%r166+512], %r241; + st.shared.b32 [%r168+768], %r242; + st.shared.b32 [%r170], %r243; + st.shared.b32 [%r172+256], %r244; + st.shared.b32 [%r174+512], %r245; + st.shared.b32 [%r176+768], %r246; + bar.sync 0; + ld.shared.b32 %r247, [%r185]; + ld.shared.b32 %r248, [%r185+128]; + ld.shared.b32 %r249, [%r189+1024]; + ld.shared.b32 %r250, [%r189+1152]; + ld.shared.b32 %r251, [%r193+2048]; + ld.shared.b32 %r252, [%r193+2176]; + ld.shared.b32 %r253, [%r197+3072]; + ld.shared.b32 %r254, [%r197+3200]; + .loc 1 63 31 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:31 + add.s64 %rd13, %rd21, %rd23; + .loc 1 63 71 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:71 + // begin inline asm + mov.u64 %rd14, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd14, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r30, %r5; + mov.u32 %r31, %r5; + mov.u32 %r32, %r5; + mov.u32 %r33, %r5; + @%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r30, %r31, %r32, %r33 }, [ %rd13 + 0 ], %rd14; + // end inline asm + .loc 1 70 34 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:34 + shl.b32 %r255, %r47, 7; + .loc 1 70 30 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:30 + add.s32 %r256, %r255, %r54; + .loc 1 70 25 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:25 + mad.wide.s32 %rd15, %r256, 2, %rd22; + .loc 1 70 54 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:54 + and.pred %p5, %p6, %p11; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs17, %rs18}, %r14; + cvt.f32.bf16 %r257, %rs17; + cvt.f32.bf16 %r258, %rs18; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r259, %r190, %r258; + mul.f32 %r260, %r186, %r257; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs19, %rs20}, %r30; + cvt.f32.bf16 %r261, %rs19; + cvt.f32.bf16 %r262, %rs20; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r263, %r249, %r262; + mul.f32 %r264, %r247, %r261; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r265, %r260, %r264, %p8; + selp.f32 %r266, %r259, %r263, %p8; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r34, %r266, %r265; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs21, %rs22}, %r15; + cvt.f32.bf16 %r267, %rs21; + cvt.f32.bf16 %r268, %rs22; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r269, %r198, %r268; + mul.f32 %r270, %r194, %r267; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs23, %rs24}, %r31; + cvt.f32.bf16 %r271, %rs23; + cvt.f32.bf16 %r272, %rs24; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r273, %r253, %r272; + mul.f32 %r274, %r251, %r271; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r275, %r270, %r274, %p8; + selp.f32 %r276, %r269, %r273, %p8; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r35, %r276, %r275; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs25, %rs26}, %r16; + cvt.f32.bf16 %r277, %rs25; + cvt.f32.bf16 %r278, %rs26; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r279, %r191, %r278; + mul.f32 %r280, %r187, %r277; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs27, %rs28}, %r32; + cvt.f32.bf16 %r281, %rs27; + cvt.f32.bf16 %r282, %rs28; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r283, %r250, %r282; + mul.f32 %r284, %r248, %r281; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r285, %r280, %r284, %p8; + selp.f32 %r286, %r279, %r283, %p8; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r36, %r286, %r285; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs29, %rs30}, %r17; + cvt.f32.bf16 %r287, %rs29; + cvt.f32.bf16 %r288, %rs30; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r289, %r199, %r288; + mul.f32 %r290, %r195, %r287; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs31, %rs32}, %r33; + cvt.f32.bf16 %r291, %rs31; + cvt.f32.bf16 %r292, %rs32; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r293, %r254, %r292; + mul.f32 %r294, %r252, %r291; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r295, %r290, %r294, %p8; + selp.f32 %r296, %r289, %r293, %p8; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r37, %r296, %r295; + // begin inline asm + @%p5 st.global.v4.b32 [ %rd15 + 0 ], { %r34, %r35, %r36, %r37 }; + // end inline asm + .loc 1 70 4 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 50 +.b8 104 +.b8 105 +.b8 106 +.b8 51 +.b8 104 +.b8 109 +.b8 108 +.b8 111 +.b8 117 +.b8 109 +.b8 120 +.b8 100 +.b8 109 +.b8 104 +.b8 117 +.b8 101 +.b8 122 +.b8 115 +.b8 121 +.b8 104 +.b8 107 +.b8 109 +.b8 110 +.b8 113 +.b8 103 +.b8 110 +.b8 102 +.b8 97 +.b8 53 +.b8 105 +.b8 118 +.b8 114 +.b8 101 +.b8 50 +.b8 55 +.b8 117 +.b8 111 +.b8 115 +.b8 121 +.b8 109 +.b8 97 +.b8 109 +.b8 51 +.b8 100 +.b8 114 +.b8 55 +.b8 97 +.b8 53 +.b8 120 +.b8 98 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 50 +.b8 104 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.source b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.source new file mode 100644 index 0000000000000000000000000000000000000000..bedde25ba37a672088b4ad4a355020770a713c28 --- /dev/null +++ b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.source @@ -0,0 +1,415 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0) +#loc99 = loc("in_ptr0"(#loc)) +#loc100 = loc("in_ptr1"(#loc)) +#loc101 = loc("in_ptr2"(#loc)) +#loc102 = loc("in_ptr3"(#loc)) +#loc103 = loc("in_ptr4"(#loc)) +#loc104 = loc("in_ptr5"(#loc)) +#loc105 = loc("out_ptr0"(#loc)) +#loc106 = loc("ynumel"(#loc)) +#loc107 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %ynumel_0 = arith.constant 73728 : i32 loc(#loc108) + %xnumel_1 = arith.constant 128 : i32 loc(#loc109) + %yoffset = tt.get_program_id y : i32 loc(#loc110) + %yoffset_2 = tt.get_program_id z : i32 loc(#loc111) + %yoffset_3 = tt.get_num_programs y : i32 loc(#loc112) + %yoffset_4 = arith.muli %yoffset_2, %yoffset_3 : i32 loc(#loc113) + %yoffset_5 = arith.addi %yoffset, %yoffset_4 : i32 loc(#loc114) + %yoffset_6 = arith.constant 32 : i32 loc(#loc115) + %yoffset_7 = arith.constant 32 : i32 loc(#loc115) + %yoffset_8 = arith.muli %yoffset_5, %yoffset_7 : i32 loc(#loc115) + %yindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc116) + %yindex_9 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc117) + %yindex_10 = tt.splat %yoffset_8 : i32 -> tensor<32x1xi32> loc(#loc118) + %yindex_11 = arith.addi %yindex_10, %yindex_9 : tensor<32x1xi32> loc(#loc118) + %ymask = arith.constant dense<73728> : tensor<32x1xi32> loc(#loc119) + %ymask_12 = arith.cmpi slt, %yindex_11, %ymask : tensor<32x1xi32> loc(#loc119) + %xoffset = tt.get_program_id x : i32 loc(#loc120) + %xoffset_13 = arith.constant 32 : i32 loc(#loc121) + %xoffset_14 = arith.constant 32 : i32 loc(#loc121) + %xoffset_15 = arith.muli %xoffset, %xoffset_14 : i32 loc(#loc121) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc122) + %xindex_16 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc123) + %xindex_17 = tt.splat %xoffset_15 : i32 -> tensor<1x32xi32> loc(#loc124) + %xindex_18 = arith.addi %xindex_17, %xindex_16 : tensor<1x32xi32> loc(#loc124) + %xmask = arith.constant dense<128> : tensor<1x32xi32> loc(#loc125) + %xmask_19 = arith.cmpi slt, %xindex_18, %xmask : tensor<1x32xi32> loc(#loc125) + %y1 = arith.constant 32 : i32 loc(#loc126) + %y1_20 = arith.constant 32 : i32 loc(#loc126) + %y1_21 = arith.constant dense<32> : tensor<32x1xi32> loc(#loc126) + %y1_22 = arith.divsi %yindex_11, %y1_21 : tensor<32x1xi32> loc(#loc126) + %y0 = arith.constant 32 : i32 loc(#loc127) + %y0_23 = arith.constant 32 : i32 loc(#loc127) + %y0_24 = arith.constant dense<32> : tensor<32x1xi32> loc(#loc127) + %y0_25 = arith.remsi %yindex_11, %y0_24 : tensor<32x1xi32> loc(#loc127) + %tmp1 = arith.constant 0 : i64 loc(#loc128) + %tmp1_26 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc128) + %tmp2 = arith.extsi %y1_22 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc129) + %tmp2_27 = arith.constant dense<0> : tensor<32x1xi64> loc(#loc129) + %tmp2_28 = arith.cmpi sge, %tmp2, %tmp2_27 : tensor<32x1xi64> loc(#loc129) + %tmp3 = arith.constant 256 : i64 loc(#loc130) + %tmp3_29 = arith.constant dense<256> : tensor<1x1xi64> loc(#loc130) + %tmp4 = arith.extsi %y1_22 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc131) + %tmp4_30 = arith.constant dense<256> : tensor<32x1xi64> loc(#loc131) + %tmp4_31 = arith.cmpi slt, %tmp4, %tmp4_30 : tensor<32x1xi64> loc(#loc131) + %tmp5 = arith.constant 128 : i32 loc(#loc132) + %tmp5_32 = arith.constant 128 : i32 loc(#loc132) + %tmp5_33 = arith.constant dense<128> : tensor<32x1xi32> loc(#loc132) + %tmp5_34 = arith.muli %tmp5_33, %y0_25 : tensor<32x1xi32> loc(#loc132) + %tmp5_35 = tt.broadcast %xindex_18 : tensor<1x32xi32> -> tensor<32x32xi32> loc(#loc133) + %tmp5_36 = tt.broadcast %tmp5_34 : tensor<32x1xi32> -> tensor<32x32xi32> loc(#loc133) + %tmp5_37 = arith.addi %tmp5_35, %tmp5_36 : tensor<32x32xi32> loc(#loc133) + %tmp5_38 = arith.constant 12288 : i32 loc(#loc134) + %tmp5_39 = arith.constant 12288 : i32 loc(#loc134) + %tmp5_40 = arith.constant dense<12288> : tensor<32x1xi32> loc(#loc134) + %tmp5_41 = arith.muli %tmp5_40, %y1_22 : tensor<32x1xi32> loc(#loc134) + %tmp5_42 = tt.broadcast %tmp5_41 : tensor<32x1xi32> -> tensor<32x32xi32> loc(#loc135) + %tmp5_43 = arith.addi %tmp5_37, %tmp5_42 : tensor<32x32xi32> loc(#loc135) + %tmp5_44 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x32x!tt.ptr> loc(#loc136) + %tmp5_45 = tt.addptr %tmp5_44, %tmp5_43 : tensor<32x32x!tt.ptr>, tensor<32x32xi32> loc(#loc136) + %tmp5_46 = tt.broadcast %tmp4_31 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc137) + %tmp5_47 = tt.broadcast %xmask_19 : tensor<1x32xi1> -> tensor<32x32xi1> loc(#loc137) + %tmp5_48 = arith.andi %tmp5_46, %tmp5_47 : tensor<32x32xi1> loc(#loc137) + %tmp5_49 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc138) + %tmp5_50 = arith.andi %tmp5_48, %tmp5_49 : tensor<32x32xi1> loc(#loc138) + %tmp5_51 = arith.constant 0.000000e+00 : f32 loc(#loc139) + %tmp5_52 = arith.constant dense<0.000000e+00> : tensor<32x32xf32> loc(#loc139) + %tmp5_53 = arith.truncf %tmp5_52 : tensor<32x32xf32> to tensor<32x32xbf16> loc(#loc139) + %tmp5_54 = tt.load %tmp5_45, %tmp5_50, %tmp5_53 evictionPolicy = evict_last : tensor<32x32x!tt.ptr> loc(#loc139) + %tmp5_55 = arith.extf %tmp5_54 : tensor<32x32xbf16> to tensor<32x32xf32> loc(#loc140) + %tmp7 = arith.constant 32 : i32 loc(#loc141) + %tmp7_56 = arith.constant 32 : i32 loc(#loc141) + %tmp7_57 = arith.constant dense<32> : tensor<32x1xi32> loc(#loc141) + %tmp7_58 = arith.muli %tmp7_57, %y1_22 : tensor<32x1xi32> loc(#loc141) + %tmp7_59 = arith.addi %y0_25, %tmp7_58 : tensor<32x1xi32> loc(#loc142) + %tmp7_60 = tt.broadcast %tmp7_59 : tensor<32x1xi32> -> tensor<32x32xi32> loc(#loc143) + %tmp7_61 = tt.splat %in_ptr1 : !tt.ptr -> tensor<32x32x!tt.ptr> loc(#loc144) + %tmp7_62 = tt.addptr %tmp7_61, %tmp7_60 : tensor<32x32x!tt.ptr>, tensor<32x32xi32> loc(#loc144) + %tmp7_63 = tt.broadcast %tmp4_31 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc145) + %tmp7_64 = tt.broadcast %xmask_19 : tensor<1x32xi1> -> tensor<32x32xi1> loc(#loc145) + %tmp7_65 = arith.andi %tmp7_63, %tmp7_64 : tensor<32x32xi1> loc(#loc145) + %tmp7_66 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc146) + %tmp7_67 = arith.andi %tmp7_65, %tmp7_66 : tensor<32x32xi1> loc(#loc146) + %tmp7_68 = arith.constant 0.000000e+00 : f32 loc(#loc147) + %tmp7_69 = arith.constant dense<0.000000e+00> : tensor<32x32xf32> loc(#loc147) + %tmp7_70 = tt.load %tmp7_62, %tmp7_67, %tmp7_69 evictionPolicy = evict_last : tensor<32x32x!tt.ptr> loc(#loc147) + %tmp8 = arith.constant 1.280000e+02 : f32 loc(#loc148) + %tmp9 = arith.constant dense<1.280000e+02> : tensor<32x32xf32> loc(#loc149) + %tmp9_71 = arith.divf %tmp7_70, %tmp9 : tensor<32x32xf32> loc(#loc149) + %tmp10 = arith.constant 9.99999997E-7 : f32 loc(#loc150) + %tmp11 = arith.constant dense<9.99999997E-7> : tensor<32x32xf32> loc(#loc151) + %tmp11_72 = arith.addf %tmp9_71, %tmp11 : tensor<32x32xf32> loc(#loc151) + %tmp12 = tt.extern_elementwise %tmp11_72 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<32x32xf32>) -> tensor<32x32xf32> loc(#loc152) + %tmp13 = arith.mulf %tmp5_55, %tmp12 : tensor<32x32xf32> loc(#loc153) + %tmp14 = tt.broadcast %xindex_18 : tensor<1x32xi32> -> tensor<32x32xi32> loc(#loc154) + %tmp14_73 = tt.splat %in_ptr2 : !tt.ptr -> tensor<32x32x!tt.ptr> loc(#loc155) + %tmp14_74 = tt.addptr %tmp14_73, %tmp14 : tensor<32x32x!tt.ptr>, tensor<32x32xi32> loc(#loc155) + %tmp14_75 = tt.broadcast %tmp4_31 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc156) + %tmp14_76 = tt.broadcast %xmask_19 : tensor<1x32xi1> -> tensor<32x32xi1> loc(#loc156) + %tmp14_77 = arith.andi %tmp14_75, %tmp14_76 : tensor<32x32xi1> loc(#loc156) + %tmp14_78 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc157) + %tmp14_79 = arith.andi %tmp14_77, %tmp14_78 : tensor<32x32xi1> loc(#loc157) + %tmp14_80 = arith.constant 0.000000e+00 : f32 loc(#loc158) + %tmp14_81 = arith.constant dense<0.000000e+00> : tensor<32x32xf32> loc(#loc158) + %tmp14_82 = arith.truncf %tmp14_81 : tensor<32x32xf32> to tensor<32x32xbf16> loc(#loc158) + %tmp14_83 = tt.load %tmp14_74, %tmp14_79, %tmp14_82 evictionPolicy = evict_last : tensor<32x32x!tt.ptr> loc(#loc158) + %tmp14_84 = arith.extf %tmp14_83 : tensor<32x32xbf16> to tensor<32x32xf32> loc(#loc159) + %tmp16 = arith.mulf %tmp13, %tmp14_84 : tensor<32x32xf32> loc(#loc160) + %tmp18 = arith.constant 0.000000e+00 : f32 loc(#loc161) + %tmp18_85 = arith.constant dense<0.000000e+00> : tensor<32x32xf32> loc(#loc161) + %tmp19 = tt.broadcast %tmp4_31 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc162) + %tmp19_86 = arith.select %tmp19, %tmp16, %tmp18_85 : tensor<32x32xi1>, tensor<32x32xf32> loc(#loc162) + %tmp20 = arith.extsi %y1_22 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc163) + %tmp20_87 = arith.constant dense<256> : tensor<32x1xi64> loc(#loc163) + %tmp20_88 = arith.cmpi sge, %tmp20, %tmp20_87 : tensor<32x1xi64> loc(#loc163) + %tmp21 = arith.constant 2304 : i64 loc(#loc164) + %tmp21_89 = arith.constant dense<2304> : tensor<1x1xi64> loc(#loc164) + %tmp22 = arith.extsi %y1_22 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc165) + %tmp22_90 = arith.constant dense<2304> : tensor<32x1xi64> loc(#loc165) + %tmp22_91 = arith.cmpi slt, %tmp22, %tmp22_90 : tensor<32x1xi64> loc(#loc165) + %tmp23 = arith.constant 128 : i32 loc(#loc166) + %tmp23_92 = arith.constant 128 : i32 loc(#loc166) + %tmp23_93 = arith.constant dense<128> : tensor<32x1xi32> loc(#loc166) + %tmp23_94 = arith.muli %tmp23_93, %y0_25 : tensor<32x1xi32> loc(#loc166) + %tmp23_95 = tt.broadcast %xindex_18 : tensor<1x32xi32> -> tensor<32x32xi32> loc(#loc167) + %tmp23_96 = tt.broadcast %tmp23_94 : tensor<32x1xi32> -> tensor<32x32xi32> loc(#loc167) + %tmp23_97 = arith.addi %tmp23_95, %tmp23_96 : tensor<32x32xi32> loc(#loc167) + %tmp23_98 = arith.constant -256 : i32 loc(#loc168) + %tmp23_99 = arith.constant -256 : i32 loc(#loc168) + %tmp23_100 = arith.constant dense<-256> : tensor<32x1xi32> loc(#loc168) + %tmp23_101 = arith.addi %tmp23_100, %y1_22 : tensor<32x1xi32> loc(#loc168) + %tmp23_102 = arith.constant 12288 : i32 loc(#loc169) + %tmp23_103 = arith.constant 12288 : i32 loc(#loc169) + %tmp23_104 = arith.constant dense<12288> : tensor<32x1xi32> loc(#loc169) + %tmp23_105 = arith.muli %tmp23_104, %tmp23_101 : tensor<32x1xi32> loc(#loc169) + %tmp23_106 = tt.broadcast %tmp23_105 : tensor<32x1xi32> -> tensor<32x32xi32> loc(#loc170) + %tmp23_107 = arith.addi %tmp23_97, %tmp23_106 : tensor<32x32xi32> loc(#loc170) + %tmp23_108 = tt.splat %in_ptr3 : !tt.ptr -> tensor<32x32x!tt.ptr> loc(#loc171) + %tmp23_109 = tt.addptr %tmp23_108, %tmp23_107 : tensor<32x32x!tt.ptr>, tensor<32x32xi32> loc(#loc171) + %tmp23_110 = tt.broadcast %tmp20_88 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc172) + %tmp23_111 = tt.broadcast %xmask_19 : tensor<1x32xi1> -> tensor<32x32xi1> loc(#loc172) + %tmp23_112 = arith.andi %tmp23_110, %tmp23_111 : tensor<32x32xi1> loc(#loc172) + %tmp23_113 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc173) + %tmp23_114 = arith.andi %tmp23_112, %tmp23_113 : tensor<32x32xi1> loc(#loc173) + %tmp23_115 = arith.constant 0.000000e+00 : f32 loc(#loc174) + %tmp23_116 = arith.constant dense<0.000000e+00> : tensor<32x32xf32> loc(#loc174) + %tmp23_117 = arith.truncf %tmp23_116 : tensor<32x32xf32> to tensor<32x32xbf16> loc(#loc174) + %tmp23_118 = tt.load %tmp23_109, %tmp23_114, %tmp23_117 evictionPolicy = evict_last : tensor<32x32x!tt.ptr> loc(#loc174) + %tmp23_119 = arith.extf %tmp23_118 : tensor<32x32xbf16> to tensor<32x32xf32> loc(#loc175) + %tmp25 = arith.constant -256 : i32 loc(#loc176) + %tmp25_120 = arith.constant -256 : i32 loc(#loc176) + %tmp25_121 = arith.constant dense<-256> : tensor<32x1xi32> loc(#loc176) + %tmp25_122 = arith.addi %tmp25_121, %y1_22 : tensor<32x1xi32> loc(#loc176) + %tmp25_123 = arith.constant 32 : i32 loc(#loc177) + %tmp25_124 = arith.constant 32 : i32 loc(#loc177) + %tmp25_125 = arith.constant dense<32> : tensor<32x1xi32> loc(#loc177) + %tmp25_126 = arith.muli %tmp25_125, %tmp25_122 : tensor<32x1xi32> loc(#loc177) + %tmp25_127 = arith.addi %y0_25, %tmp25_126 : tensor<32x1xi32> loc(#loc178) + %tmp25_128 = tt.broadcast %tmp25_127 : tensor<32x1xi32> -> tensor<32x32xi32> loc(#loc179) + %tmp25_129 = tt.splat %in_ptr4 : !tt.ptr -> tensor<32x32x!tt.ptr> loc(#loc180) + %tmp25_130 = tt.addptr %tmp25_129, %tmp25_128 : tensor<32x32x!tt.ptr>, tensor<32x32xi32> loc(#loc180) + %tmp25_131 = tt.broadcast %tmp20_88 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc181) + %tmp25_132 = tt.broadcast %xmask_19 : tensor<1x32xi1> -> tensor<32x32xi1> loc(#loc181) + %tmp25_133 = arith.andi %tmp25_131, %tmp25_132 : tensor<32x32xi1> loc(#loc181) + %tmp25_134 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc182) + %tmp25_135 = arith.andi %tmp25_133, %tmp25_134 : tensor<32x32xi1> loc(#loc182) + %tmp25_136 = arith.constant 0.000000e+00 : f32 loc(#loc183) + %tmp25_137 = arith.constant dense<0.000000e+00> : tensor<32x32xf32> loc(#loc183) + %tmp25_138 = tt.load %tmp25_130, %tmp25_135, %tmp25_137 evictionPolicy = evict_last : tensor<32x32x!tt.ptr> loc(#loc183) + %tmp26 = arith.constant 1.280000e+02 : f32 loc(#loc184) + %tmp27 = arith.constant dense<1.280000e+02> : tensor<32x32xf32> loc(#loc185) + %tmp27_139 = arith.divf %tmp25_138, %tmp27 : tensor<32x32xf32> loc(#loc185) + %tmp28 = arith.constant 9.99999997E-7 : f32 loc(#loc186) + %tmp29 = arith.constant dense<9.99999997E-7> : tensor<32x32xf32> loc(#loc187) + %tmp29_140 = arith.addf %tmp27_139, %tmp29 : tensor<32x32xf32> loc(#loc187) + %tmp30 = tt.extern_elementwise %tmp29_140 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<32x32xf32>) -> tensor<32x32xf32> loc(#loc188) + %tmp31 = arith.mulf %tmp23_119, %tmp30 : tensor<32x32xf32> loc(#loc189) + %tmp32 = tt.broadcast %xindex_18 : tensor<1x32xi32> -> tensor<32x32xi32> loc(#loc190) + %tmp32_141 = tt.splat %in_ptr5 : !tt.ptr -> tensor<32x32x!tt.ptr> loc(#loc191) + %tmp32_142 = tt.addptr %tmp32_141, %tmp32 : tensor<32x32x!tt.ptr>, tensor<32x32xi32> loc(#loc191) + %tmp32_143 = tt.broadcast %tmp20_88 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc192) + %tmp32_144 = tt.broadcast %xmask_19 : tensor<1x32xi1> -> tensor<32x32xi1> loc(#loc192) + %tmp32_145 = arith.andi %tmp32_143, %tmp32_144 : tensor<32x32xi1> loc(#loc192) + %tmp32_146 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc193) + %tmp32_147 = arith.andi %tmp32_145, %tmp32_146 : tensor<32x32xi1> loc(#loc193) + %tmp32_148 = arith.constant 0.000000e+00 : f32 loc(#loc194) + %tmp32_149 = arith.constant dense<0.000000e+00> : tensor<32x32xf32> loc(#loc194) + %tmp32_150 = arith.truncf %tmp32_149 : tensor<32x32xf32> to tensor<32x32xbf16> loc(#loc194) + %tmp32_151 = tt.load %tmp32_142, %tmp32_147, %tmp32_150 evictionPolicy = evict_last : tensor<32x32x!tt.ptr> loc(#loc194) + %tmp32_152 = arith.extf %tmp32_151 : tensor<32x32xbf16> to tensor<32x32xf32> loc(#loc195) + %tmp34 = arith.mulf %tmp31, %tmp32_152 : tensor<32x32xf32> loc(#loc196) + %tmp36 = arith.constant 0.000000e+00 : f32 loc(#loc197) + %tmp36_153 = arith.constant dense<0.000000e+00> : tensor<32x32xf32> loc(#loc197) + %tmp37 = tt.broadcast %tmp20_88 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc198) + %tmp37_154 = arith.select %tmp37, %tmp34, %tmp36_153 : tensor<32x32xi1>, tensor<32x32xf32> loc(#loc198) + %tmp38 = tt.broadcast %tmp4_31 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc199) + %tmp38_155 = arith.select %tmp38, %tmp19_86, %tmp37_154 : tensor<32x32xi1>, tensor<32x32xf32> loc(#loc199) + %c128_i32 = arith.constant 128 : i32 loc(#loc93) + %c128_i32_156 = arith.constant 128 : i32 loc(#loc93) + %cst = arith.constant dense<128> : tensor<32x1xi32> loc(#loc93) + %0 = arith.muli %cst, %yindex_11 : tensor<32x1xi32> loc(#loc93) + %1 = tt.broadcast %xindex_18 : tensor<1x32xi32> -> tensor<32x32xi32> loc(#loc94) + %2 = tt.broadcast %0 : tensor<32x1xi32> -> tensor<32x32xi32> loc(#loc94) + %3 = arith.addi %1, %2 : tensor<32x32xi32> loc(#loc94) + %4 = tt.splat %out_ptr0 : !tt.ptr -> tensor<32x32x!tt.ptr> loc(#loc95) + %5 = tt.addptr %4, %3 : tensor<32x32x!tt.ptr>, tensor<32x32xi32> loc(#loc95) + %6 = tt.broadcast %xmask_19 : tensor<1x32xi1> -> tensor<32x32xi1> loc(#loc96) + %7 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc96) + %8 = arith.andi %6, %7 : tensor<32x32xi1> loc(#loc96) + %9 = arith.truncf %tmp38_155 : tensor<32x32xf32> to tensor<32x32xbf16> loc(#loc97) + tt.store %5, %9, %8 : tensor<32x32x!tt.ptr> loc(#loc97) + tt.return loc(#loc98) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":20:13) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:36) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:36) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":32:30) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":33:19) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":34:32) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:60) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:87) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:95) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":39:11) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":41:12) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:51) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:78) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:86) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":49:38) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":52:34) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":53:19) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:40) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:36) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:65) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:70) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:98) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:106) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":57:12) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":59:12) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:51) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:79) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:87) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":67:38) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4) +#loc108 = loc("ynumel"(#loc1)) +#loc109 = loc("xnumel"(#loc2)) +#loc110 = loc("yoffset"(#loc3)) +#loc111 = loc("yoffset"(#loc4)) +#loc112 = loc("yoffset"(#loc5)) +#loc113 = loc("yoffset"(#loc6)) +#loc114 = loc("yoffset"(#loc7)) +#loc115 = loc("yoffset"(#loc8)) +#loc116 = loc("yindex"(#loc9)) +#loc117 = loc("yindex"(#loc10)) +#loc118 = loc("yindex"(#loc11)) +#loc119 = loc("ymask"(#loc12)) +#loc120 = loc("xoffset"(#loc13)) +#loc121 = loc("xoffset"(#loc14)) +#loc122 = loc("xindex"(#loc15)) +#loc123 = loc("xindex"(#loc16)) +#loc124 = loc("xindex"(#loc17)) +#loc125 = loc("xmask"(#loc18)) +#loc126 = loc("y1"(#loc19)) +#loc127 = loc("y0"(#loc20)) +#loc128 = loc("tmp1"(#loc21)) +#loc129 = loc("tmp2"(#loc22)) +#loc130 = loc("tmp3"(#loc23)) +#loc131 = loc("tmp4"(#loc24)) +#loc132 = loc("tmp5"(#loc25)) +#loc133 = loc("tmp5"(#loc26)) +#loc134 = loc("tmp5"(#loc27)) +#loc135 = loc("tmp5"(#loc28)) +#loc136 = loc("tmp5"(#loc29)) +#loc137 = loc("tmp5"(#loc30)) +#loc138 = loc("tmp5"(#loc31)) +#loc139 = loc("tmp5"(#loc32)) +#loc140 = loc("tmp5"(#loc33)) +#loc141 = loc("tmp7"(#loc34)) +#loc142 = loc("tmp7"(#loc35)) +#loc143 = loc("tmp7"(#loc36)) +#loc144 = loc("tmp7"(#loc37)) +#loc145 = loc("tmp7"(#loc38)) +#loc146 = loc("tmp7"(#loc39)) +#loc147 = loc("tmp7"(#loc40)) +#loc148 = loc("tmp8"(#loc41)) +#loc149 = loc("tmp9"(#loc42)) +#loc150 = loc("tmp10"(#loc43)) +#loc151 = loc("tmp11"(#loc44)) +#loc152 = loc("tmp12"(#loc45)) +#loc153 = loc("tmp13"(#loc46)) +#loc154 = loc("tmp14"(#loc47)) +#loc155 = loc("tmp14"(#loc48)) +#loc156 = loc("tmp14"(#loc49)) +#loc157 = loc("tmp14"(#loc50)) +#loc158 = loc("tmp14"(#loc51)) +#loc159 = loc("tmp14"(#loc52)) +#loc160 = loc("tmp16"(#loc53)) +#loc161 = loc("tmp18"(#loc54)) +#loc162 = loc("tmp19"(#loc55)) +#loc163 = loc("tmp20"(#loc56)) +#loc164 = loc("tmp21"(#loc57)) +#loc165 = loc("tmp22"(#loc58)) +#loc166 = loc("tmp23"(#loc59)) +#loc167 = loc("tmp23"(#loc60)) +#loc168 = loc("tmp23"(#loc61)) +#loc169 = loc("tmp23"(#loc62)) +#loc170 = loc("tmp23"(#loc63)) +#loc171 = loc("tmp23"(#loc64)) +#loc172 = loc("tmp23"(#loc65)) +#loc173 = loc("tmp23"(#loc66)) +#loc174 = loc("tmp23"(#loc67)) +#loc175 = loc("tmp23"(#loc68)) +#loc176 = loc("tmp25"(#loc69)) +#loc177 = loc("tmp25"(#loc70)) +#loc178 = loc("tmp25"(#loc71)) +#loc179 = loc("tmp25"(#loc72)) +#loc180 = loc("tmp25"(#loc73)) +#loc181 = loc("tmp25"(#loc74)) +#loc182 = loc("tmp25"(#loc75)) +#loc183 = loc("tmp25"(#loc76)) +#loc184 = loc("tmp26"(#loc77)) +#loc185 = loc("tmp27"(#loc78)) +#loc186 = loc("tmp28"(#loc79)) +#loc187 = loc("tmp29"(#loc80)) +#loc188 = loc("tmp30"(#loc81)) +#loc189 = loc("tmp31"(#loc82)) +#loc190 = loc("tmp32"(#loc83)) +#loc191 = loc("tmp32"(#loc84)) +#loc192 = loc("tmp32"(#loc85)) +#loc193 = loc("tmp32"(#loc86)) +#loc194 = loc("tmp32"(#loc87)) +#loc195 = loc("tmp32"(#loc88)) +#loc196 = loc("tmp34"(#loc89)) +#loc197 = loc("tmp36"(#loc90)) +#loc198 = loc("tmp37"(#loc91)) +#loc199 = loc("tmp38"(#loc92)) diff --git a/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..53961e330d1b0b3c12cf7204ff1cc228278ac7cd --- /dev/null +++ b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir @@ -0,0 +1,287 @@ +#blocked = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [1, 0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("in_ptr1"(#loc)) +#loc72 = loc("in_ptr2"(#loc)) +#loc73 = loc("in_ptr3"(#loc)) +#loc74 = loc("in_ptr4"(#loc)) +#loc75 = loc("in_ptr5"(#loc)) +#loc76 = loc("out_ptr0"(#loc)) +#loc77 = loc("ynumel"(#loc)) +#loc78 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<-256> : tensor<32x1xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<-256> : tensor<32x1xi32, #blocked1> loc(#loc1) + %cst_1 = arith.constant dense<12288> : tensor<32x1xi32, #blocked1> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<32x1xi32, #blocked1> loc(#loc1) + %cst_3 = arith.constant dense<256> : tensor<32x1xi64, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<256> : tensor<32x1xi64, #blocked1> loc(#loc1) + %cst_5 = arith.constant dense<32> : tensor<32x1xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<32> : tensor<32x1xi32, #blocked1> loc(#loc1) + %cst_7 = arith.constant dense<128> : tensor<1x32xi32, #blocked> loc(#loc1) + %cst_8 = arith.constant dense<128> : tensor<1x32xi32, #blocked1> loc(#loc1) + %cst_9 = arith.constant dense<73728> : tensor<32x1xi32, #blocked> loc(#loc1) + %cst_10 = arith.constant dense<73728> : tensor<32x1xi32, #blocked1> loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<32x32xbf16, #blocked1> loc(#loc1) + %cst_12 = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #blocked> loc(#loc1) + %cst_13 = arith.constant dense<9.99999997E-7> : tensor<32x32xf32, #blocked> loc(#loc1) + %cst_14 = arith.constant dense<1.280000e+02> : tensor<32x32xf32, #blocked> loc(#loc1) + %cst_15 = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #blocked1> loc(#loc1) + %yoffset = tt.get_program_id y : i32 loc(#loc79) + %yoffset_16 = tt.get_program_id z : i32 loc(#loc80) + %yoffset_17 = tt.get_num_programs y : i32 loc(#loc81) + %yoffset_18 = arith.muli %yoffset_16, %yoffset_17 : i32 loc(#loc82) + %yoffset_19 = arith.addi %yoffset, %yoffset_18 : i32 loc(#loc83) + %yoffset_20 = arith.muli %yoffset_19, %c32_i32 : i32 loc(#loc84) + %yindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc85) + %yindex_21 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc85) + %yindex_22 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1xi32, #blocked1> loc(#loc85) + %yindex_23 = tt.expand_dims %yindex_21 {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> loc(#loc85) + %yindex_24 = tt.splat %yoffset_20 : i32 -> tensor<32x1xi32, #blocked1> loc(#loc86) + %yindex_25 = tt.splat %yoffset_20 : i32 -> tensor<32x1xi32, #blocked> loc(#loc86) + %yindex_26 = arith.addi %yindex_24, %yindex_22 : tensor<32x1xi32, #blocked1> loc(#loc86) + %yindex_27 = arith.addi %yindex_25, %yindex_23 : tensor<32x1xi32, #blocked> loc(#loc86) + %ymask = arith.cmpi slt, %yindex_26, %cst_10 : tensor<32x1xi32, #blocked1> loc(#loc87) + %ymask_28 = arith.cmpi slt, %yindex_27, %cst_9 : tensor<32x1xi32, #blocked> loc(#loc87) + %xoffset = tt.get_program_id x : i32 loc(#loc88) + %xoffset_29 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc89) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc90) + %xindex_30 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc90) + %xindex_31 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> loc(#loc90) + %xindex_32 = tt.expand_dims %xindex_30 {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> loc(#loc90) + %xindex_33 = tt.splat %xoffset_29 : i32 -> tensor<1x32xi32, #blocked1> loc(#loc91) + %xindex_34 = tt.splat %xoffset_29 : i32 -> tensor<1x32xi32, #blocked> loc(#loc91) + %xindex_35 = arith.addi %xindex_33, %xindex_31 : tensor<1x32xi32, #blocked1> loc(#loc91) + %xindex_36 = arith.addi %xindex_34, %xindex_32 : tensor<1x32xi32, #blocked> loc(#loc91) + %xmask = arith.cmpi slt, %xindex_35, %cst_8 : tensor<1x32xi32, #blocked1> loc(#loc92) + %xmask_37 = arith.cmpi slt, %xindex_36, %cst_7 : tensor<1x32xi32, #blocked> loc(#loc92) + %y1 = arith.divsi %yindex_26, %cst_6 : tensor<32x1xi32, #blocked1> loc(#loc93) + %y1_38 = arith.divsi %yindex_27, %cst_5 : tensor<32x1xi32, #blocked> loc(#loc93) + %y0 = arith.remsi %yindex_26, %cst_6 : tensor<32x1xi32, #blocked1> loc(#loc94) + %y0_39 = arith.remsi %yindex_27, %cst_5 : tensor<32x1xi32, #blocked> loc(#loc94) + %tmp4 = arith.extsi %y1 : tensor<32x1xi32, #blocked1> to tensor<32x1xi64, #blocked1> loc(#loc95) + %tmp4_40 = arith.extsi %y1_38 : tensor<32x1xi32, #blocked> to tensor<32x1xi64, #blocked> loc(#loc95) + %tmp4_41 = arith.cmpi slt, %tmp4, %cst_4 : tensor<32x1xi64, #blocked1> loc(#loc95) + %tmp4_42 = arith.cmpi slt, %tmp4_40, %cst_3 : tensor<32x1xi64, #blocked> loc(#loc95) + %tmp5 = arith.muli %y0, %cst_2 : tensor<32x1xi32, #blocked1> loc(#loc96) + %tmp5_43 = tt.broadcast %xindex_35 : tensor<1x32xi32, #blocked1> -> tensor<32x32xi32, #blocked1> loc(#loc97) + %tmp5_44 = tt.broadcast %tmp5 : tensor<32x1xi32, #blocked1> -> tensor<32x32xi32, #blocked1> loc(#loc97) + %tmp5_45 = arith.addi %tmp5_43, %tmp5_44 : tensor<32x32xi32, #blocked1> loc(#loc97) + %tmp5_46 = arith.muli %y1, %cst_1 : tensor<32x1xi32, #blocked1> loc(#loc98) + %tmp5_47 = tt.broadcast %tmp5_46 : tensor<32x1xi32, #blocked1> -> tensor<32x32xi32, #blocked1> loc(#loc99) + %tmp5_48 = arith.addi %tmp5_45, %tmp5_47 : tensor<32x32xi32, #blocked1> loc(#loc99) + %tmp5_49 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> loc(#loc100) + %tmp5_50 = tt.addptr %tmp5_49, %tmp5_48 : tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32xi32, #blocked1> loc(#loc100) + %tmp5_51 = tt.broadcast %tmp4_41 : tensor<32x1xi1, #blocked1> -> tensor<32x32xi1, #blocked1> loc(#loc101) + %tmp5_52 = tt.broadcast %tmp4_42 : tensor<32x1xi1, #blocked> -> tensor<32x32xi1, #blocked> loc(#loc101) + %tmp5_53 = tt.broadcast %xmask : tensor<1x32xi1, #blocked1> -> tensor<32x32xi1, #blocked1> loc(#loc101) + %tmp5_54 = tt.broadcast %xmask_37 : tensor<1x32xi1, #blocked> -> tensor<32x32xi1, #blocked> loc(#loc101) + %tmp5_55 = arith.andi %tmp5_51, %tmp5_53 : tensor<32x32xi1, #blocked1> loc(#loc101) + %tmp5_56 = arith.andi %tmp5_52, %tmp5_54 : tensor<32x32xi1, #blocked> loc(#loc101) + %tmp5_57 = tt.broadcast %ymask : tensor<32x1xi1, #blocked1> -> tensor<32x32xi1, #blocked1> loc(#loc102) + %tmp5_58 = tt.broadcast %ymask_28 : tensor<32x1xi1, #blocked> -> tensor<32x32xi1, #blocked> loc(#loc102) + %tmp5_59 = arith.andi %tmp5_55, %tmp5_57 : tensor<32x32xi1, #blocked1> loc(#loc102) + %tmp5_60 = arith.andi %tmp5_56, %tmp5_58 : tensor<32x32xi1, #blocked> loc(#loc102) + %tmp5_61 = tt.load %tmp5_50, %tmp5_59, %cst_11 evictionPolicy = evict_last : tensor<32x32x!tt.ptr, #blocked1> loc(#loc103) + %tmp5_62 = ttg.convert_layout %tmp5_61 : tensor<32x32xbf16, #blocked1> -> tensor<32x32xbf16, #blocked> loc(#loc104) + %tmp5_63 = arith.extf %tmp5_62 : tensor<32x32xbf16, #blocked> to tensor<32x32xf32, #blocked> loc(#loc104) + %tmp7 = arith.muli %y1_38, %cst_5 : tensor<32x1xi32, #blocked> loc(#loc105) + %tmp7_64 = arith.addi %y0_39, %tmp7 : tensor<32x1xi32, #blocked> loc(#loc106) + %tmp7_65 = tt.splat %in_ptr1 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> loc(#loc107) + %tmp7_66 = tt.addptr %tmp7_65, %tmp7_64 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> loc(#loc107) + %tmp7_67 = tt.broadcast %tmp7_66 : tensor<32x1x!tt.ptr, #blocked> -> tensor<32x32x!tt.ptr, #blocked> loc(#loc107) + %tmp7_68 = tt.load %tmp7_67, %tmp5_60, %cst_12 evictionPolicy = evict_last : tensor<32x32x!tt.ptr, #blocked> loc(#loc108) + %tmp9 = arith.divf %tmp7_68, %cst_14 : tensor<32x32xf32, #blocked> loc(#loc109) + %tmp11 = arith.addf %tmp9, %cst_13 : tensor<32x32xf32, #blocked> loc(#loc110) + %tmp12 = tt.extern_elementwise %tmp11 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<32x32xf32, #blocked>) -> tensor<32x32xf32, #blocked> loc(#loc111) + %tmp13 = arith.mulf %tmp5_63, %tmp12 : tensor<32x32xf32, #blocked> loc(#loc112) + %tmp13_69 = ttg.convert_layout %tmp13 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #blocked1> loc(#loc112) + %tmp14 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked1> loc(#loc113) + %tmp14_70 = tt.addptr %tmp14, %xindex_35 : tensor<1x32x!tt.ptr, #blocked1>, tensor<1x32xi32, #blocked1> loc(#loc113) + %tmp14_71 = tt.broadcast %tmp14_70 : tensor<1x32x!tt.ptr, #blocked1> -> tensor<32x32x!tt.ptr, #blocked1> loc(#loc113) + %tmp14_72 = tt.load %tmp14_71, %tmp5_59, %cst_11 evictionPolicy = evict_last : tensor<32x32x!tt.ptr, #blocked1> loc(#loc114) + %tmp14_73 = arith.extf %tmp14_72 : tensor<32x32xbf16, #blocked1> to tensor<32x32xf32, #blocked1> loc(#loc115) + %tmp16 = arith.mulf %tmp13_69, %tmp14_73 : tensor<32x32xf32, #blocked1> loc(#loc116) + %tmp20 = arith.cmpi sge, %tmp4, %cst_4 : tensor<32x1xi64, #blocked1> loc(#loc117) + %tmp20_74 = arith.cmpi sge, %tmp4_40, %cst_3 : tensor<32x1xi64, #blocked> loc(#loc117) + %tmp23 = arith.addi %y1, %cst_0 : tensor<32x1xi32, #blocked1> loc(#loc118) + %tmp23_75 = arith.addi %y1_38, %cst : tensor<32x1xi32, #blocked> loc(#loc118) + %tmp23_76 = arith.muli %tmp23, %cst_1 : tensor<32x1xi32, #blocked1> loc(#loc119) + %tmp23_77 = tt.broadcast %tmp23_76 : tensor<32x1xi32, #blocked1> -> tensor<32x32xi32, #blocked1> loc(#loc120) + %tmp23_78 = arith.addi %tmp5_45, %tmp23_77 : tensor<32x32xi32, #blocked1> loc(#loc120) + %tmp23_79 = tt.splat %in_ptr3 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> loc(#loc121) + %tmp23_80 = tt.addptr %tmp23_79, %tmp23_78 : tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32xi32, #blocked1> loc(#loc121) + %tmp23_81 = tt.broadcast %tmp20 : tensor<32x1xi1, #blocked1> -> tensor<32x32xi1, #blocked1> loc(#loc122) + %tmp23_82 = tt.broadcast %tmp20_74 : tensor<32x1xi1, #blocked> -> tensor<32x32xi1, #blocked> loc(#loc122) + %tmp23_83 = arith.andi %tmp23_81, %tmp5_53 : tensor<32x32xi1, #blocked1> loc(#loc122) + %tmp23_84 = arith.andi %tmp23_82, %tmp5_54 : tensor<32x32xi1, #blocked> loc(#loc122) + %tmp23_85 = arith.andi %tmp23_83, %tmp5_57 : tensor<32x32xi1, #blocked1> loc(#loc123) + %tmp23_86 = arith.andi %tmp23_84, %tmp5_58 : tensor<32x32xi1, #blocked> loc(#loc123) + %tmp23_87 = tt.load %tmp23_80, %tmp23_85, %cst_11 evictionPolicy = evict_last : tensor<32x32x!tt.ptr, #blocked1> loc(#loc124) + %tmp23_88 = ttg.convert_layout %tmp23_87 : tensor<32x32xbf16, #blocked1> -> tensor<32x32xbf16, #blocked> loc(#loc125) + %tmp23_89 = arith.extf %tmp23_88 : tensor<32x32xbf16, #blocked> to tensor<32x32xf32, #blocked> loc(#loc125) + %tmp25 = arith.muli %tmp23_75, %cst_5 : tensor<32x1xi32, #blocked> loc(#loc126) + %tmp25_90 = arith.addi %y0_39, %tmp25 : tensor<32x1xi32, #blocked> loc(#loc127) + %tmp25_91 = tt.splat %in_ptr4 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> loc(#loc128) + %tmp25_92 = tt.addptr %tmp25_91, %tmp25_90 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> loc(#loc128) + %tmp25_93 = tt.broadcast %tmp25_92 : tensor<32x1x!tt.ptr, #blocked> -> tensor<32x32x!tt.ptr, #blocked> loc(#loc128) + %tmp25_94 = tt.load %tmp25_93, %tmp23_86, %cst_12 evictionPolicy = evict_last : tensor<32x32x!tt.ptr, #blocked> loc(#loc129) + %tmp27 = arith.divf %tmp25_94, %cst_14 : tensor<32x32xf32, #blocked> loc(#loc130) + %tmp29 = arith.addf %tmp27, %cst_13 : tensor<32x32xf32, #blocked> loc(#loc131) + %tmp30 = tt.extern_elementwise %tmp29 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<32x32xf32, #blocked>) -> tensor<32x32xf32, #blocked> loc(#loc132) + %tmp31 = arith.mulf %tmp23_89, %tmp30 : tensor<32x32xf32, #blocked> loc(#loc133) + %tmp31_95 = ttg.convert_layout %tmp31 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #blocked1> loc(#loc133) + %tmp32 = tt.splat %in_ptr5 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked1> loc(#loc134) + %tmp32_96 = tt.addptr %tmp32, %xindex_35 : tensor<1x32x!tt.ptr, #blocked1>, tensor<1x32xi32, #blocked1> loc(#loc134) + %tmp32_97 = tt.broadcast %tmp32_96 : tensor<1x32x!tt.ptr, #blocked1> -> tensor<32x32x!tt.ptr, #blocked1> loc(#loc134) + %tmp32_98 = tt.load %tmp32_97, %tmp23_85, %cst_11 evictionPolicy = evict_last : tensor<32x32x!tt.ptr, #blocked1> loc(#loc135) + %tmp32_99 = arith.extf %tmp32_98 : tensor<32x32xbf16, #blocked1> to tensor<32x32xf32, #blocked1> loc(#loc136) + %tmp34 = arith.mulf %tmp31_95, %tmp32_99 : tensor<32x32xf32, #blocked1> loc(#loc137) + %tmp37 = arith.select %tmp23_81, %tmp34, %cst_15 : tensor<32x32xi1, #blocked1>, tensor<32x32xf32, #blocked1> loc(#loc138) + %tmp38 = arith.select %tmp5_51, %tmp16, %tmp37 : tensor<32x32xi1, #blocked1>, tensor<32x32xf32, #blocked1> loc(#loc141) + %0 = arith.muli %yindex_26, %cst_2 : tensor<32x1xi32, #blocked1> loc(#loc64) + %1 = tt.broadcast %0 : tensor<32x1xi32, #blocked1> -> tensor<32x32xi32, #blocked1> loc(#loc65) + %2 = arith.addi %tmp5_43, %1 : tensor<32x32xi32, #blocked1> loc(#loc65) + %3 = tt.splat %out_ptr0 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> loc(#loc66) + %4 = tt.addptr %3, %2 : tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32xi32, #blocked1> loc(#loc66) + %5 = arith.andi %tmp5_53, %tmp5_57 : tensor<32x32xi1, #blocked1> loc(#loc67) + %6 = arith.truncf %tmp38 : tensor<32x32xf32, #blocked1> to tensor<32x32xbf16, #blocked1> loc(#loc68) + tt.store %4, %6, %5 : tensor<32x32x!tt.ptr, #blocked1> loc(#loc68) + tt.return loc(#loc69) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4) +#loc79 = loc("yoffset"(#loc2)) +#loc80 = loc("yoffset"(#loc3)) +#loc81 = loc("yoffset"(#loc4)) +#loc82 = loc("yoffset"(#loc5)) +#loc83 = loc("yoffset"(#loc6)) +#loc84 = loc("yoffset"(#loc7)) +#loc85 = loc("yindex"(#loc8)) +#loc86 = loc("yindex"(#loc9)) +#loc87 = loc("ymask"(#loc10)) +#loc88 = loc("xoffset"(#loc11)) +#loc89 = loc("xoffset"(#loc12)) +#loc90 = loc("xindex"(#loc13)) +#loc91 = loc("xindex"(#loc14)) +#loc92 = loc("xmask"(#loc15)) +#loc93 = loc("y1"(#loc16)) +#loc94 = loc("y0"(#loc17)) +#loc95 = loc("tmp4"(#loc18)) +#loc96 = loc("tmp5"(#loc19)) +#loc97 = loc("tmp5"(#loc20)) +#loc98 = loc("tmp5"(#loc21)) +#loc99 = loc("tmp5"(#loc22)) +#loc100 = loc("tmp5"(#loc23)) +#loc101 = loc("tmp5"(#loc24)) +#loc102 = loc("tmp5"(#loc25)) +#loc103 = loc("tmp5"(#loc26)) +#loc104 = loc("tmp5"(#loc27)) +#loc105 = loc("tmp7"(#loc28)) +#loc106 = loc("tmp7"(#loc29)) +#loc107 = loc("tmp7"(#loc30)) +#loc108 = loc("tmp7"(#loc31)) +#loc109 = loc("tmp9"(#loc32)) +#loc110 = loc("tmp11"(#loc33)) +#loc111 = loc("tmp12"(#loc34)) +#loc112 = loc("tmp13"(#loc35)) +#loc113 = loc("tmp14"(#loc36)) +#loc114 = loc("tmp14"(#loc37)) +#loc115 = loc("tmp14"(#loc38)) +#loc116 = loc("tmp16"(#loc39)) +#loc117 = loc("tmp20"(#loc40)) +#loc118 = loc("tmp23"(#loc41)) +#loc119 = loc("tmp23"(#loc42)) +#loc120 = loc("tmp23"(#loc43)) +#loc121 = loc("tmp23"(#loc44)) +#loc122 = loc("tmp23"(#loc45)) +#loc123 = loc("tmp23"(#loc46)) +#loc124 = loc("tmp23"(#loc47)) +#loc125 = loc("tmp23"(#loc48)) +#loc126 = loc("tmp25"(#loc49)) +#loc127 = loc("tmp25"(#loc50)) +#loc128 = loc("tmp25"(#loc51)) +#loc129 = loc("tmp25"(#loc52)) +#loc130 = loc("tmp27"(#loc53)) +#loc131 = loc("tmp29"(#loc54)) +#loc132 = loc("tmp30"(#loc55)) +#loc133 = loc("tmp31"(#loc56)) +#loc134 = loc("tmp32"(#loc57)) +#loc135 = loc("tmp32"(#loc58)) +#loc136 = loc("tmp32"(#loc59)) +#loc137 = loc("tmp34"(#loc60)) +#loc138 = loc("tmp37"(#loc61)) +#loc139 = loc("tmp38"(#loc62)) +#loc140 = loc("tmp19"(#loc63)) +#loc141 = loc(fused[#loc139, #loc140]) diff --git a/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttir b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..f48e3f6109303d1ab499e3be6e39babdceba0dce --- /dev/null +++ b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttir @@ -0,0 +1,252 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0) +#loc71 = loc("in_ptr0"(#loc)) +#loc72 = loc("in_ptr1"(#loc)) +#loc73 = loc("in_ptr2"(#loc)) +#loc74 = loc("in_ptr3"(#loc)) +#loc75 = loc("in_ptr4"(#loc)) +#loc76 = loc("in_ptr5"(#loc)) +#loc77 = loc("out_ptr0"(#loc)) +#loc78 = loc("ynumel"(#loc)) +#loc79 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<32x32xbf16> loc(#loc1) + %cst_0 = arith.constant dense<-256> : tensor<32x1xi32> loc(#loc1) + %cst_1 = arith.constant dense<9.99999997E-7> : tensor<32x32xf32> loc(#loc1) + %cst_2 = arith.constant dense<1.280000e+02> : tensor<32x32xf32> loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<32x32xf32> loc(#loc1) + %cst_4 = arith.constant dense<12288> : tensor<32x1xi32> loc(#loc1) + %cst_5 = arith.constant dense<128> : tensor<32x1xi32> loc(#loc1) + %cst_6 = arith.constant dense<256> : tensor<32x1xi64> loc(#loc1) + %cst_7 = arith.constant dense<32> : tensor<32x1xi32> loc(#loc1) + %xmask = arith.constant dense<128> : tensor<1x32xi32> loc(#loc80) + %ymask = arith.constant dense<73728> : tensor<32x1xi32> loc(#loc81) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %yoffset = tt.get_program_id y : i32 loc(#loc82) + %yoffset_8 = tt.get_program_id z : i32 loc(#loc83) + %yoffset_9 = tt.get_num_programs y : i32 loc(#loc84) + %yoffset_10 = arith.muli %yoffset_8, %yoffset_9 : i32 loc(#loc85) + %yoffset_11 = arith.addi %yoffset, %yoffset_10 : i32 loc(#loc86) + %yoffset_12 = arith.muli %yoffset_11, %c32_i32 : i32 loc(#loc87) + %yindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc88) + %yindex_13 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc89) + %yindex_14 = tt.splat %yoffset_12 : i32 -> tensor<32x1xi32> loc(#loc90) + %yindex_15 = arith.addi %yindex_14, %yindex_13 : tensor<32x1xi32> loc(#loc90) + %ymask_16 = arith.cmpi slt, %yindex_15, %ymask : tensor<32x1xi32> loc(#loc81) + %xoffset = tt.get_program_id x : i32 loc(#loc91) + %xoffset_17 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc92) + %xindex = tt.expand_dims %yindex {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc93) + %xindex_18 = tt.splat %xoffset_17 : i32 -> tensor<1x32xi32> loc(#loc94) + %xindex_19 = arith.addi %xindex_18, %xindex : tensor<1x32xi32> loc(#loc94) + %xmask_20 = arith.cmpi slt, %xindex_19, %xmask : tensor<1x32xi32> loc(#loc80) + %y1 = arith.divsi %yindex_15, %cst_7 : tensor<32x1xi32> loc(#loc95) + %y0 = arith.remsi %yindex_15, %cst_7 : tensor<32x1xi32> loc(#loc96) + %tmp4 = arith.extsi %y1 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc97) + %tmp4_21 = arith.cmpi slt, %tmp4, %cst_6 : tensor<32x1xi64> loc(#loc97) + %tmp5 = arith.muli %y0, %cst_5 : tensor<32x1xi32> loc(#loc98) + %tmp5_22 = tt.broadcast %xindex_19 : tensor<1x32xi32> -> tensor<32x32xi32> loc(#loc99) + %tmp5_23 = tt.broadcast %tmp5 : tensor<32x1xi32> -> tensor<32x32xi32> loc(#loc99) + %tmp5_24 = arith.addi %tmp5_22, %tmp5_23 : tensor<32x32xi32> loc(#loc99) + %tmp5_25 = arith.muli %y1, %cst_4 : tensor<32x1xi32> loc(#loc100) + %tmp5_26 = tt.broadcast %tmp5_25 : tensor<32x1xi32> -> tensor<32x32xi32> loc(#loc101) + %tmp5_27 = arith.addi %tmp5_24, %tmp5_26 : tensor<32x32xi32> loc(#loc101) + %tmp5_28 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x32x!tt.ptr> loc(#loc102) + %tmp5_29 = tt.addptr %tmp5_28, %tmp5_27 : tensor<32x32x!tt.ptr>, tensor<32x32xi32> loc(#loc102) + %tmp5_30 = tt.broadcast %tmp4_21 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc103) + %tmp5_31 = tt.broadcast %xmask_20 : tensor<1x32xi1> -> tensor<32x32xi1> loc(#loc103) + %tmp5_32 = arith.andi %tmp5_30, %tmp5_31 : tensor<32x32xi1> loc(#loc103) + %tmp5_33 = tt.broadcast %ymask_16 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc104) + %tmp5_34 = arith.andi %tmp5_32, %tmp5_33 : tensor<32x32xi1> loc(#loc104) + %tmp5_35 = tt.load %tmp5_29, %tmp5_34, %cst evictionPolicy = evict_last : tensor<32x32x!tt.ptr> loc(#loc105) + %tmp5_36 = arith.extf %tmp5_35 : tensor<32x32xbf16> to tensor<32x32xf32> loc(#loc106) + %tmp7 = arith.muli %y1, %cst_7 : tensor<32x1xi32> loc(#loc107) + %tmp7_37 = arith.addi %y0, %tmp7 : tensor<32x1xi32> loc(#loc108) + %tmp7_38 = tt.splat %in_ptr1 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc109) + %tmp7_39 = tt.addptr %tmp7_38, %tmp7_37 : tensor<32x1x!tt.ptr>, tensor<32x1xi32> loc(#loc109) + %tmp7_40 = tt.broadcast %tmp7_39 : tensor<32x1x!tt.ptr> -> tensor<32x32x!tt.ptr> loc(#loc109) + %tmp7_41 = tt.load %tmp7_40, %tmp5_34, %cst_3 evictionPolicy = evict_last : tensor<32x32x!tt.ptr> loc(#loc110) + %tmp9 = arith.divf %tmp7_41, %cst_2 : tensor<32x32xf32> loc(#loc111) + %tmp11 = arith.addf %tmp9, %cst_1 : tensor<32x32xf32> loc(#loc112) + %tmp12 = tt.extern_elementwise %tmp11 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<32x32xf32>) -> tensor<32x32xf32> loc(#loc113) + %tmp13 = arith.mulf %tmp5_36, %tmp12 : tensor<32x32xf32> loc(#loc114) + %tmp14 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc115) + %tmp14_42 = tt.addptr %tmp14, %xindex_19 : tensor<1x32x!tt.ptr>, tensor<1x32xi32> loc(#loc115) + %tmp14_43 = tt.broadcast %tmp14_42 : tensor<1x32x!tt.ptr> -> tensor<32x32x!tt.ptr> loc(#loc115) + %tmp14_44 = tt.load %tmp14_43, %tmp5_34, %cst evictionPolicy = evict_last : tensor<32x32x!tt.ptr> loc(#loc116) + %tmp14_45 = arith.extf %tmp14_44 : tensor<32x32xbf16> to tensor<32x32xf32> loc(#loc117) + %tmp16 = arith.mulf %tmp13, %tmp14_45 : tensor<32x32xf32> loc(#loc118) + %tmp19 = arith.select %tmp5_30, %tmp16, %cst_3 : tensor<32x32xi1>, tensor<32x32xf32> loc(#loc119) + %tmp20 = arith.cmpi sge, %tmp4, %cst_6 : tensor<32x1xi64> loc(#loc120) + %tmp23 = arith.addi %y1, %cst_0 : tensor<32x1xi32> loc(#loc121) + %tmp23_46 = arith.muli %tmp23, %cst_4 : tensor<32x1xi32> loc(#loc122) + %tmp23_47 = tt.broadcast %tmp23_46 : tensor<32x1xi32> -> tensor<32x32xi32> loc(#loc123) + %tmp23_48 = arith.addi %tmp5_24, %tmp23_47 : tensor<32x32xi32> loc(#loc123) + %tmp23_49 = tt.splat %in_ptr3 : !tt.ptr -> tensor<32x32x!tt.ptr> loc(#loc124) + %tmp23_50 = tt.addptr %tmp23_49, %tmp23_48 : tensor<32x32x!tt.ptr>, tensor<32x32xi32> loc(#loc124) + %tmp23_51 = tt.broadcast %tmp20 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc125) + %tmp23_52 = arith.andi %tmp23_51, %tmp5_31 : tensor<32x32xi1> loc(#loc125) + %tmp23_53 = arith.andi %tmp23_52, %tmp5_33 : tensor<32x32xi1> loc(#loc126) + %tmp23_54 = tt.load %tmp23_50, %tmp23_53, %cst evictionPolicy = evict_last : tensor<32x32x!tt.ptr> loc(#loc127) + %tmp23_55 = arith.extf %tmp23_54 : tensor<32x32xbf16> to tensor<32x32xf32> loc(#loc128) + %tmp25 = arith.muli %tmp23, %cst_7 : tensor<32x1xi32> loc(#loc129) + %tmp25_56 = arith.addi %y0, %tmp25 : tensor<32x1xi32> loc(#loc130) + %tmp25_57 = tt.splat %in_ptr4 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc131) + %tmp25_58 = tt.addptr %tmp25_57, %tmp25_56 : tensor<32x1x!tt.ptr>, tensor<32x1xi32> loc(#loc131) + %tmp25_59 = tt.broadcast %tmp25_58 : tensor<32x1x!tt.ptr> -> tensor<32x32x!tt.ptr> loc(#loc131) + %tmp25_60 = tt.load %tmp25_59, %tmp23_53, %cst_3 evictionPolicy = evict_last : tensor<32x32x!tt.ptr> loc(#loc132) + %tmp27 = arith.divf %tmp25_60, %cst_2 : tensor<32x32xf32> loc(#loc133) + %tmp29 = arith.addf %tmp27, %cst_1 : tensor<32x32xf32> loc(#loc134) + %tmp30 = tt.extern_elementwise %tmp29 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<32x32xf32>) -> tensor<32x32xf32> loc(#loc135) + %tmp31 = arith.mulf %tmp23_55, %tmp30 : tensor<32x32xf32> loc(#loc136) + %tmp32 = tt.splat %in_ptr5 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc137) + %tmp32_61 = tt.addptr %tmp32, %xindex_19 : tensor<1x32x!tt.ptr>, tensor<1x32xi32> loc(#loc137) + %tmp32_62 = tt.broadcast %tmp32_61 : tensor<1x32x!tt.ptr> -> tensor<32x32x!tt.ptr> loc(#loc137) + %tmp32_63 = tt.load %tmp32_62, %tmp23_53, %cst evictionPolicy = evict_last : tensor<32x32x!tt.ptr> loc(#loc138) + %tmp32_64 = arith.extf %tmp32_63 : tensor<32x32xbf16> to tensor<32x32xf32> loc(#loc139) + %tmp34 = arith.mulf %tmp31, %tmp32_64 : tensor<32x32xf32> loc(#loc140) + %tmp37 = arith.select %tmp23_51, %tmp34, %cst_3 : tensor<32x32xi1>, tensor<32x32xf32> loc(#loc141) + %tmp38 = arith.select %tmp5_30, %tmp19, %tmp37 : tensor<32x32xi1>, tensor<32x32xf32> loc(#loc142) + %0 = arith.muli %yindex_15, %cst_5 : tensor<32x1xi32> loc(#loc65) + %1 = tt.broadcast %0 : tensor<32x1xi32> -> tensor<32x32xi32> loc(#loc66) + %2 = arith.addi %tmp5_22, %1 : tensor<32x32xi32> loc(#loc66) + %3 = tt.splat %out_ptr0 : !tt.ptr -> tensor<32x32x!tt.ptr> loc(#loc67) + %4 = tt.addptr %3, %2 : tensor<32x32x!tt.ptr>, tensor<32x32xi32> loc(#loc67) + %5 = arith.andi %tmp5_31, %tmp5_33 : tensor<32x32xi1> loc(#loc68) + %6 = arith.truncf %tmp38 : tensor<32x32xf32> to tensor<32x32xbf16> loc(#loc69) + tt.store %4, %6, %5 : tensor<32x32x!tt.ptr> loc(#loc69) + tt.return loc(#loc70) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:36) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4) +#loc80 = loc("xmask"(#loc2)) +#loc81 = loc("ymask"(#loc3)) +#loc82 = loc("yoffset"(#loc4)) +#loc83 = loc("yoffset"(#loc5)) +#loc84 = loc("yoffset"(#loc6)) +#loc85 = loc("yoffset"(#loc7)) +#loc86 = loc("yoffset"(#loc8)) +#loc87 = loc("yoffset"(#loc9)) +#loc88 = loc("yindex"(#loc10)) +#loc89 = loc("yindex"(#loc11)) +#loc90 = loc("yindex"(#loc12)) +#loc91 = loc("xoffset"(#loc13)) +#loc92 = loc("xoffset"(#loc14)) +#loc93 = loc("xindex"(#loc15)) +#loc94 = loc("xindex"(#loc16)) +#loc95 = loc("y1"(#loc17)) +#loc96 = loc("y0"(#loc18)) +#loc97 = loc("tmp4"(#loc19)) +#loc98 = loc("tmp5"(#loc20)) +#loc99 = loc("tmp5"(#loc21)) +#loc100 = loc("tmp5"(#loc22)) +#loc101 = loc("tmp5"(#loc23)) +#loc102 = loc("tmp5"(#loc24)) +#loc103 = loc("tmp5"(#loc25)) +#loc104 = loc("tmp5"(#loc26)) +#loc105 = loc("tmp5"(#loc27)) +#loc106 = loc("tmp5"(#loc28)) +#loc107 = loc("tmp7"(#loc29)) +#loc108 = loc("tmp7"(#loc30)) +#loc109 = loc("tmp7"(#loc31)) +#loc110 = loc("tmp7"(#loc32)) +#loc111 = loc("tmp9"(#loc33)) +#loc112 = loc("tmp11"(#loc34)) +#loc113 = loc("tmp12"(#loc35)) +#loc114 = loc("tmp13"(#loc36)) +#loc115 = loc("tmp14"(#loc37)) +#loc116 = loc("tmp14"(#loc38)) +#loc117 = loc("tmp14"(#loc39)) +#loc118 = loc("tmp16"(#loc40)) +#loc119 = loc("tmp19"(#loc41)) +#loc120 = loc("tmp20"(#loc42)) +#loc121 = loc("tmp23"(#loc43)) +#loc122 = loc("tmp23"(#loc44)) +#loc123 = loc("tmp23"(#loc45)) +#loc124 = loc("tmp23"(#loc46)) +#loc125 = loc("tmp23"(#loc47)) +#loc126 = loc("tmp23"(#loc48)) +#loc127 = loc("tmp23"(#loc49)) +#loc128 = loc("tmp23"(#loc50)) +#loc129 = loc("tmp25"(#loc51)) +#loc130 = loc("tmp25"(#loc52)) +#loc131 = loc("tmp25"(#loc53)) +#loc132 = loc("tmp25"(#loc54)) +#loc133 = loc("tmp27"(#loc55)) +#loc134 = loc("tmp29"(#loc56)) +#loc135 = loc("tmp30"(#loc57)) +#loc136 = loc("tmp31"(#loc58)) +#loc137 = loc("tmp32"(#loc59)) +#loc138 = loc("tmp32"(#loc60)) +#loc139 = loc("tmp32"(#loc61)) +#loc140 = loc("tmp34"(#loc62)) +#loc141 = loc("tmp37"(#loc63)) +#loc142 = loc("tmp38"(#loc64)) diff --git a/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/__grp__triton_poi_fused_clone_permute_2.json b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/__grp__triton_poi_fused_clone_permute_2.json new file mode 100644 index 0000000000000000000000000000000000000000..903c72293c624aecc79f6067774535d1dabfb681 --- /dev/null +++ b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/__grp__triton_poi_fused_clone_permute_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_clone_permute_2.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.source", "triton_poi_fused_clone_permute_2.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.ttir", "triton_poi_fused_clone_permute_2.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.ttgir", "triton_poi_fused_clone_permute_2.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.llir", "triton_poi_fused_clone_permute_2.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.ptx", "triton_poi_fused_clone_permute_2.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.cubin", "triton_poi_fused_clone_permute_2.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.json"}} \ No newline at end of file diff --git a/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.cubin b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..2437c4b29a514d6b45a8e6011640ad0e2c299a56 Binary files /dev/null and b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.cubin differ diff --git a/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.json b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4145f3013225748d5438e23ca6d7a9884e6c61d5 --- /dev/null +++ b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.json @@ -0,0 +1 @@ +{"hash": "59acd44dc9397a3ccb49e49a906a22d576d15ba8ef13c528fff358a0f5f3f39f", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_clone_permute_2"} \ No newline at end of file diff --git a/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.llir b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..caaf59d1e6aefc0aea76717363909c38e62a6758 --- /dev/null +++ b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.llir @@ -0,0 +1,71 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_clone_permute_2(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %7 = shl i32 %6, 10, !dbg !8 + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %9 = shl nuw nsw i32 %8, 3, !dbg !9 + %10 = and i32 %9, 1016, !dbg !9 + %11 = or disjoint i32 %10, %7, !dbg !10 + %12 = sdiv i32 %11, 128, !dbg !11 + %13 = mul i32 %12, 128, !dbg !12 + %.decomposed = sub i32 %11, %13, !dbg !12 + %14 = srem i32 %12, 32, !dbg !13 + %15 = sdiv i32 %11, 4096, !dbg !14 + %16 = shl nsw i32 %15, 7, !dbg !15 + %17 = add nsw i32 %16, %.decomposed, !dbg !16 + %18 = mul nsw i32 %14, 294912, !dbg !17 + %19 = add nsw i32 %17, %18, !dbg !18 + %20 = sext i32 %19 to i64, !dbg !19 + %21 = getelementptr bfloat, ptr addrspace(1) %0, i64 %20, !dbg !19 + %22 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %21) #2, !dbg !20 + %23 = extractvalue { i32, i32, i32, i32 } %22, 0, !dbg !20 + %24 = extractvalue { i32, i32, i32, i32 } %22, 1, !dbg !20 + %25 = extractvalue { i32, i32, i32, i32 } %22, 2, !dbg !20 + %26 = extractvalue { i32, i32, i32, i32 } %22, 3, !dbg !20 + %27 = sext i32 %11 to i64, !dbg !21 + %28 = getelementptr bfloat, ptr addrspace(1) %1, i64 %27, !dbg !21 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %23, i32 %24, i32 %25, i32 %26, ptr addrspace(1) %28) #2, !dbg !22 + ret void, !dbg !23 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_clone_permute_2", linkageName: "triton_poi_fused_clone_permute_2", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 21, scope: !4) +!12 = !DILocation(line: 23, column: 19, scope: !4) +!13 = !DILocation(line: 24, column: 28, scope: !4) +!14 = !DILocation(line: 25, column: 19, scope: !4) +!15 = !DILocation(line: 27, column: 39, scope: !4) +!16 = !DILocation(line: 27, column: 35, scope: !4) +!17 = !DILocation(line: 27, column: 51, scope: !4) +!18 = !DILocation(line: 27, column: 44, scope: !4) +!19 = !DILocation(line: 27, column: 30, scope: !4) +!20 = !DILocation(line: 27, column: 56, scope: !4) +!21 = !DILocation(line: 28, column: 25, scope: !4) +!22 = !DILocation(line: 28, column: 36, scope: !4) +!23 = !DILocation(line: 28, column: 4, scope: !4) diff --git a/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.ptx b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..d7aee7940e0b44591a40af1b2d2c6128c743133f --- /dev/null +++ b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.ptx @@ -0,0 +1,327 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused_clone_permute_2 // -- Begin function triton_poi_fused_clone_permute_2 + // @triton_poi_fused_clone_permute_2 +.visible .entry triton_poi_fused_clone_permute_2( + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_2_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_2_param_1, + .param .u32 triton_poi_fused_clone_permute_2_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_2_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_2_param_4 +) +.reqntid 128 +{ + .reg .b32 %r<27>; + .reg .b64 %rd<5>; + .loc 1 18 0 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:18:0 + +// %bb.0: + ld.param.b64 %rd3, [triton_poi_fused_clone_permute_2_param_0]; + ld.param.b64 %rd4, [triton_poi_fused_clone_permute_2_param_1]; +$L__tmp0: + .loc 1 20 28 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:20:28 + mov.u32 %r5, %ctaid.x; + .loc 1 20 33 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:20:33 + shl.b32 %r6, %r5, 10; + .loc 1 21 36 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:21:36 + mov.u32 %r7, %tid.x; + shl.b32 %r8, %r7, 3; + and.b32 %r9, %r8, 1016; + .loc 1 21 23 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:21:23 + or.b32 %r10, %r9, %r6; + .loc 1 24 21 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:24:21 + bfe.s32 %r11, %r5, 21, 1; + shr.u32 %r12, %r11, 25; + add.s32 %r13, %r10, %r12; + shr.s32 %r14, %r13, 7; + .loc 1 23 19 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:23:19 + and.b32 %r15, %r13, -128; + sub.s32 %r16, %r10, %r15; + .loc 1 24 28 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:24:28 + shr.u32 %r17, %r14, 27; + add.s32 %r18, %r14, %r17; + and.b32 %r19, %r18, 131040; + sub.s32 %r20, %r14, %r19; + .loc 1 25 19 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:25:19 + shr.u32 %r21, %r11, 20; + add.s32 %r22, %r10, %r21; + shr.s32 %r23, %r22, 12; + .loc 1 27 39 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:27:39 + shl.b32 %r24, %r23, 7; + .loc 1 27 35 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:27:35 + add.s32 %r25, %r24, %r16; + .loc 1 27 44 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:27:44 + mad.lo.s32 %r26, %r20, 294912, %r25; + .loc 1 27 30 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:27:30 + mad.wide.s32 %rd1, %r26, 2, %rd3; + .loc 1 27 56 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:27:56 + // begin inline asm + mov.u32 %r1, 0x0; + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + ld.global.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ]; + // end inline asm + .loc 1 28 25 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:28:25 + mad.wide.s32 %rd2, %r10, 2, %rd4; + .loc 1 28 36 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:28:36 + // begin inline asm + st.global.v4.b32 [ %rd2 + 0 ], { %r1, %r2, %r3, %r4 }; + // end inline asm + .loc 1 28 4 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:28:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 106 +.b8 52 +.b8 55 +.b8 118 +.b8 122 +.b8 50 +.b8 117 +.b8 55 +.b8 105 +.b8 51 +.b8 116 +.b8 104 +.b8 53 +.b8 51 +.b8 99 +.b8 102 +.b8 50 +.b8 101 +.b8 108 +.b8 99 +.b8 53 +.b8 102 +.b8 105 +.b8 121 +.b8 108 +.b8 118 +.b8 121 +.b8 107 +.b8 55 +.b8 111 +.b8 51 +.b8 110 +.b8 105 +.b8 50 +.b8 112 +.b8 110 +.b8 52 +.b8 99 +.b8 50 +.b8 98 +.b8 100 +.b8 100 +.b8 114 +.b8 122 +.b8 113 +.b8 53 +.b8 106 +.b8 110 +.b8 117 +.b8 110 +.b8 113 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 106 +.b8 52 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.source b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.source new file mode 100644 index 0000000000000000000000000000000000000000..a4f167f87bf44ebdc32b58d5074809003c95ca3b --- /dev/null +++ b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.source @@ -0,0 +1,90 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":18:0) +#loc21 = loc("in_ptr0"(#loc)) +#loc22 = loc("out_ptr0"(#loc)) +#loc23 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_clone_permute_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 9437184 : i32 loc(#loc24) + %xoffset = tt.get_program_id x : i32 loc(#loc25) + %xoffset_1 = arith.constant 1024 : i32 loc(#loc26) + %xoffset_2 = arith.constant 1024 : i32 loc(#loc26) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc26) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc27) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc28) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc28) + %xmask = arith.constant true loc(#loc29) + %xmask_6 = arith.constant dense : tensor<1024xi1> loc(#loc29) + %x0 = arith.constant 128 : i32 loc(#loc30) + %x0_7 = arith.constant 128 : i32 loc(#loc30) + %x0_8 = arith.constant dense<128> : tensor<1024xi32> loc(#loc30) + %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<1024xi32> loc(#loc30) + %x1 = arith.constant 128 : i32 loc(#loc31) + %x1_10 = arith.constant 128 : i32 loc(#loc31) + %x1_11 = arith.constant dense<128> : tensor<1024xi32> loc(#loc31) + %x1_12 = arith.divsi %xindex_5, %x1_11 : tensor<1024xi32> loc(#loc31) + %x1_13 = arith.constant 32 : i32 loc(#loc32) + %x1_14 = arith.constant 32 : i32 loc(#loc32) + %x1_15 = arith.constant dense<32> : tensor<1024xi32> loc(#loc32) + %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<1024xi32> loc(#loc32) + %x2 = arith.constant 4096 : i32 loc(#loc33) + %x2_17 = arith.constant 4096 : i32 loc(#loc33) + %x2_18 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc33) + %x2_19 = arith.divsi %xindex_5, %x2_18 : tensor<1024xi32> loc(#loc33) + %tmp0 = arith.constant 128 : i32 loc(#loc34) + %tmp0_20 = arith.constant 128 : i32 loc(#loc34) + %tmp0_21 = arith.constant dense<128> : tensor<1024xi32> loc(#loc34) + %tmp0_22 = arith.muli %tmp0_21, %x2_19 : tensor<1024xi32> loc(#loc34) + %tmp0_23 = arith.addi %x0_9, %tmp0_22 : tensor<1024xi32> loc(#loc35) + %tmp0_24 = arith.constant 294912 : i32 loc(#loc36) + %tmp0_25 = arith.constant 294912 : i32 loc(#loc36) + %tmp0_26 = arith.constant dense<294912> : tensor<1024xi32> loc(#loc36) + %tmp0_27 = arith.muli %tmp0_26, %x1_16 : tensor<1024xi32> loc(#loc36) + %tmp0_28 = arith.addi %tmp0_23, %tmp0_27 : tensor<1024xi32> loc(#loc37) + %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc38) + %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc38) + %tmp0_31 = tt.load %tmp0_30 : tensor<1024x!tt.ptr> loc(#loc39) + %tmp0_32 = arith.extf %tmp0_31 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc40) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc18) + %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc18) + %2 = arith.truncf %tmp0_32 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc19) + tt.store %1, %2 : tensor<1024x!tt.ptr> loc(#loc19) + tt.return loc(#loc20) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":22:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":23:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":24:21) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":24:28) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":25:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:39) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:35) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:51) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:44) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:30) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:56) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:65) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:25) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:36) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:4) +#loc24 = loc("xnumel"(#loc1)) +#loc25 = loc("xoffset"(#loc2)) +#loc26 = loc("xoffset"(#loc3)) +#loc27 = loc("xindex"(#loc4)) +#loc28 = loc("xindex"(#loc5)) +#loc29 = loc("xmask"(#loc6)) +#loc30 = loc("x0"(#loc7)) +#loc31 = loc("x1"(#loc8)) +#loc32 = loc("x1"(#loc9)) +#loc33 = loc("x2"(#loc10)) +#loc34 = loc("tmp0"(#loc11)) +#loc35 = loc("tmp0"(#loc12)) +#loc36 = loc("tmp0"(#loc13)) +#loc37 = loc("tmp0"(#loc14)) +#loc38 = loc("tmp0"(#loc15)) +#loc39 = loc("tmp0"(#loc16)) +#loc40 = loc("tmp0"(#loc17)) diff --git a/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.ttgir b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..a263fc90fc68e8620403d752aaae32960c71c163 --- /dev/null +++ b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.ttgir @@ -0,0 +1,66 @@ +#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":18:0) +#loc19 = loc("in_ptr0"(#loc)) +#loc20 = loc("out_ptr0"(#loc)) +#loc21 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_clone_permute_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1024xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<32> : tensor<1024xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<1024xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<294912> : tensor<1024xi32, #blocked> loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc22) + %xoffset_3 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc23) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc24) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32, #blocked> loc(#loc25) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32, #blocked> loc(#loc25) + %x0 = arith.remsi %xindex_5, %cst : tensor<1024xi32, #blocked> loc(#loc26) + %x1 = arith.divsi %xindex_5, %cst : tensor<1024xi32, #blocked> loc(#loc27) + %x1_6 = arith.remsi %x1, %cst_0 : tensor<1024xi32, #blocked> loc(#loc28) + %x2 = arith.divsi %xindex_5, %cst_1 : tensor<1024xi32, #blocked> loc(#loc29) + %tmp0 = arith.muli %x2, %cst : tensor<1024xi32, #blocked> loc(#loc30) + %tmp0_7 = arith.addi %x0, %tmp0 : tensor<1024xi32, #blocked> loc(#loc31) + %tmp0_8 = arith.muli %x1_6, %cst_2 : tensor<1024xi32, #blocked> loc(#loc32) + %tmp0_9 = arith.addi %tmp0_7, %tmp0_8 : tensor<1024xi32, #blocked> loc(#loc33) + %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc34) + %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc34) + %tmp0_12 = tt.load %tmp0_11 : tensor<1024x!tt.ptr, #blocked> loc(#loc35) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc16) + %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc16) + tt.store %1, %tmp0_12 : tensor<1024x!tt.ptr, #blocked> loc(#loc17) + tt.return loc(#loc18) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":23:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":24:21) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":24:28) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":25:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:39) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:51) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:44) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:56) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:25) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:36) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:4) +#loc22 = loc("xoffset"(#loc2)) +#loc23 = loc("xoffset"(#loc3)) +#loc24 = loc("xindex"(#loc4)) +#loc25 = loc("xindex"(#loc5)) +#loc26 = loc("x0"(#loc6)) +#loc27 = loc("x1"(#loc7)) +#loc28 = loc("x1"(#loc8)) +#loc29 = loc("x2"(#loc9)) +#loc30 = loc("tmp0"(#loc10)) +#loc31 = loc("tmp0"(#loc11)) +#loc32 = loc("tmp0"(#loc12)) +#loc33 = loc("tmp0"(#loc13)) +#loc34 = loc("tmp0"(#loc14)) +#loc35 = loc("tmp0"(#loc15)) diff --git a/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.ttir b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..91cbecaea485ac121c1028ec64dfaeaab96f36be --- /dev/null +++ b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.ttir @@ -0,0 +1,65 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":18:0) +#loc19 = loc("in_ptr0"(#loc)) +#loc20 = loc("out_ptr0"(#loc)) +#loc21 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_clone_permute_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %tmp0 = arith.constant dense<294912> : tensor<1024xi32> loc(#loc22) + %x2 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc23) + %x1 = arith.constant dense<32> : tensor<1024xi32> loc(#loc24) + %cst = arith.constant dense<128> : tensor<1024xi32> loc(#loc4) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc4) + %xoffset = tt.get_program_id x : i32 loc(#loc25) + %xoffset_0 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc26) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc27) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<1024xi32> loc(#loc28) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<1024xi32> loc(#loc28) + %x0 = arith.remsi %xindex_2, %cst : tensor<1024xi32> loc(#loc29) + %x1_3 = arith.divsi %xindex_2, %cst : tensor<1024xi32> loc(#loc30) + %x1_4 = arith.remsi %x1_3, %x1 : tensor<1024xi32> loc(#loc24) + %x2_5 = arith.divsi %xindex_2, %x2 : tensor<1024xi32> loc(#loc23) + %tmp0_6 = arith.muli %x2_5, %cst : tensor<1024xi32> loc(#loc31) + %tmp0_7 = arith.addi %x0, %tmp0_6 : tensor<1024xi32> loc(#loc32) + %tmp0_8 = arith.muli %x1_4, %tmp0 : tensor<1024xi32> loc(#loc22) + %tmp0_9 = arith.addi %tmp0_7, %tmp0_8 : tensor<1024xi32> loc(#loc33) + %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc34) + %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc34) + %tmp0_12 = tt.load %tmp0_11 : tensor<1024x!tt.ptr> loc(#loc35) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc16) + %1 = tt.addptr %0, %xindex_2 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc16) + tt.store %1, %tmp0_12 : tensor<1024x!tt.ptr> loc(#loc17) + tt.return loc(#loc18) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:51) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":25:19) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":24:28) +#loc4 = loc(unknown) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":20:28) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":20:33) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":21:36) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":21:23) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":23:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":24:21) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:39) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:35) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:44) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:56) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:25) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:36) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:4) +#loc22 = loc("tmp0"(#loc1)) +#loc23 = loc("x2"(#loc2)) +#loc24 = loc("x1"(#loc3)) +#loc25 = loc("xoffset"(#loc5)) +#loc26 = loc("xoffset"(#loc6)) +#loc27 = loc("xindex"(#loc7)) +#loc28 = loc("xindex"(#loc8)) +#loc29 = loc("x0"(#loc9)) +#loc30 = loc("x1"(#loc10)) +#loc31 = loc("tmp0"(#loc11)) +#loc32 = loc("tmp0"(#loc12)) +#loc33 = loc("tmp0"(#loc13)) +#loc34 = loc("tmp0"(#loc14)) +#loc35 = loc("tmp0"(#loc15)) diff --git a/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json new file mode 100644 index 0000000000000000000000000000000000000000..569dcdf957ffed86f3be8853d17d08b5fc2b4bd3 --- /dev/null +++ b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_add_mul_native_layer_norm_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.source", "triton_red_fused_add_mul_native_layer_norm_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.ttir", "triton_red_fused_add_mul_native_layer_norm_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir", "triton_red_fused_add_mul_native_layer_norm_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.llir", "triton_red_fused_add_mul_native_layer_norm_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.ptx", "triton_red_fused_add_mul_native_layer_norm_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.cubin", "triton_red_fused_add_mul_native_layer_norm_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.json"}} \ No newline at end of file diff --git a/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.cubin b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..880e547d63b4727dc6ca3e49e07795249ebac9a2 Binary files /dev/null and b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.cubin differ diff --git a/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.json b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.json new file mode 100644 index 0000000000000000000000000000000000000000..67bd342c52ef8d3d0c858b23a4ec778f075123a7 --- /dev/null +++ b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.json @@ -0,0 +1 @@ +{"hash": "6471e8f3875bf89a78d7edcf8a51a68796b0c540783d96a5f1c61d210b2fb01f", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 192, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_add_mul_native_layer_norm_0"} \ No newline at end of file diff --git a/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.llir b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..4d8c7a489efd29644f0f5a29ec68874c4a81affc --- /dev/null +++ b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.llir @@ -0,0 +1,547 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_add_mul_native_layer_norm_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !5 { +__nv_rsqrtf.exit: + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %9 = icmp samesign ult i32 %8, 2048, !dbg !9 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %11 = and i32 %10, 511, !dbg !10 + %12 = and i32 %10, 31, !dbg !10 + %13 = lshr i32 %11, 5, !dbg !10 + %14 = shl nuw nsw i32 %10, 3, !dbg !10 + %15 = and i32 %14, 4088, !dbg !10 + %16 = shl i32 %8, 12, !dbg !11 + %17 = or disjoint i32 %15, %16, !dbg !12 + %18 = sext i32 %17 to i64, !dbg !13 + %19 = getelementptr bfloat, ptr addrspace(1) %0, i64 %18, !dbg !13 + %20 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %21 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %19, i64 %20, i1 %9) #6, !dbg !14 + %22 = extractvalue { i32, i32, i32, i32 } %21, 0, !dbg !14 + %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !14 + %24 = extractvalue { i32, i32, i32, i32 } %21, 1, !dbg !14 + %25 = bitcast i32 %24 to <2 x bfloat>, !dbg !14 + %26 = extractvalue { i32, i32, i32, i32 } %21, 2, !dbg !14 + %27 = bitcast i32 %26 to <2 x bfloat>, !dbg !14 + %28 = extractvalue { i32, i32, i32, i32 } %21, 3, !dbg !14 + %29 = bitcast i32 %28 to <2 x bfloat>, !dbg !14 + %30 = extractelement <2 x bfloat> %23, i64 0, !dbg !14 + %31 = extractelement <2 x bfloat> %23, i64 1, !dbg !14 + %32 = extractelement <2 x bfloat> %25, i64 0, !dbg !14 + %33 = extractelement <2 x bfloat> %25, i64 1, !dbg !14 + %34 = extractelement <2 x bfloat> %27, i64 0, !dbg !14 + %35 = extractelement <2 x bfloat> %27, i64 1, !dbg !14 + %36 = extractelement <2 x bfloat> %29, i64 0, !dbg !14 + %37 = extractelement <2 x bfloat> %29, i64 1, !dbg !14 + %38 = fpext bfloat %30 to float, !dbg !15 + %39 = fpext bfloat %31 to float, !dbg !15 + %40 = fpext bfloat %32 to float, !dbg !15 + %41 = fpext bfloat %33 to float, !dbg !15 + %42 = fpext bfloat %34 to float, !dbg !15 + %43 = fpext bfloat %35 to float, !dbg !15 + %44 = fpext bfloat %36 to float, !dbg !15 + %45 = fpext bfloat %37 to float, !dbg !15 + %46 = select i1 %9, float %38, float 0.000000e+00, !dbg !16 + %47 = select i1 %9, float %39, float 0.000000e+00, !dbg !16 + %48 = select i1 %9, float %40, float 0.000000e+00, !dbg !16 + %49 = select i1 %9, float %41, float 0.000000e+00, !dbg !16 + %50 = select i1 %9, float %42, float 0.000000e+00, !dbg !16 + %51 = select i1 %9, float %43, float 0.000000e+00, !dbg !16 + %52 = select i1 %9, float %44, float 0.000000e+00, !dbg !16 + %53 = select i1 %9, float %45, float 0.000000e+00, !dbg !16 + %54 = select i1 %9, float 1.000000e+00, float 0.000000e+00, !dbg !17 + %55 = fsub float %47, %46, !dbg !18 + %56 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !24 + %57 = fcmp oeq float %56, 0.000000e+00, !dbg !25 + %58 = tail call float @llvm.nvvm.div.full(float %54, float %56), !dbg !26 + %59 = select i1 %57, float 0.000000e+00, float %58, !dbg !27 + %60 = fmul float %59, %55, !dbg !28 + %61 = fadd float %46, %60, !dbg !29 + %62 = fmul float %55, %55, !dbg !30 + %63 = fmul float %54, %62, !dbg !31 + %64 = fmul float %59, %63, !dbg !32 + %65 = fadd float %64, 0.000000e+00, !dbg !33 + %66 = fsub float %48, %61, !dbg !18 + %67 = select i1 %9, float 3.000000e+00, float 0.000000e+00, !dbg !24 + %68 = fcmp oeq float %67, 0.000000e+00, !dbg !25 + %69 = tail call float @llvm.nvvm.div.full(float %54, float %67), !dbg !26 + %70 = select i1 %68, float 0.000000e+00, float %69, !dbg !27 + %71 = fmul float %70, %66, !dbg !28 + %72 = fadd float %61, %71, !dbg !29 + %73 = fmul float %66, %66, !dbg !30 + %74 = fmul float %56, %73, !dbg !31 + %75 = fmul float %70, %74, !dbg !32 + %76 = fadd float %65, %75, !dbg !33 + %77 = fsub float %49, %72, !dbg !18 + %78 = select i1 %9, float 4.000000e+00, float 0.000000e+00, !dbg !24 + %79 = fcmp oeq float %78, 0.000000e+00, !dbg !25 + %80 = tail call float @llvm.nvvm.div.full(float %54, float %78), !dbg !26 + %81 = select i1 %79, float 0.000000e+00, float %80, !dbg !27 + %82 = fmul float %81, %77, !dbg !28 + %83 = fadd float %72, %82, !dbg !29 + %84 = fmul float %77, %77, !dbg !30 + %85 = fmul float %67, %84, !dbg !31 + %86 = fmul float %81, %85, !dbg !32 + %87 = fadd float %76, %86, !dbg !33 + %88 = fsub float %50, %83, !dbg !18 + %89 = select i1 %9, float 5.000000e+00, float 0.000000e+00, !dbg !24 + %90 = fcmp oeq float %89, 0.000000e+00, !dbg !25 + %91 = tail call float @llvm.nvvm.div.full(float %54, float %89), !dbg !26 + %92 = select i1 %90, float 0.000000e+00, float %91, !dbg !27 + %93 = fmul float %92, %88, !dbg !28 + %94 = fadd float %83, %93, !dbg !29 + %95 = fmul float %88, %88, !dbg !30 + %96 = fmul float %78, %95, !dbg !31 + %97 = fmul float %92, %96, !dbg !32 + %98 = fadd float %87, %97, !dbg !33 + %99 = fsub float %51, %94, !dbg !18 + %100 = select i1 %9, float 6.000000e+00, float 0.000000e+00, !dbg !24 + %101 = fcmp oeq float %100, 0.000000e+00, !dbg !25 + %102 = tail call float @llvm.nvvm.div.full(float %54, float %100), !dbg !26 + %103 = select i1 %101, float 0.000000e+00, float %102, !dbg !27 + %104 = fmul float %103, %99, !dbg !28 + %105 = fadd float %94, %104, !dbg !29 + %106 = fmul float %99, %99, !dbg !30 + %107 = fmul float %89, %106, !dbg !31 + %108 = fmul float %103, %107, !dbg !32 + %109 = fadd float %98, %108, !dbg !33 + %110 = fsub float %52, %105, !dbg !18 + %111 = select i1 %9, float 7.000000e+00, float 0.000000e+00, !dbg !24 + %112 = fcmp oeq float %111, 0.000000e+00, !dbg !25 + %113 = tail call float @llvm.nvvm.div.full(float %54, float %111), !dbg !26 + %114 = select i1 %112, float 0.000000e+00, float %113, !dbg !27 + %115 = fmul float %114, %110, !dbg !28 + %116 = fadd float %105, %115, !dbg !29 + %117 = fmul float %110, %110, !dbg !30 + %118 = fmul float %100, %117, !dbg !31 + %119 = fmul float %114, %118, !dbg !32 + %120 = fadd float %109, %119, !dbg !33 + %121 = fsub float %53, %116, !dbg !18 + %122 = select i1 %9, float 8.000000e+00, float 0.000000e+00, !dbg !24 + %123 = fcmp oeq float %122, 0.000000e+00, !dbg !25 + %124 = tail call float @llvm.nvvm.div.full(float %54, float %122), !dbg !26 + %125 = select i1 %123, float 0.000000e+00, float %124, !dbg !27 + %126 = fmul float %125, %121, !dbg !28 + %127 = fadd float %116, %126, !dbg !29 + %128 = fmul float %121, %121, !dbg !30 + %129 = fmul float %111, %128, !dbg !31 + %130 = fmul float %125, %129, !dbg !32 + %131 = fadd float %120, %130, !dbg !33 + %132 = bitcast float %127 to i32, !dbg !21 + %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 16, i32 31), !dbg !21 + %134 = bitcast i32 %133 to float, !dbg !21 + %135 = bitcast float %131 to i32, !dbg !21 + %136 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %135, i32 16, i32 31), !dbg !21 + %137 = bitcast i32 %136 to float, !dbg !21 + %138 = bitcast float %122 to i32, !dbg !21 + %139 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %138, i32 16, i32 31), !dbg !21 + %140 = bitcast i32 %139 to float, !dbg !21 + %141 = fsub float %134, %127, !dbg !18 + %142 = fadd float %122, %140, !dbg !24 + %143 = fcmp oeq float %142, 0.000000e+00, !dbg !25 + %144 = tail call float @llvm.nvvm.div.full(float %140, float %142), !dbg !26 + %145 = select i1 %143, float 0.000000e+00, float %144, !dbg !27 + %146 = fmul float %145, %141, !dbg !28 + %147 = fadd float %127, %146, !dbg !29 + %148 = fadd float %131, %137, !dbg !34 + %149 = fmul float %141, %141, !dbg !30 + %150 = fmul float %122, %149, !dbg !31 + %151 = fmul float %145, %150, !dbg !32 + %152 = fadd float %148, %151, !dbg !33 + %153 = bitcast float %147 to i32, !dbg !21 + %154 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 8, i32 31), !dbg !21 + %155 = bitcast i32 %154 to float, !dbg !21 + %156 = bitcast float %152 to i32, !dbg !21 + %157 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %156, i32 8, i32 31), !dbg !21 + %158 = bitcast i32 %157 to float, !dbg !21 + %159 = bitcast float %142 to i32, !dbg !21 + %160 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %159, i32 8, i32 31), !dbg !21 + %161 = bitcast i32 %160 to float, !dbg !21 + %162 = fsub float %155, %147, !dbg !18 + %163 = fadd float %142, %161, !dbg !24 + %164 = fcmp oeq float %163, 0.000000e+00, !dbg !25 + %165 = tail call float @llvm.nvvm.div.full(float %161, float %163), !dbg !26 + %166 = select i1 %164, float 0.000000e+00, float %165, !dbg !27 + %167 = fmul float %166, %162, !dbg !28 + %168 = fadd float %147, %167, !dbg !29 + %169 = fadd float %152, %158, !dbg !34 + %170 = fmul float %162, %162, !dbg !30 + %171 = fmul float %142, %170, !dbg !31 + %172 = fmul float %166, %171, !dbg !32 + %173 = fadd float %169, %172, !dbg !33 + %174 = bitcast float %168 to i32, !dbg !21 + %175 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %174, i32 4, i32 31), !dbg !21 + %176 = bitcast i32 %175 to float, !dbg !21 + %177 = bitcast float %173 to i32, !dbg !21 + %178 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %177, i32 4, i32 31), !dbg !21 + %179 = bitcast i32 %178 to float, !dbg !21 + %180 = bitcast float %163 to i32, !dbg !21 + %181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %180, i32 4, i32 31), !dbg !21 + %182 = bitcast i32 %181 to float, !dbg !21 + %183 = fsub float %176, %168, !dbg !18 + %184 = fadd float %163, %182, !dbg !24 + %185 = fcmp oeq float %184, 0.000000e+00, !dbg !25 + %186 = tail call float @llvm.nvvm.div.full(float %182, float %184), !dbg !26 + %187 = select i1 %185, float 0.000000e+00, float %186, !dbg !27 + %188 = fmul float %187, %183, !dbg !28 + %189 = fadd float %168, %188, !dbg !29 + %190 = fadd float %173, %179, !dbg !34 + %191 = fmul float %183, %183, !dbg !30 + %192 = fmul float %163, %191, !dbg !31 + %193 = fmul float %187, %192, !dbg !32 + %194 = fadd float %190, %193, !dbg !33 + %195 = bitcast float %189 to i32, !dbg !21 + %196 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %195, i32 2, i32 31), !dbg !21 + %197 = bitcast i32 %196 to float, !dbg !21 + %198 = bitcast float %194 to i32, !dbg !21 + %199 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %198, i32 2, i32 31), !dbg !21 + %200 = bitcast i32 %199 to float, !dbg !21 + %201 = bitcast float %184 to i32, !dbg !21 + %202 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %201, i32 2, i32 31), !dbg !21 + %203 = bitcast i32 %202 to float, !dbg !21 + %204 = fsub float %197, %189, !dbg !18 + %205 = fadd float %184, %203, !dbg !24 + %206 = fcmp oeq float %205, 0.000000e+00, !dbg !25 + %207 = tail call float @llvm.nvvm.div.full(float %203, float %205), !dbg !26 + %208 = select i1 %206, float 0.000000e+00, float %207, !dbg !27 + %209 = fmul float %208, %204, !dbg !28 + %210 = fadd float %189, %209, !dbg !29 + %211 = fadd float %194, %200, !dbg !34 + %212 = fmul float %204, %204, !dbg !30 + %213 = fmul float %184, %212, !dbg !31 + %214 = fmul float %208, %213, !dbg !32 + %215 = fadd float %211, %214, !dbg !33 + %216 = bitcast float %210 to i32, !dbg !21 + %217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 1, i32 31), !dbg !21 + %218 = bitcast i32 %217 to float, !dbg !21 + %219 = bitcast float %215 to i32, !dbg !21 + %220 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %219, i32 1, i32 31), !dbg !21 + %221 = bitcast i32 %220 to float, !dbg !21 + %222 = bitcast float %205 to i32, !dbg !21 + %223 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %222, i32 1, i32 31), !dbg !21 + %224 = bitcast i32 %223 to float, !dbg !21 + %225 = fsub float %218, %210, !dbg !18 + %226 = fadd float %205, %224, !dbg !24 + %227 = fcmp oeq float %226, 0.000000e+00, !dbg !25 + %228 = tail call float @llvm.nvvm.div.full(float %224, float %226), !dbg !26 + %229 = select i1 %227, float 0.000000e+00, float %228, !dbg !27 + %230 = fmul float %229, %225, !dbg !28 + %231 = fadd float %210, %230, !dbg !29 + %232 = fadd float %215, %221, !dbg !34 + %233 = fmul float %225, %225, !dbg !30 + %234 = fmul float %205, %233, !dbg !31 + %235 = fmul float %229, %234, !dbg !32 + %236 = fadd float %232, %235, !dbg !33 + %237 = icmp eq i32 %12, 0, !dbg !21 + %238 = getelementptr float, ptr addrspace(3) @global_smem, i32 %13, !dbg !21 + %239 = bitcast float %231 to <1 x i32>, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %238, <1 x i32> %239, i1 %237) #6, !dbg !21 + %240 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %13, !dbg !21 + %241 = bitcast float %236 to <1 x i32>, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %240, <1 x i32> %241, i1 %237) #6, !dbg !21 + %242 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %13, !dbg !21 + %243 = bitcast float %226 to <1 x i32>, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %242, <1 x i32> %243, i1 %237) #6, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %244 = icmp samesign ult i32 %11, 16, !dbg !21 + %245 = getelementptr float, ptr addrspace(3) @global_smem, i32 %11, !dbg !21 + %246 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %245, i1 %244) #6, !dbg !21 + %247 = bitcast i32 %246 to float, !dbg !21 + %248 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %11, !dbg !21 + %249 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %248, i1 %244) #6, !dbg !21 + %250 = bitcast i32 %249 to float, !dbg !21 + %251 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %11, !dbg !21 + %252 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %251, i1 %244) #6, !dbg !21 + %253 = bitcast i32 %252 to float, !dbg !21 + %254 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %246, i32 8, i32 31), !dbg !21 + %255 = bitcast i32 %254 to float, !dbg !21 + %256 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %249, i32 8, i32 31), !dbg !21 + %257 = bitcast i32 %256 to float, !dbg !21 + %258 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %252, i32 8, i32 31), !dbg !21 + %259 = bitcast i32 %258 to float, !dbg !21 + %260 = fsub float %255, %247, !dbg !18 + %261 = fadd float %253, %259, !dbg !24 + %262 = fcmp oeq float %261, 0.000000e+00, !dbg !25 + %263 = tail call float @llvm.nvvm.div.full(float %259, float %261), !dbg !26 + %264 = select i1 %262, float 0.000000e+00, float %263, !dbg !27 + %265 = fmul float %260, %264, !dbg !28 + %266 = fadd float %265, %247, !dbg !29 + %267 = fadd float %250, %257, !dbg !34 + %268 = fmul float %260, %260, !dbg !30 + %269 = fmul float %268, %253, !dbg !31 + %270 = fmul float %269, %264, !dbg !32 + %271 = fadd float %267, %270, !dbg !33 + %272 = bitcast float %266 to i32, !dbg !21 + %273 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 4, i32 31), !dbg !21 + %274 = bitcast i32 %273 to float, !dbg !21 + %275 = bitcast float %271 to i32, !dbg !21 + %276 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %275, i32 4, i32 31), !dbg !21 + %277 = bitcast i32 %276 to float, !dbg !21 + %278 = bitcast float %261 to i32, !dbg !21 + %279 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %278, i32 4, i32 31), !dbg !21 + %280 = bitcast i32 %279 to float, !dbg !21 + %281 = fsub float %274, %266, !dbg !18 + %282 = fadd float %261, %280, !dbg !24 + %283 = fcmp oeq float %282, 0.000000e+00, !dbg !25 + %284 = tail call float @llvm.nvvm.div.full(float %280, float %282), !dbg !26 + %285 = select i1 %283, float 0.000000e+00, float %284, !dbg !27 + %286 = fmul float %281, %285, !dbg !28 + %287 = fadd float %266, %286, !dbg !29 + %288 = fadd float %271, %277, !dbg !34 + %289 = fmul float %281, %281, !dbg !30 + %290 = fmul float %261, %289, !dbg !31 + %291 = fmul float %285, %290, !dbg !32 + %292 = fadd float %288, %291, !dbg !33 + %293 = bitcast float %287 to i32, !dbg !21 + %294 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %293, i32 2, i32 31), !dbg !21 + %295 = bitcast i32 %294 to float, !dbg !21 + %296 = bitcast float %292 to i32, !dbg !21 + %297 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %296, i32 2, i32 31), !dbg !21 + %298 = bitcast i32 %297 to float, !dbg !21 + %299 = bitcast float %282 to i32, !dbg !21 + %300 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %299, i32 2, i32 31), !dbg !21 + %301 = bitcast i32 %300 to float, !dbg !21 + %302 = fsub float %295, %287, !dbg !18 + %303 = fadd float %282, %301, !dbg !24 + %304 = fcmp oeq float %303, 0.000000e+00, !dbg !25 + %305 = tail call float @llvm.nvvm.div.full(float %301, float %303), !dbg !26 + %306 = select i1 %304, float 0.000000e+00, float %305, !dbg !27 + %307 = fmul float %302, %306, !dbg !28 + %308 = fadd float %287, %307, !dbg !29 + %309 = fadd float %292, %298, !dbg !34 + %310 = fmul float %302, %302, !dbg !30 + %311 = fmul float %282, %310, !dbg !31 + %312 = fmul float %306, %311, !dbg !32 + %313 = fadd float %309, %312, !dbg !33 + %314 = bitcast float %308 to i32, !dbg !21 + %315 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %314, i32 1, i32 31), !dbg !21 + %316 = bitcast i32 %315 to float, !dbg !21 + %317 = bitcast float %313 to i32, !dbg !21 + %318 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %317, i32 1, i32 31), !dbg !21 + %319 = bitcast i32 %318 to float, !dbg !21 + %320 = bitcast float %303 to i32, !dbg !21 + %321 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %320, i32 1, i32 31), !dbg !21 + %322 = bitcast i32 %321 to float, !dbg !21 + %323 = fsub float %316, %308, !dbg !18 + %324 = fadd float %303, %322, !dbg !24 + %325 = fcmp oeq float %324, 0.000000e+00, !dbg !25 + %326 = tail call float @llvm.nvvm.div.full(float %322, float %324), !dbg !26 + %327 = select i1 %325, float 0.000000e+00, float %326, !dbg !27 + %328 = fmul float %323, %327, !dbg !28 + %329 = fadd float %308, %328, !dbg !29 + %330 = fadd float %313, %319, !dbg !34 + %331 = fmul float %323, %323, !dbg !30 + %332 = fmul float %303, %331, !dbg !31 + %333 = fmul float %327, %332, !dbg !32 + %334 = fadd float %330, %333, !dbg !33 + %335 = and i32 %10, 15, !dbg !21 + %336 = icmp eq i32 %335, 0, !dbg !21 + %337 = and i1 %244, %336, !dbg !21 + %338 = bitcast float %329 to <1 x i32>, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %245, <1 x i32> %338, i1 %337) #6, !dbg !21 + %339 = bitcast float %334 to <1 x i32>, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %248, <1 x i32> %339, i1 %337) #6, !dbg !21 + %340 = bitcast float %324 to <1 x i32>, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %251, <1 x i32> %340, i1 %337) #6, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %341 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !21 + %342 = load float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !21 + %343 = zext nneg i32 %15 to i64, !dbg !35 + %344 = getelementptr bfloat, ptr addrspace(1) %1, i64 %343, !dbg !35 + %345 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !36 + %346 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %344, i64 %345, i1 true) #6, !dbg !36 + %347 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !37 + %348 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %19, i64 %347, i1 %9) #6, !dbg !37 + %349 = getelementptr bfloat, ptr addrspace(1) %2, i64 %343, !dbg !38 + %350 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !39 + %351 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %349, i64 %350, i1 true) #6, !dbg !39 + %352 = tail call float @llvm.nvvm.div.full(float %342, float 4.096000e+03), !dbg !40 + %353 = fadd float %352, 0x3EB0C6F7A0000000, !dbg !41 + %354 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42 + %355 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42 + %356 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42 + %357 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42 + %358 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42 + %359 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42 + %360 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42 + %361 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42 + %.not.i19 = icmp eq i32 %361, 0, !dbg !42 + br i1 %.not.i19, label %364, label %362, !dbg !42 + +362: ; preds = %__nv_rsqrtf.exit + %363 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %353), !dbg !42 + br label %__nv_rsqrtf.exit21, !dbg !42 + +364: ; preds = %__nv_rsqrtf.exit + %365 = tail call float @llvm.nvvm.rsqrt.approx.f(float %353), !dbg !42 + br label %__nv_rsqrtf.exit21, !dbg !42 + +__nv_rsqrtf.exit21: ; preds = %362, %364 + %.0.i20 = phi float [ %363, %362 ], [ %365, %364 ], !dbg !42 + %366 = extractvalue { i32, i32, i32, i32 } %348, 3, !dbg !37 + %367 = bitcast i32 %366 to <2 x bfloat>, !dbg !37 + %368 = extractvalue { i32, i32, i32, i32 } %348, 2, !dbg !37 + %369 = bitcast i32 %368 to <2 x bfloat>, !dbg !37 + %370 = extractvalue { i32, i32, i32, i32 } %348, 1, !dbg !37 + %371 = bitcast i32 %370 to <2 x bfloat>, !dbg !37 + %372 = extractvalue { i32, i32, i32, i32 } %348, 0, !dbg !37 + %373 = bitcast i32 %372 to <2 x bfloat>, !dbg !37 + %374 = extractvalue { i32, i32, i32, i32 } %346, 3, !dbg !36 + %375 = bitcast i32 %374 to <2 x bfloat>, !dbg !36 + %376 = extractvalue { i32, i32, i32, i32 } %346, 2, !dbg !36 + %377 = bitcast i32 %376 to <2 x bfloat>, !dbg !36 + %378 = extractvalue { i32, i32, i32, i32 } %346, 1, !dbg !36 + %379 = bitcast i32 %378 to <2 x bfloat>, !dbg !36 + %380 = extractvalue { i32, i32, i32, i32 } %346, 0, !dbg !36 + %381 = bitcast i32 %380 to <2 x bfloat>, !dbg !36 + %382 = extractvalue { i32, i32, i32, i32 } %351, 3, !dbg !39 + %383 = bitcast i32 %382 to <2 x bfloat>, !dbg !39 + %384 = extractvalue { i32, i32, i32, i32 } %351, 2, !dbg !39 + %385 = bitcast i32 %384 to <2 x bfloat>, !dbg !39 + %386 = extractvalue { i32, i32, i32, i32 } %351, 1, !dbg !39 + %387 = bitcast i32 %386 to <2 x bfloat>, !dbg !39 + %388 = extractvalue { i32, i32, i32, i32 } %351, 0, !dbg !39 + %389 = bitcast i32 %388 to <2 x bfloat>, !dbg !39 + %390 = getelementptr bfloat, ptr addrspace(1) %3, i64 %18, !dbg !43 + %391 = fpext <2 x bfloat> %373 to <2 x float>, !dbg !44 + %392 = insertelement <2 x float> poison, float %341, i64 0, !dbg !45 + %393 = shufflevector <2 x float> %392, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !45 + %394 = fsub <2 x float> %391, %393, !dbg !45 + %395 = fpext <2 x bfloat> %381 to <2 x float>, !dbg !46 + %396 = fadd <2 x float> %395, splat (float 1.000000e+00), !dbg !47 + %397 = fpext <2 x bfloat> %389 to <2 x float>, !dbg !48 + %398 = insertelement <2 x float> poison, float %.0.i20, i64 0, !dbg !49 + %399 = shufflevector <2 x float> %398, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !49 + %400 = fmul <2 x float> %394, %399, !dbg !49 + %401 = fmul <2 x float> %396, %400, !dbg !50 + %402 = fadd <2 x float> %401, %397, !dbg !51 + %403 = fptrunc <2 x float> %402 to <2 x bfloat>, !dbg !52 + %404 = fpext <2 x bfloat> %371 to <2 x float>, !dbg !44 + %405 = fsub <2 x float> %404, %393, !dbg !45 + %406 = fpext <2 x bfloat> %379 to <2 x float>, !dbg !46 + %407 = fadd <2 x float> %406, splat (float 1.000000e+00), !dbg !47 + %408 = fpext <2 x bfloat> %387 to <2 x float>, !dbg !48 + %409 = fmul <2 x float> %405, %399, !dbg !49 + %410 = fmul <2 x float> %407, %409, !dbg !50 + %411 = fadd <2 x float> %410, %408, !dbg !51 + %412 = fptrunc <2 x float> %411 to <2 x bfloat>, !dbg !52 + %413 = fpext <2 x bfloat> %369 to <2 x float>, !dbg !44 + %414 = fsub <2 x float> %413, %393, !dbg !45 + %415 = fpext <2 x bfloat> %377 to <2 x float>, !dbg !46 + %416 = fadd <2 x float> %415, splat (float 1.000000e+00), !dbg !47 + %417 = fpext <2 x bfloat> %385 to <2 x float>, !dbg !48 + %418 = fmul <2 x float> %414, %399, !dbg !49 + %419 = fmul <2 x float> %416, %418, !dbg !50 + %420 = fadd <2 x float> %419, %417, !dbg !51 + %421 = fptrunc <2 x float> %420 to <2 x bfloat>, !dbg !52 + %422 = fpext <2 x bfloat> %367 to <2 x float>, !dbg !44 + %423 = fsub <2 x float> %422, %393, !dbg !45 + %424 = fpext <2 x bfloat> %375 to <2 x float>, !dbg !46 + %425 = fadd <2 x float> %424, splat (float 1.000000e+00), !dbg !47 + %426 = fpext <2 x bfloat> %383 to <2 x float>, !dbg !48 + %427 = fmul <2 x float> %423, %399, !dbg !49 + %428 = fmul <2 x float> %425, %427, !dbg !50 + %429 = fadd <2 x float> %428, %426, !dbg !51 + %430 = fptrunc <2 x float> %429 to <2 x bfloat>, !dbg !52 + %431 = bitcast <2 x bfloat> %403 to i32, !dbg !52 + %432 = bitcast <2 x bfloat> %412 to i32, !dbg !52 + %433 = bitcast <2 x bfloat> %421 to i32, !dbg !52 + %434 = bitcast <2 x bfloat> %430 to i32, !dbg !52 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %431, i32 %432, i32 %433, i32 %434, ptr addrspace(1) %390, i1 %9) #6, !dbg !52 + ret void, !dbg !53 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused_add_mul_native_layer_norm_0", linkageName: "triton_red_fused_add_mul_native_layer_norm_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 25, column: 21, scope: !5) +!10 = !DILocation(line: 26, column: 37, scope: !5) +!11 = !DILocation(line: 38, column: 46, scope: !5) +!12 = !DILocation(line: 38, column: 41, scope: !5) +!13 = !DILocation(line: 38, column: 34, scope: !5) +!14 = !DILocation(line: 38, column: 51, scope: !5) +!15 = !DILocation(line: 38, column: 112, scope: !5) +!16 = !DILocation(line: 44, column: 62, scope: !5) +!17 = !DILocation(line: 46, column: 66, scope: !5) +!18 = !DILocation(line: 231, column: 21, scope: !19, inlinedAt: !21) +!19 = distinct !DILexicalBlockFile(scope: !5, file: !20, discriminator: 0) +!20 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime") +!21 = !DILocation(line: 243, column: 46, scope: !19, inlinedAt: !22) +!22 = !DILocation(line: 47, column: 79, scope: !23) +!23 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!24 = !DILocation(line: 232, column: 28, scope: !19, inlinedAt: !21) +!25 = !DILocation(line: 233, column: 39, scope: !19, inlinedAt: !21) +!26 = !DILocation(line: 233, column: 60, scope: !19, inlinedAt: !21) +!27 = !DILocation(line: 233, column: 49, scope: !19, inlinedAt: !21) +!28 = !DILocation(line: 235, column: 25, scope: !19, inlinedAt: !21) +!29 = !DILocation(line: 235, column: 17, scope: !19, inlinedAt: !21) +!30 = !DILocation(line: 236, column: 30, scope: !19, inlinedAt: !21) +!31 = !DILocation(line: 236, column: 38, scope: !19, inlinedAt: !21) +!32 = !DILocation(line: 236, column: 49, scope: !19, inlinedAt: !21) +!33 = !DILocation(line: 236, column: 22, scope: !19, inlinedAt: !21) +!34 = !DILocation(line: 236, column: 15, scope: !19, inlinedAt: !21) +!35 = !DILocation(line: 57, column: 34, scope: !5) +!36 = !DILocation(line: 57, column: 41, scope: !5) +!37 = !DILocation(line: 58, column: 52, scope: !5) +!38 = !DILocation(line: 59, column: 35, scope: !5) +!39 = !DILocation(line: 59, column: 42, scope: !5) +!40 = !DILocation(line: 65, column: 24, scope: !5) +!41 = !DILocation(line: 67, column: 24, scope: !5) +!42 = !DILocation(line: 68, column: 32, scope: !5) +!43 = !DILocation(line: 73, column: 29, scope: !5) +!44 = !DILocation(line: 58, column: 114, scope: !5) +!45 = !DILocation(line: 63, column: 24, scope: !5) +!46 = !DILocation(line: 57, column: 94, scope: !5) +!47 = !DILocation(line: 61, column: 23, scope: !5) +!48 = !DILocation(line: 59, column: 95, scope: !5) +!49 = !DILocation(line: 69, column: 24, scope: !5) +!50 = !DILocation(line: 71, column: 24, scope: !5) +!51 = !DILocation(line: 72, column: 24, scope: !5) +!52 = !DILocation(line: 73, column: 53, scope: !5) +!53 = !DILocation(line: 51, column: 4, scope: !5) diff --git a/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.ptx b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..2788ffc3798a0f8a72f374d735e4a318dea4886c --- /dev/null +++ b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.ptx @@ -0,0 +1,1032 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused_add_mul_native_layer_norm_0 // -- Begin function triton_red_fused_add_mul_native_layer_norm_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused_add_mul_native_layer_norm_0 +.visible .entry triton_red_fused_add_mul_native_layer_norm_0( + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_3, + .param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_4, + .param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_7 +) +.reqntid 512 +{ + .reg .pred %p<23>; + .reg .b16 %rs<33>; + .reg .b32 %r<287>; + .reg .b64 %rd<15>; + .loc 1 18 0 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:18:0 + +// %bb.0: // %__nv_rsqrtf.exit + ld.param.b64 %rd9, [triton_red_fused_add_mul_native_layer_norm_0_param_0]; + ld.param.b64 %rd10, [triton_red_fused_add_mul_native_layer_norm_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:23:28 + mov.u32 %r37, %ctaid.x; + .loc 1 25 21 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:25:21 + setp.lt.u32 %p1, %r37, 2048; + ld.param.b64 %rd11, [triton_red_fused_add_mul_native_layer_norm_0_param_2]; + ld.param.b64 %rd12, [triton_red_fused_add_mul_native_layer_norm_0_param_3]; + .loc 1 26 37 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:26:37 + mov.u32 %r38, %tid.x; + and.b32 %r39, %r38, 511; + and.b32 %r40, %r38, 31; + shl.b32 %r41, %r38, 3; + and.b32 %r42, %r41, 4088; + .loc 1 38 46 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:46 + shl.b32 %r43, %r37, 12; + .loc 1 38 41 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:41 + or.b32 %r44, %r42, %r43; + .loc 1 38 34 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:34 + mul.wide.s32 %rd13, %r44, 2; + add.s64 %rd1, %rd9, %rd13; + .loc 1 38 51 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:51 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r5, 0; + // begin inline asm + mov.u32 %r1, %r5; + mov.u32 %r2, %r5; + mov.u32 %r3, %r5; + mov.u32 %r4, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2; + // end inline asm + mov.b32 {%rs1, %rs2}, %r1; + mov.b32 {%rs3, %rs4}, %r2; + mov.b32 {%rs5, %rs6}, %r3; + mov.b32 {%rs7, %rs8}, %r4; + .loc 1 38 112 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:112 + cvt.f32.bf16 %r45, %rs1; + cvt.f32.bf16 %r46, %rs2; + cvt.f32.bf16 %r47, %rs3; + cvt.f32.bf16 %r48, %rs4; + cvt.f32.bf16 %r49, %rs5; + cvt.f32.bf16 %r50, %rs6; + cvt.f32.bf16 %r51, %rs7; + cvt.f32.bf16 %r52, %rs8; + .loc 1 44 62 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:44:62 + selp.f32 %r53, %r45, 0f00000000, %p1; + selp.f32 %r54, %r46, 0f00000000, %p1; + selp.f32 %r55, %r47, 0f00000000, %p1; + selp.f32 %r56, %r48, 0f00000000, %p1; + selp.f32 %r57, %r49, 0f00000000, %p1; + selp.f32 %r58, %r50, 0f00000000, %p1; + selp.f32 %r59, %r51, 0f00000000, %p1; + selp.f32 %r60, %r52, 0f00000000, %p1; + .loc 1 46 66 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:46:66 + selp.f32 %r61, 0f3F800000, 0f00000000, %p1; +$L__tmp1: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r62, %r54, %r53; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r63, 0f40000000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p6, %r63, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r64, %r61, %r63; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r65, 0f00000000, %r64, %p6; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r66, %r65, %r62, %r53; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r67, %r62, %r62; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r68, %r61, %r67; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r69, %r65, %r68, 0f00000000; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r70, %r55, %r66; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r71, 0f40400000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p7, %r71, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r72, %r61, %r71; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r73, 0f00000000, %r72, %p7; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r74, %r73, %r70, %r66; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r75, %r70, %r70; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r76, %r63, %r75; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r77, %r73, %r76, %r69; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r78, %r56, %r74; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r79, 0f40800000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p8, %r79, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r80, %r61, %r79; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r81, 0f00000000, %r80, %p8; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r82, %r81, %r78, %r74; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r83, %r78, %r78; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r84, %r71, %r83; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r85, %r81, %r84, %r77; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r86, %r57, %r82; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r87, 0f40A00000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p9, %r87, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r88, %r61, %r87; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r89, 0f00000000, %r88, %p9; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r90, %r89, %r86, %r82; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r91, %r86, %r86; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r92, %r79, %r91; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r93, %r89, %r92, %r85; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r94, %r58, %r90; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r95, 0f40C00000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p10, %r95, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r96, %r61, %r95; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r97, 0f00000000, %r96, %p10; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r98, %r97, %r94, %r90; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r99, %r94, %r94; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r100, %r87, %r99; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r101, %r97, %r100, %r93; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r102, %r59, %r98; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r103, 0f40E00000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p11, %r103, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r104, %r61, %r103; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r105, 0f00000000, %r104, %p11; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r106, %r105, %r102, %r98; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r107, %r102, %r102; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r108, %r95, %r107; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r109, %r105, %r108, %r101; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r110, %r60, %r106; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r111, 0f41000000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p12, %r111, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r112, %r61, %r111; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r113, 0f00000000, %r112, %p12; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r114, %r113, %r110, %r106; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r115, %r110, %r110; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r116, %r103, %r115; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r117, %r113, %r116, %r109; +$L__tmp2: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] + shfl.sync.bfly.b32 %r118, %r114, 16, 31, -1; + shfl.sync.bfly.b32 %r119, %r117, 16, 31, -1; + shfl.sync.bfly.b32 %r120, %r111, 16, 31, -1; +$L__tmp3: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r121, %r118, %r114; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r122, %r111, %r120; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p13, %r122, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r123, %r120, %r122; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r124, 0f00000000, %r123, %p13; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r125, %r124, %r121, %r114; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r126, %r117, %r119; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r127, %r121, %r121; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r128, %r111, %r127; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r129, %r124, %r128, %r126; +$L__tmp4: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] + shfl.sync.bfly.b32 %r130, %r125, 8, 31, -1; + shfl.sync.bfly.b32 %r131, %r129, 8, 31, -1; + shfl.sync.bfly.b32 %r132, %r122, 8, 31, -1; +$L__tmp5: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r133, %r130, %r125; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r134, %r122, %r132; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p14, %r134, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r135, %r132, %r134; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r136, 0f00000000, %r135, %p14; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r137, %r136, %r133, %r125; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r138, %r129, %r131; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r139, %r133, %r133; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r140, %r122, %r139; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r141, %r136, %r140, %r138; +$L__tmp6: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] + shfl.sync.bfly.b32 %r142, %r137, 4, 31, -1; + shfl.sync.bfly.b32 %r143, %r141, 4, 31, -1; + shfl.sync.bfly.b32 %r144, %r134, 4, 31, -1; +$L__tmp7: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r145, %r142, %r137; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r146, %r134, %r144; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p15, %r146, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r147, %r144, %r146; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r148, 0f00000000, %r147, %p15; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r149, %r148, %r145, %r137; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r150, %r141, %r143; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r151, %r145, %r145; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r152, %r134, %r151; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r153, %r148, %r152, %r150; +$L__tmp8: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] + shfl.sync.bfly.b32 %r154, %r149, 2, 31, -1; + shfl.sync.bfly.b32 %r155, %r153, 2, 31, -1; + shfl.sync.bfly.b32 %r156, %r146, 2, 31, -1; +$L__tmp9: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r157, %r154, %r149; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r158, %r146, %r156; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p16, %r158, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r159, %r156, %r158; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r160, 0f00000000, %r159, %p16; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r161, %r160, %r157, %r149; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r162, %r153, %r155; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r163, %r157, %r157; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r164, %r146, %r163; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r165, %r160, %r164, %r162; +$L__tmp10: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] + shfl.sync.bfly.b32 %r166, %r161, 1, 31, -1; + shfl.sync.bfly.b32 %r167, %r165, 1, 31, -1; + shfl.sync.bfly.b32 %r168, %r158, 1, 31, -1; +$L__tmp11: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r169, %r166, %r161; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r11, %r158, %r168; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p17, %r11, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r170, %r168, %r11; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r171, 0f00000000, %r170, %p17; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r7, %r171, %r169, %r161; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r172, %r165, %r167; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r173, %r169, %r169; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r174, %r158, %r173; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r9, %r171, %r174, %r172; +$L__tmp12: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] + setp.eq.b32 %p2, %r40, 0; + shr.u32 %r175, %r38, 3; + and.b32 %r176, %r175, 60; + mov.b32 %r177, global_smem; + add.s32 %r6, %r177, %r176; + // begin inline asm + @%p2 st.shared.b32 [ %r6 + 0 ], %r7; + // end inline asm + add.s32 %r8, %r6, 64; + // begin inline asm + @%p2 st.shared.b32 [ %r8 + 0 ], %r9; + // end inline asm + add.s32 %r10, %r6, 128; + // begin inline asm + @%p2 st.shared.b32 [ %r10 + 0 ], %r11; + // end inline asm + bar.sync 0; + setp.lt.u32 %p3, %r39, 16; + shl.b32 %r178, %r39, 2; + add.s32 %r13, %r177, %r178; + // begin inline asm + @%p3 ld.shared.b32 %r12, [ %r13 + 0 ]; + // end inline asm + add.s32 %r15, %r13, 64; + // begin inline asm + @%p3 ld.shared.b32 %r14, [ %r15 + 0 ]; + // end inline asm + add.s32 %r17, %r13, 128; + // begin inline asm + @%p3 ld.shared.b32 %r16, [ %r17 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r179, %r12, 8, 31, -1; + shfl.sync.bfly.b32 %r180, %r14, 8, 31, -1; + shfl.sync.bfly.b32 %r181, %r16, 8, 31, -1; +$L__tmp13: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r182, %r179, %r12; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r183, %r16, %r181; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p18, %r183, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r184, %r181, %r183; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r185, 0f00000000, %r184, %p18; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r186, %r182, %r185, %r12; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r187, %r14, %r180; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r188, %r182, %r182; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r189, %r188, %r16; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r190, %r189, %r185, %r187; +$L__tmp14: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] + shfl.sync.bfly.b32 %r191, %r186, 4, 31, -1; + shfl.sync.bfly.b32 %r192, %r190, 4, 31, -1; + shfl.sync.bfly.b32 %r193, %r183, 4, 31, -1; +$L__tmp15: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r194, %r191, %r186; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r195, %r183, %r193; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p19, %r195, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r196, %r193, %r195; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r197, 0f00000000, %r196, %p19; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r198, %r194, %r197, %r186; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r199, %r190, %r192; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r200, %r194, %r194; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r201, %r183, %r200; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r202, %r197, %r201, %r199; +$L__tmp16: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] + shfl.sync.bfly.b32 %r203, %r198, 2, 31, -1; + shfl.sync.bfly.b32 %r204, %r202, 2, 31, -1; + shfl.sync.bfly.b32 %r205, %r195, 2, 31, -1; +$L__tmp17: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r206, %r203, %r198; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r207, %r195, %r205; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p20, %r207, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r208, %r205, %r207; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r209, 0f00000000, %r208, %p20; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r210, %r206, %r209, %r198; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r211, %r202, %r204; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r212, %r206, %r206; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r213, %r195, %r212; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r214, %r209, %r213, %r211; +$L__tmp18: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] + shfl.sync.bfly.b32 %r215, %r210, 1, 31, -1; + shfl.sync.bfly.b32 %r216, %r214, 1, 31, -1; + shfl.sync.bfly.b32 %r217, %r207, 1, 31, -1; +$L__tmp19: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + sub.f32 %r218, %r215, %r210; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r20, %r207, %r217; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + setp.eq.f32 %p21, %r20, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + div.full.f32 %r219, %r217, %r20; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + selp.f32 %r220, 0f00000000, %r219, %p21; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r18, %r218, %r220, %r210; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + add.f32 %r221, %r214, %r216; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r222, %r218, %r218; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + mul.f32 %r223, %r207, %r222; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ] + fma.rn.f32 %r19, %r220, %r223, %r221; +$L__tmp20: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] + and.b32 %r224, %r38, 15; + setp.eq.b32 %p22, %r224, 0; + and.pred %p4, %p3, %p22; + // begin inline asm + @%p4 st.shared.b32 [ %r13 + 0 ], %r18; + // end inline asm + // begin inline asm + @%p4 st.shared.b32 [ %r15 + 0 ], %r19; + // end inline asm + // begin inline asm + @%p4 st.shared.b32 [ %r17 + 0 ], %r20; + // end inline asm + bar.sync 0; + ld.shared.b32 %r225, [global_smem]; + ld.shared.b32 %r226, [global_smem+64]; +$L__tmp21: + .loc 1 57 34 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:34 + mul.wide.u32 %rd14, %r42, 2; + add.s64 %rd3, %rd10, %rd14; + .loc 1 57 41 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:41 + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0; + // end inline asm + mov.pred %p5, -1; + // begin inline asm + mov.u32 %r21, %r5; + mov.u32 %r22, %r5; + mov.u32 %r23, %r5; + mov.u32 %r24, %r5; + @%p5 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r21, %r22, %r23, %r24 }, [ %rd3 + 0 ], %rd4; + // end inline asm + .loc 1 58 52 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:52 + // begin inline asm + mov.u64 %rd5, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd5, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r25, %r5; + mov.u32 %r26, %r5; + mov.u32 %r27, %r5; + mov.u32 %r28, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r25, %r26, %r27, %r28 }, [ %rd1 + 0 ], %rd5; + // end inline asm + .loc 1 59 35 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:35 + add.s64 %rd6, %rd11, %rd14; + .loc 1 59 42 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:42 + // begin inline asm + mov.u64 %rd7, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd7, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r29, %r5; + mov.u32 %r30, %r5; + mov.u32 %r31, %r5; + mov.u32 %r32, %r5; + @%p5 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r29, %r30, %r31, %r32 }, [ %rd6 + 0 ], %rd7; + // end inline asm + mov.b32 %r227, 0f45800000; + .loc 1 65 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:65:24 + div.full.f32 %r228, %r226, %r227; + .loc 1 67 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:67:24 + add.f32 %r229, %r228, 0f358637BD; + .loc 1 68 32 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:68:32 + rsqrt.approx.ftz.f32 %r230, %r229; + .loc 1 73 29 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:73:29 + add.s64 %rd8, %rd12, %rd13; + .loc 1 58 114 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:114 + mov.b32 {%rs9, %rs10}, %r25; + cvt.f32.bf16 %r231, %rs10; + cvt.f32.bf16 %r232, %rs9; + .loc 1 63 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:63:24 + sub.f32 %r233, %r232, %r225; + sub.f32 %r234, %r231, %r225; + .loc 1 57 94 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:94 + mov.b32 {%rs11, %rs12}, %r21; + cvt.f32.bf16 %r235, %rs11; + cvt.f32.bf16 %r236, %rs12; + .loc 1 61 23 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:61:23 + add.f32 %r237, %r236, 0f3F800000; + add.f32 %r238, %r235, 0f3F800000; + .loc 1 59 95 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:95 + mov.b32 {%rs13, %rs14}, %r29; + cvt.f32.bf16 %r239, %rs14; + cvt.f32.bf16 %r240, %rs13; + .loc 1 69 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:69:24 + mul.f32 %r241, %r234, %r230; + mul.f32 %r242, %r233, %r230; + .loc 1 72 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:72:24 + fma.rn.f32 %r243, %r238, %r242, %r240; + fma.rn.f32 %r244, %r237, %r241, %r239; + .loc 1 73 53 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:73:53 + cvt.rn.bf16x2.f32 %r33, %r244, %r243; + .loc 1 58 114 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:114 + mov.b32 {%rs15, %rs16}, %r26; + cvt.f32.bf16 %r245, %rs16; + cvt.f32.bf16 %r246, %rs15; + .loc 1 63 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:63:24 + sub.f32 %r247, %r246, %r225; + sub.f32 %r248, %r245, %r225; + .loc 1 57 94 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:94 + mov.b32 {%rs17, %rs18}, %r22; + cvt.f32.bf16 %r249, %rs17; + cvt.f32.bf16 %r250, %rs18; + .loc 1 61 23 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:61:23 + add.f32 %r251, %r250, 0f3F800000; + add.f32 %r252, %r249, 0f3F800000; + .loc 1 59 95 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:95 + mov.b32 {%rs19, %rs20}, %r30; + cvt.f32.bf16 %r253, %rs20; + cvt.f32.bf16 %r254, %rs19; + .loc 1 69 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:69:24 + mul.f32 %r255, %r248, %r230; + mul.f32 %r256, %r247, %r230; + .loc 1 72 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:72:24 + fma.rn.f32 %r257, %r252, %r256, %r254; + fma.rn.f32 %r258, %r251, %r255, %r253; + .loc 1 73 53 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:73:53 + cvt.rn.bf16x2.f32 %r34, %r258, %r257; + .loc 1 58 114 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:114 + mov.b32 {%rs21, %rs22}, %r27; + cvt.f32.bf16 %r259, %rs22; + cvt.f32.bf16 %r260, %rs21; + .loc 1 63 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:63:24 + sub.f32 %r261, %r260, %r225; + sub.f32 %r262, %r259, %r225; + .loc 1 57 94 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:94 + mov.b32 {%rs23, %rs24}, %r23; + cvt.f32.bf16 %r263, %rs23; + cvt.f32.bf16 %r264, %rs24; + .loc 1 61 23 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:61:23 + add.f32 %r265, %r264, 0f3F800000; + add.f32 %r266, %r263, 0f3F800000; + .loc 1 59 95 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:95 + mov.b32 {%rs25, %rs26}, %r31; + cvt.f32.bf16 %r267, %rs26; + cvt.f32.bf16 %r268, %rs25; + .loc 1 69 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:69:24 + mul.f32 %r269, %r262, %r230; + mul.f32 %r270, %r261, %r230; + .loc 1 72 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:72:24 + fma.rn.f32 %r271, %r266, %r270, %r268; + fma.rn.f32 %r272, %r265, %r269, %r267; + .loc 1 73 53 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:73:53 + cvt.rn.bf16x2.f32 %r35, %r272, %r271; + .loc 1 58 114 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:114 + mov.b32 {%rs27, %rs28}, %r28; + cvt.f32.bf16 %r273, %rs28; + cvt.f32.bf16 %r274, %rs27; + .loc 1 63 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:63:24 + sub.f32 %r275, %r274, %r225; + sub.f32 %r276, %r273, %r225; + .loc 1 57 94 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:94 + mov.b32 {%rs29, %rs30}, %r24; + cvt.f32.bf16 %r277, %rs29; + cvt.f32.bf16 %r278, %rs30; + .loc 1 61 23 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:61:23 + add.f32 %r279, %r278, 0f3F800000; + add.f32 %r280, %r277, 0f3F800000; + .loc 1 59 95 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:95 + mov.b32 {%rs31, %rs32}, %r32; + cvt.f32.bf16 %r281, %rs32; + cvt.f32.bf16 %r282, %rs31; + .loc 1 69 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:69:24 + mul.f32 %r283, %r276, %r230; + mul.f32 %r284, %r275, %r230; + .loc 1 72 24 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:72:24 + fma.rn.f32 %r285, %r280, %r284, %r282; + fma.rn.f32 %r286, %r279, %r283, %r281; + .loc 1 73 53 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:73:53 + cvt.rn.bf16x2.f32 %r36, %r286, %r285; + // begin inline asm + @%p1 st.global.v4.b32 [ %rd8 + 0 ], { %r33, %r34, %r35, %r36 }; + // end inline asm + .loc 1 51 4 // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:51:4 + ret; +$L__tmp22: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 343 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x150 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 119 +.b8 119 +.b8 105 +.b8 122 +.b8 122 +.b8 106 +.b8 119 +.b8 109 +.b8 100 +.b8 52 +.b8 97 +.b8 106 +.b8 108 +.b8 117 +.b8 98 +.b8 120 +.b8 112 +.b8 118 +.b8 120 +.b8 105 +.b8 100 +.b8 106 +.b8 105 +.b8 121 +.b8 51 +.b8 108 +.b8 100 +.b8 118 +.b8 53 +.b8 101 +.b8 102 +.b8 108 +.b8 119 +.b8 108 +.b8 117 +.b8 100 +.b8 103 +.b8 105 +.b8 122 +.b8 99 +.b8 97 +.b8 104 +.b8 118 +.b8 115 +.b8 112 +.b8 52 +.b8 105 +.b8 55 +.b8 53 +.b8 115 +.b8 50 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 119 +.b8 119 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x2f DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 100 +.b8 100 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 110 +.b8 97 +.b8 116 +.b8 105 +.b8 118 +.b8 101 +.b8 95 +.b8 108 +.b8 97 +.b8 121 +.b8 101 +.b8 114 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x113:0x47 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x128:0x31 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp21 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 47 // DW_AT_call_line +.b8 79 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x140:0x18 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp20 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 243 // DW_AT_call_line +.b8 46 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.source b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.source new file mode 100644 index 0000000000000000000000000000000000000000..20e39eee37abfa344439ff226f13b900d337b002 --- /dev/null +++ b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.source @@ -0,0 +1,420 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":18:0) +#loc72 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":216:0) +#loc85 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":133:0) +#loc89 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":242:0) +#loc91 = loc(unknown) +#loc94 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":230:0) +#loc109 = loc("in_ptr0"(#loc)) +#loc110 = loc("in_ptr1"(#loc)) +#loc111 = loc("in_ptr2"(#loc)) +#loc112 = loc("out_ptr2"(#loc)) +#loc113 = loc("xnumel"(#loc)) +#loc114 = loc("r0_numel"(#loc)) +#loc171 = loc("value"(#loc72)) +#loc172 = loc("mean"(#loc72)) +#loc173 = loc("m2"(#loc72)) +#loc174 = loc("weight"(#loc72)) +#loc175 = loc("first_iteration"(#loc72)) +#loc185 = loc("input"(#loc85)) +#loc186 = loc("mean"(#loc89)) +#loc187 = loc("m2"(#loc89)) +#loc188 = loc("weight"(#loc89)) +#loc189 = loc("mean_1"(#loc94)) +#loc190 = loc("m2_1"(#loc94)) +#loc191 = loc("weight_1"(#loc94)) +#loc192 = loc("mean_2"(#loc94)) +#loc193 = loc("m2_2"(#loc94)) +#loc194 = loc("weight_2"(#loc94)) +#loc201 = loc("new_mean"(#loc171)) +module { + tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 2048 : i32 loc(#loc115) + %r0_numel_1 = arith.constant 4096 : i32 loc(#loc116) + %xoffset = tt.get_program_id x : i32 loc(#loc117) + %xoffset_2 = arith.constant 1 : i32 loc(#loc118) + %xoffset_3 = arith.constant 1 : i32 loc(#loc118) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc118) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc119) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc120) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc121) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc121) + %xmask = arith.constant dense<2048> : tensor<1x1xi32> loc(#loc122) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc122) + %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32> loc(#loc123) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32> -> tensor<1x4096xi32> loc(#loc124) + %tmp3_mean = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc125) + %tmp3_m2 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc126) + %tmp3_weight = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc127) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c4096_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %tmp3_weight_10:3 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%tmp3_mean_13 = %tmp3_mean, %tmp3_m2_14 = %tmp3_m2, %tmp3_weight_15 = %tmp3_weight) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4096xi32> loc(#loc129) + %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x4096xi32> loc(#loc129) + %r0_mask = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc130) + %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x4096xi32> loc(#loc130) + %tmp0 = arith.constant 4096 : i32 loc(#loc131) + %tmp0_18 = arith.constant 4096 : i32 loc(#loc131) + %tmp0_19 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc131) + %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc131) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc132) + %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x4096xi32> loc(#loc132) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc133) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc133) + %tmp0_25 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc134) + %tmp0_26 = arith.andi %r0_mask_17, %tmp0_25 : tensor<1x4096xi1> loc(#loc134) + %tmp0_27 = arith.constant 0.000000e+00 : f32 loc(#loc135) + %tmp0_28 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc135) + %tmp0_29 = arith.truncf %tmp0_28 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc135) + %tmp0_30 = tt.load %tmp0_24, %tmp0_26, %tmp0_29 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc135) + %tmp0_31 = arith.extf %tmp0_30 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc136) + %c0_i32_32 = arith.constant 0 : i32 loc(#loc23) + %9 = arith.cmpi eq, %r0_offset, %c0_i32_32 : i32 loc(#loc23) + %10:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_u1__(%tmp0_31, %tmp3_mean_13, %tmp3_m2_14, %tmp3_weight_15, %9) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>, i1) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) loc(#loc24) + %tmp3_mean_33 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc137) + %tmp3_mean_34 = arith.andi %r0_mask_17, %tmp3_mean_33 : tensor<1x4096xi1> loc(#loc137) + %tmp3_mean_35 = arith.select %tmp3_mean_34, %10#0, %tmp3_mean_13 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc138) + %tmp3_m2_36 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc139) + %tmp3_m2_37 = arith.andi %r0_mask_17, %tmp3_m2_36 : tensor<1x4096xi1> loc(#loc139) + %tmp3_m2_38 = arith.select %tmp3_m2_37, %10#1, %tmp3_m2_14 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc140) + %tmp3_weight_39 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc141) + %tmp3_weight_40 = arith.andi %r0_mask_17, %tmp3_weight_39 : tensor<1x4096xi1> loc(#loc141) + %tmp3_weight_41 = arith.select %tmp3_weight_40, %10#2, %tmp3_weight_15 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc142) + scf.yield %tmp3_mean_35, %tmp3_m2_38, %tmp3_weight_41 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc31) + } loc(#loc207) + %4:3 = tt.call @"torch._inductor.runtime.triton_helpers.welford__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S__(3,)cconstexpr_1_"(%tmp3_weight_10#0, %tmp3_weight_10#1, %tmp3_weight_10#2) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc32) + %tmp3 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc143) + %tmp7 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc144) + %tmp8 = tt.expand_dims %4#2 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc145) + %c0_i32_11 = arith.constant 0 : i32 loc(#loc36) + %c4096_i32_12 = arith.constant 4096 : i32 loc(#loc36) + %5 = arith.bitcast %c0_i32_11 : i32 to i32 loc(#loc36) + %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc36) + %7 = arith.bitcast %c4096_i32_12 : i32 to i32 loc(#loc36) + %8 = ub.poison : i32 loc(#loc36) + scf.for %r0_offset = %5 to %6 step %7 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4096xi32> loc(#loc146) + %r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x4096xi32> loc(#loc146) + %r0_mask = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc147) + %r0_mask_14 = arith.cmpi slt, %r0_index_13, %r0_mask : tensor<1x4096xi32> loc(#loc147) + %tmp9 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc148) + %tmp9_15 = tt.addptr %tmp9, %r0_index_13 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc148) + %tmp9_16 = arith.constant 0.000000e+00 : f32 loc(#loc149) + %tmp9_17 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc149) + %tmp9_18 = arith.truncf %tmp9_17 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc149) + %tmp9_19 = tt.load %tmp9_15, %r0_mask_14, %tmp9_18 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc149) + %tmp9_20 = arith.extf %tmp9_19 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc150) + %tmp12 = arith.constant 4096 : i32 loc(#loc151) + %tmp12_21 = arith.constant 4096 : i32 loc(#loc151) + %tmp12_22 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc151) + %tmp12_23 = arith.muli %tmp12_22, %xindex_7 : tensor<1x1xi32> loc(#loc151) + %tmp12_24 = tt.broadcast %tmp12_23 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc152) + %tmp12_25 = arith.addi %r0_index_13, %tmp12_24 : tensor<1x4096xi32> loc(#loc152) + %tmp12_26 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc153) + %tmp12_27 = tt.addptr %tmp12_26, %tmp12_25 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc153) + %tmp12_28 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc154) + %tmp12_29 = arith.andi %r0_mask_14, %tmp12_28 : tensor<1x4096xi1> loc(#loc154) + %tmp12_30 = arith.constant 0.000000e+00 : f32 loc(#loc155) + %tmp12_31 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc155) + %tmp12_32 = arith.truncf %tmp12_31 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc155) + %tmp12_33 = tt.load %tmp12_27, %tmp12_29, %tmp12_32 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr> loc(#loc155) + %tmp12_34 = arith.extf %tmp12_33 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc156) + %tmp23 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc157) + %tmp23_35 = tt.addptr %tmp23, %r0_index_13 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc157) + %tmp23_36 = arith.constant 0.000000e+00 : f32 loc(#loc158) + %tmp23_37 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc158) + %tmp23_38 = arith.truncf %tmp23_37 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc158) + %tmp23_39 = tt.load %tmp23_35, %r0_mask_14, %tmp23_38 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc158) + %tmp23_40 = arith.extf %tmp23_39 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc159) + %tmp10 = arith.constant 1.000000e+00 : f32 loc(#loc160) + %tmp11 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc161) + %tmp11_41 = arith.addf %tmp9_20, %tmp11 : tensor<1x4096xf32> loc(#loc161) + %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc162) + %tmp14_42 = arith.subf %tmp12_34, %tmp14 : tensor<1x4096xf32> loc(#loc162) + %tmp15 = arith.constant 4.096000e+03 : f32 loc(#loc163) + %tmp16 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc164) + %tmp16_43 = arith.divf %tmp7, %tmp16 : tensor<1x1xf32> loc(#loc164) + %tmp17 = arith.constant 9.99999997E-7 : f32 loc(#loc165) + %tmp18 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc166) + %tmp18_44 = arith.addf %tmp16_43, %tmp18 : tensor<1x1xf32> loc(#loc166) + %tmp19 = tt.extern_elementwise %tmp18_44 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc167) + %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc168) + %tmp20_45 = arith.mulf %tmp14_42, %tmp20 : tensor<1x4096xf32> loc(#loc168) + %tmp22 = arith.mulf %tmp11_41, %tmp20_45 : tensor<1x4096xf32> loc(#loc169) + %tmp24 = arith.addf %tmp22, %tmp23_40 : tensor<1x4096xf32> loc(#loc170) + %c4096_i32_46 = arith.constant 4096 : i32 loc(#loc62) + %c4096_i32_47 = arith.constant 4096 : i32 loc(#loc62) + %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc62) + %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc62) + %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc63) + %11 = arith.addi %r0_index_13, %10 : tensor<1x4096xi32> loc(#loc63) + %12 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc64) + %13 = tt.addptr %12, %11 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc64) + %14 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc65) + %15 = arith.andi %r0_mask_14, %14 : tensor<1x4096xi1> loc(#loc65) + %16 = arith.truncf %tmp24 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc66) + tt.store %13, %16, %15 : tensor<1x4096x!tt.ptr> loc(#loc66) + } loc(#loc36) + tt.return loc(#loc67) + } loc(#loc) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() -> tensor<1x4096xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc69) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc69) + tt.return %cst_0 : tensor<1x4096xf32> loc(#loc70) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x4096xf32> loc(#loc71) + tt.return %0 : tensor<1x4096xf32> loc(#loc71) + } loc(#loc68) + tt.func private @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_u1__(%new_mean: tensor<1x4096xf32> loc("new_mean"(#loc171)), %mean: tensor<1x4096xf32> loc("mean"(#loc72)), %m2: tensor<1x4096xf32> loc("m2"(#loc72)), %weight: tensor<1x4096xf32> loc("weight"(#loc72)), %first_iteration: i1 loc("first_iteration"(#loc72))) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) attributes {noinline = false} { + %0:3 = scf.if %first_iteration -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) { + %new_weight = arith.constant 1.000000e+00 : f32 loc(#loc176) + %new_weight_0 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc202) + %new_m2 = tt.call @triton.language.standard.zeros_like__fp32S1_4096S__(%m2) : (tensor<1x4096xf32>) -> tensor<1x4096xf32> loc(#loc203) + scf.yield %new_m2, %new_mean, %new_weight_0 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc203) + } else { + %delta = arith.subf %new_mean, %mean : tensor<1x4096xf32> loc(#loc178) + %new_weight = arith.constant 1 : i32 loc(#loc179) + %new_weight_0 = arith.constant 1.000000e+00 : f32 loc(#loc179) + %new_weight_1 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc179) + %new_weight_2 = arith.addf %weight, %new_weight_1 : tensor<1x4096xf32> loc(#loc204) + %new_mean_3 = arith.divf %delta, %new_weight_2 : tensor<1x4096xf32> loc(#loc180) + %new_mean_4 = arith.addf %mean, %new_mean_3 : tensor<1x4096xf32> loc(#loc205) + %new_m2 = arith.subf %new_mean, %new_mean_4 : tensor<1x4096xf32> loc(#loc182) + %new_m2_5 = arith.mulf %delta, %new_m2 : tensor<1x4096xf32> loc(#loc183) + %new_m2_6 = arith.addf %m2, %new_m2_5 : tensor<1x4096xf32> loc(#loc206) + scf.yield %new_m2_6, %new_mean_4, %new_weight_2 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc184) + } loc(#loc73) + tt.return %0#1, %0#0, %0#2 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc83) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x4096xf32> loc(#loc84) + %2 = ub.poison : tensor<1x4096xf32> loc(#loc84) + %3 = ub.poison : tensor<1x4096xf32> loc(#loc84) + tt.return %1, %2, %3 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc84) + } loc(#loc72) + tt.func private @triton.language.standard.zeros_like__fp32S1_4096S__(%input: tensor<1x4096xf32> loc("input"(#loc85))) -> tensor<1x4096xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc86) + tt.return %0 : tensor<1x4096xf32> loc(#loc87) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x4096xf32> loc(#loc88) + tt.return %1 : tensor<1x4096xf32> loc(#loc88) + } loc(#loc85) + tt.func private @"torch._inductor.runtime.triton_helpers.welford__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S__(3,)cconstexpr_1_"(%mean: tensor<1x4096xf32> loc("mean"(#loc89)), %m2: tensor<1x4096xf32> loc("m2"(#loc89)), %weight: tensor<1x4096xf32> loc("weight"(#loc89))) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} { + %0:3 = "tt.reduce"(%mean, %m2, %weight) <{axis = 1 : i32}> ({ + ^bb0(%arg3: f32 loc(unknown), %arg4: f32 loc(unknown), %arg5: f32 loc(unknown), %arg6: f32 loc(unknown), %arg7: f32 loc(unknown), %arg8: f32 loc(unknown)): + %4:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (f32, f32, f32, f32, f32, f32) -> (f32, f32, f32) loc(#loc90) + tt.reduce.return %4#0, %4#1, %4#2 : f32, f32, f32 loc(#loc90) + }) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc90) + tt.return %0#0, %0#1, %0#2 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc92) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc93) + %2 = ub.poison : tensor<1xf32> loc(#loc93) + %3 = ub.poison : tensor<1xf32> loc(#loc93) + tt.return %1, %2, %3 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc93) + } loc(#loc89) + tt.func private @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%mean_1: f32 loc("mean_1"(#loc94)), %m2_1: f32 loc("m2_1"(#loc94)), %weight_1: f32 loc("weight_1"(#loc94)), %mean_2: f32 loc("mean_2"(#loc94)), %m2_2: f32 loc("m2_2"(#loc94)), %weight_2: f32 loc("weight_2"(#loc94))) -> (f32, f32, f32) attributes {noinline = false} { + %delta = arith.subf %mean_2, %mean_1 : f32 loc(#loc195) + %new_weight = arith.addf %weight_1, %weight_2 : f32 loc(#loc196) + %w2_over_w = arith.constant 0.000000e+00 : f32 loc(#loc197) + %w2_over_w_0 = arith.cmpf oeq, %new_weight, %w2_over_w : f32 loc(#loc197) + %w2_over_w_1 = arith.divf %weight_2, %new_weight : f32 loc(#loc198) + %w2_over_w_2 = arith.constant 0.000000e+00 : f32 loc(#loc199) + %w2_over_w_3 = arith.constant 0.000000e+00 : f32 loc(#loc199) + %w2_over_w_4 = arith.select %w2_over_w_0, %w2_over_w_3, %w2_over_w_1 : f32 loc(#loc199) + %0 = arith.mulf %delta, %w2_over_w_4 : f32 loc(#loc100) + %1 = arith.addf %mean_1, %0 : f32 loc(#loc101) + %2 = arith.addf %m2_1, %m2_2 : f32 loc(#loc102) + %3 = arith.mulf %delta, %delta : f32 loc(#loc103) + %4 = arith.mulf %3, %weight_1 : f32 loc(#loc104) + %5 = arith.mulf %4, %w2_over_w_4 : f32 loc(#loc105) + %6 = arith.addf %2, %5 : f32 loc(#loc106) + tt.return %1, %6, %new_weight : f32, f32, f32 loc(#loc107) + ^bb1: // no predecessors + %7 = ub.poison : f32 loc(#loc108) + %8 = ub.poison : f32 loc(#loc108) + %9 = ub.poison : f32 loc(#loc108) + tt.return %7, %8, %9 : f32, f32, f32 loc(#loc108) + } loc(#loc94) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":25:21) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":29:45) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":30:43) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":31:47) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:46) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:34) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:61) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:51) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:112) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":42:62) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":42:51) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":44:39) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":44:62) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":45:37) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":45:58) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":46:41) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":46:66) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":46:8) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":47:79) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":48:16) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":49:16) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":50:16) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":51:43) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":52:31) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":53:29) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:34) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:41) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:94) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:47) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:42) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:35) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:62) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:52) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:114) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:35) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:42) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:95) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":60:16) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":61:23) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":63:24) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":64:16) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":65:24) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":66:16) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":67:24) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":68:32) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":69:24) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":71:24) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":72:24) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:41) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:36) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:29) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:63) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:53) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":51:4) +#loc68 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":120:0) +#loc69 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:31) +#loc70 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:11) +#loc71 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:4) +#loc73 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7) +#loc74 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":218:46) +#loc75 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31) +#loc76 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24) +#loc77 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30) +#loc78 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34) +#loc79 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26) +#loc80 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39) +#loc81 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31) +#loc82 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22) +#loc83 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:11) +#loc84 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:4) +#loc86 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:30) +#loc87 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:11) +#loc88 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:4) +#loc90 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc92 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:11) +#loc93 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:4) +#loc95 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc96 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc97 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc98 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc99 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc100 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc101 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc102 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc103 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc104 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc105 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc106 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc107 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:11) +#loc108 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:4) +#loc115 = loc("xnumel"(#loc1)) +#loc116 = loc("r0_numel"(#loc2)) +#loc117 = loc("xoffset"(#loc3)) +#loc118 = loc("xoffset"(#loc4)) +#loc119 = loc("xindex"(#loc5)) +#loc120 = loc("xindex"(#loc6)) +#loc121 = loc("xindex"(#loc7)) +#loc122 = loc("xmask"(#loc8)) +#loc123 = loc("r0_base"(#loc9)) +#loc124 = loc("r0_base"(#loc10)) +#loc125 = loc("tmp3_mean"(#loc11)) +#loc126 = loc("tmp3_m2"(#loc12)) +#loc127 = loc("tmp3_weight"(#loc13)) +#loc128 = loc("tmp3_mean"(#loc14)) +#loc129 = loc("r0_index"(#loc15)) +#loc130 = loc("r0_mask"(#loc16)) +#loc131 = loc("tmp0"(#loc17)) +#loc132 = loc("tmp0"(#loc18)) +#loc133 = loc("tmp0"(#loc19)) +#loc134 = loc("tmp0"(#loc20)) +#loc135 = loc("tmp0"(#loc21)) +#loc136 = loc("tmp0"(#loc22)) +#loc137 = loc("tmp3_mean"(#loc25)) +#loc138 = loc("tmp3_mean"(#loc26)) +#loc139 = loc("tmp3_m2"(#loc27)) +#loc140 = loc("tmp3_m2"(#loc28)) +#loc141 = loc("tmp3_weight"(#loc29)) +#loc142 = loc("tmp3_weight"(#loc30)) +#loc143 = loc("tmp3"(#loc33)) +#loc144 = loc("tmp7"(#loc34)) +#loc145 = loc("tmp8"(#loc35)) +#loc146 = loc("r0_index"(#loc37)) +#loc147 = loc("r0_mask"(#loc38)) +#loc148 = loc("tmp9"(#loc39)) +#loc149 = loc("tmp9"(#loc40)) +#loc150 = loc("tmp9"(#loc41)) +#loc151 = loc("tmp12"(#loc42)) +#loc152 = loc("tmp12"(#loc43)) +#loc153 = loc("tmp12"(#loc44)) +#loc154 = loc("tmp12"(#loc45)) +#loc155 = loc("tmp12"(#loc46)) +#loc156 = loc("tmp12"(#loc47)) +#loc157 = loc("tmp23"(#loc48)) +#loc158 = loc("tmp23"(#loc49)) +#loc159 = loc("tmp23"(#loc50)) +#loc160 = loc("tmp10"(#loc51)) +#loc161 = loc("tmp11"(#loc52)) +#loc162 = loc("tmp14"(#loc53)) +#loc163 = loc("tmp15"(#loc54)) +#loc164 = loc("tmp16"(#loc55)) +#loc165 = loc("tmp17"(#loc56)) +#loc166 = loc("tmp18"(#loc57)) +#loc167 = loc("tmp19"(#loc58)) +#loc168 = loc("tmp20"(#loc59)) +#loc169 = loc("tmp22"(#loc60)) +#loc170 = loc("tmp24"(#loc61)) +#loc176 = loc("new_weight"(#loc74)) +#loc177 = loc("new_m2"(#loc75)) +#loc178 = loc("delta"(#loc76)) +#loc179 = loc("new_weight"(#loc77)) +#loc180 = loc("new_mean"(#loc78)) +#loc181 = loc("new_mean"(#loc79)) +#loc182 = loc("new_m2"(#loc80)) +#loc183 = loc("new_m2"(#loc81)) +#loc184 = loc("new_m2"(#loc82)) +#loc195 = loc("delta"(#loc95)) +#loc196 = loc("new_weight"(#loc96)) +#loc197 = loc("w2_over_w"(#loc97)) +#loc198 = loc("w2_over_w"(#loc98)) +#loc199 = loc("w2_over_w"(#loc99)) +#loc200 = loc("tmp3_m2"(#loc128)) +#loc202 = loc("new_weight"(#loc176)) +#loc203 = loc("new_m2"(#loc177)) +#loc204 = loc("new_weight"(#loc179)) +#loc205 = loc("new_mean"(#loc181)) +#loc206 = loc("new_m2"(#loc184)) +#loc207 = loc("tmp3_weight"(#loc200)) diff --git a/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..fc93054c228943ac0bd8dec67fa9da45f7c48945 --- /dev/null +++ b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir @@ -0,0 +1,179 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":18:0) +#loc1 = loc(unknown) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":47:79) +#loc49 = loc("in_ptr0"(#loc)) +#loc50 = loc("in_ptr1"(#loc)) +#loc51 = loc("in_ptr2"(#loc)) +#loc52 = loc("out_ptr2"(#loc)) +#loc53 = loc("xnumel"(#loc)) +#loc54 = loc("r0_numel"(#loc)) +#loc68 = loc(callsite(#loc1 at #loc15)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4096> : tensor<1x4096xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xbf16, #blocked> loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %cst_1 = arith.constant 0.000000e+00 : f32 loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<4.096000e+03> : tensor<1x1xf32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc55) + %xmask = arith.cmpi slt, %xoffset, %c2048_i32 : i32 loc(#loc56) + %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc57) + %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4096xi32, #blocked> loc(#loc57) + %r0_mask = arith.cmpi slt, %r0_base_6, %cst : tensor<1x4096xi32, #blocked> loc(#loc58) + %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc59) + %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x4096xi32, #blocked> loc(#loc92) + %tmp0_8 = arith.addi %r0_base_6, %tmp0_7 : tensor<1x4096xi32, #blocked> loc(#loc60) + %tmp0_9 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc61) + %tmp0_10 = tt.addptr %tmp0_9, %tmp0_8 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc61) + %tmp0_11 = tt.splat %xmask : i1 -> tensor<1x4096xi1, #blocked> loc(#loc93) + %tmp0_12 = arith.andi %r0_mask, %tmp0_11 : tensor<1x4096xi1, #blocked> loc(#loc62) + %tmp0_13 = tt.load %tmp0_10, %tmp0_12, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr, #blocked> loc(#loc63) + %tmp0_14 = arith.extf %tmp0_13 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc64) + %tmp3_mean = arith.select %tmp0_12, %tmp0_14, %cst_2 : tensor<1x4096xi1, #blocked>, tensor<1x4096xf32, #blocked> loc(#loc65) + %tmp3_weight = arith.select %tmp0_12, %cst_5, %cst_2 : tensor<1x4096xi1, #blocked>, tensor<1x4096xf32, #blocked> loc(#loc66) + %0:3 = "tt.reduce"(%tmp3_mean, %cst_2, %tmp3_weight) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc15)), %arg7: f32 loc(callsite(#loc1 at #loc15)), %arg8: f32 loc(callsite(#loc1 at #loc15)), %arg9: f32 loc(callsite(#loc1 at #loc15)), %arg10: f32 loc(callsite(#loc1 at #loc15)), %arg11: f32 loc(callsite(#loc1 at #loc15))): + %delta = arith.subf %arg9, %arg6 : f32 loc(#loc94) + %new_weight = arith.addf %arg8, %arg11 : f32 loc(#loc95) + %w2_over_w = arith.cmpf oeq, %new_weight, %cst_1 : f32 loc(#loc96) + %w2_over_w_24 = arith.divf %arg11, %new_weight : f32 loc(#loc97) + %w2_over_w_25 = arith.select %w2_over_w, %cst_1, %w2_over_w_24 : f32 loc(#loc98) + %4 = arith.mulf %delta, %w2_over_w_25 : f32 loc(#loc99) + %5 = arith.addf %arg6, %4 : f32 loc(#loc100) + %6 = arith.addf %arg7, %arg10 : f32 loc(#loc101) + %7 = arith.mulf %delta, %delta : f32 loc(#loc102) + %8 = arith.mulf %7, %arg8 : f32 loc(#loc103) + %9 = arith.mulf %8, %w2_over_w_25 : f32 loc(#loc104) + %10 = arith.addf %6, %9 : f32 loc(#loc105) + tt.reduce.return %5, %10, %new_weight : f32, f32, f32 loc(#loc67) + }) : (tensor<1x4096xf32, #blocked>, tensor<1x4096xf32, #blocked>, tensor<1x4096xf32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc67) + %tmp3 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc74) + %tmp7 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc75) + %tmp9 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc76) + %tmp9_15 = tt.addptr %tmp9, %r0_base_6 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc76) + %tmp9_16 = tt.load %tmp9_15, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr, #blocked> loc(#loc77) + %tmp9_17 = arith.extf %tmp9_16 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc78) + %tmp12 = tt.load %tmp0_10, %tmp0_12, %cst_0 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr, #blocked> loc(#loc79) + %tmp12_18 = arith.extf %tmp12 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc80) + %tmp23 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc81) + %tmp23_19 = tt.addptr %tmp23, %r0_base_6 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc81) + %tmp23_20 = tt.load %tmp23_19, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr, #blocked> loc(#loc82) + %tmp23_21 = arith.extf %tmp23_20 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc83) + %tmp11 = arith.addf %tmp9_17, %cst_5 : tensor<1x4096xf32, #blocked> loc(#loc84) + %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32, #blocked> -> tensor<1x4096xf32, #blocked> loc(#loc85) + %tmp14_22 = arith.subf %tmp12_18, %tmp14 : tensor<1x4096xf32, #blocked> loc(#loc85) + %tmp16 = arith.divf %tmp7, %cst_4 : tensor<1x1xf32, #blocked> loc(#loc86) + %tmp18 = arith.addf %tmp16, %cst_3 : tensor<1x1xf32, #blocked> loc(#loc87) + %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked>) -> tensor<1x1xf32, #blocked> loc(#loc88) + %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32, #blocked> -> tensor<1x4096xf32, #blocked> loc(#loc89) + %tmp20_23 = arith.mulf %tmp14_22, %tmp20 : tensor<1x4096xf32, #blocked> loc(#loc89) + %tmp22 = arith.mulf %tmp11, %tmp20_23 : tensor<1x4096xf32, #blocked> loc(#loc90) + %tmp24 = arith.addf %tmp22, %tmp23_21 : tensor<1x4096xf32, #blocked> loc(#loc91) + %1 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc46) + %2 = tt.addptr %1, %tmp0_8 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc46) + %3 = arith.truncf %tmp24 : tensor<1x4096xf32, #blocked> to tensor<1x4096xbf16, #blocked> loc(#loc47) + tt.store %2, %3, %tmp0_12 : tensor<1x4096x!tt.ptr, #blocked> loc(#loc47) + tt.return loc(#loc48) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":25:21) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":26:37) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":34:29) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:46) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:41) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:34) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:61) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:51) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:112) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":44:62) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":46:66) +#loc14 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc16 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc17 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc18 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc19 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc21 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc23 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":48:16) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":49:16) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:34) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:41) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:94) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:52) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:114) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:35) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:42) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:95) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":61:23) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":63:24) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":65:24) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":67:24) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":68:32) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":69:24) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":71:24) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":72:24) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:29) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:53) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":51:4) +#loc55 = loc("xoffset"(#loc2)) +#loc56 = loc("xmask"(#loc3)) +#loc57 = loc("r0_base"(#loc4)) +#loc58 = loc("r0_mask"(#loc5)) +#loc59 = loc("tmp0"(#loc6)) +#loc60 = loc("tmp0"(#loc7)) +#loc61 = loc("tmp0"(#loc8)) +#loc62 = loc("tmp0"(#loc9)) +#loc63 = loc("tmp0"(#loc10)) +#loc64 = loc("tmp0"(#loc11)) +#loc65 = loc("tmp3_mean"(#loc12)) +#loc66 = loc("tmp3_weight"(#loc13)) +#loc67 = loc(callsite(#loc14 at #loc15)) +#loc69 = loc("delta"(#loc16)) +#loc70 = loc("new_weight"(#loc17)) +#loc71 = loc("w2_over_w"(#loc18)) +#loc72 = loc("w2_over_w"(#loc19)) +#loc73 = loc("w2_over_w"(#loc20)) +#loc74 = loc("tmp3"(#loc28)) +#loc75 = loc("tmp7"(#loc29)) +#loc76 = loc("tmp9"(#loc30)) +#loc77 = loc("tmp9"(#loc31)) +#loc78 = loc("tmp9"(#loc32)) +#loc79 = loc("tmp12"(#loc33)) +#loc80 = loc("tmp12"(#loc34)) +#loc81 = loc("tmp23"(#loc35)) +#loc82 = loc("tmp23"(#loc36)) +#loc83 = loc("tmp23"(#loc37)) +#loc84 = loc("tmp11"(#loc38)) +#loc85 = loc("tmp14"(#loc39)) +#loc86 = loc("tmp16"(#loc40)) +#loc87 = loc("tmp18"(#loc41)) +#loc88 = loc("tmp19"(#loc42)) +#loc89 = loc("tmp20"(#loc43)) +#loc90 = loc("tmp22"(#loc44)) +#loc91 = loc("tmp24"(#loc45)) +#loc92 = loc(fused[#loc60, #loc59]) +#loc93 = loc(fused[#loc62, #loc56]) +#loc94 = loc(callsite(#loc69 at #loc67)) +#loc95 = loc(callsite(#loc70 at #loc67)) +#loc96 = loc(callsite(#loc71 at #loc67)) +#loc97 = loc(callsite(#loc72 at #loc67)) +#loc98 = loc(callsite(#loc73 at #loc67)) +#loc99 = loc(callsite(#loc21 at #loc67)) +#loc100 = loc(callsite(#loc22 at #loc67)) +#loc101 = loc(callsite(#loc23 at #loc67)) +#loc102 = loc(callsite(#loc24 at #loc67)) +#loc103 = loc(callsite(#loc25 at #loc67)) +#loc104 = loc(callsite(#loc26 at #loc67)) +#loc105 = loc(callsite(#loc27 at #loc67)) diff --git a/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.ttir b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..116200f04887ccf75098d3097f252bdbbc2ea0e5 --- /dev/null +++ b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.ttir @@ -0,0 +1,180 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":18:0) +#loc1 = loc(unknown) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":47:79) +#loc50 = loc("in_ptr0"(#loc)) +#loc51 = loc("in_ptr1"(#loc)) +#loc52 = loc("in_ptr2"(#loc)) +#loc53 = loc("out_ptr2"(#loc)) +#loc54 = loc("xnumel"(#loc)) +#loc55 = loc("r0_numel"(#loc)) +#loc57 = loc(callsite(#loc1 at #loc3)) +module { + tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %xmask = arith.constant 2048 : i32 loc(#loc56) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc57) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x4096xbf16> loc(#loc1) + %cst_2 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc1) + %cst_3 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc1) + %cst_4 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc1) + %cst_5 = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc58) + %xmask_6 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc56) + %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32> loc(#loc59) + %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32> -> tensor<1x4096xi32> loc(#loc60) + %r0_mask = arith.cmpi slt, %r0_base_7, %cst_5 : tensor<1x4096xi32> loc(#loc61) + %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc62) + %tmp0_8 = tt.splat %tmp0 : i32 -> tensor<1x4096xi32> loc(#loc94) + %tmp0_9 = arith.addi %r0_base_7, %tmp0_8 : tensor<1x4096xi32> loc(#loc63) + %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc64) + %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc64) + %tmp0_12 = tt.splat %xmask_6 : i1 -> tensor<1x4096xi1> loc(#loc95) + %tmp0_13 = arith.andi %r0_mask, %tmp0_12 : tensor<1x4096xi1> loc(#loc65) + %tmp0_14 = tt.load %tmp0_11, %tmp0_13, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc66) + %tmp0_15 = arith.extf %tmp0_14 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc67) + %tmp3_mean = arith.select %tmp0_13, %tmp0_15, %cst_0 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc68) + %tmp3_weight = arith.select %tmp0_13, %cst_4, %cst_0 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc69) + %0:3 = "tt.reduce"(%tmp3_mean, %cst_0, %tmp3_weight) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc3)), %arg7: f32 loc(callsite(#loc1 at #loc3)), %arg8: f32 loc(callsite(#loc1 at #loc3)), %arg9: f32 loc(callsite(#loc1 at #loc3)), %arg10: f32 loc(callsite(#loc1 at #loc3)), %arg11: f32 loc(callsite(#loc1 at #loc3))): + %delta = arith.subf %arg9, %arg6 : f32 loc(#loc96) + %new_weight = arith.addf %arg8, %arg11 : f32 loc(#loc97) + %w2_over_w = arith.cmpf oeq, %new_weight, %cst : f32 loc(#loc98) + %w2_over_w_25 = arith.divf %arg11, %new_weight : f32 loc(#loc99) + %w2_over_w_26 = arith.select %w2_over_w, %cst, %w2_over_w_25 : f32 loc(#loc100) + %4 = arith.mulf %delta, %w2_over_w_26 : f32 loc(#loc101) + %5 = arith.addf %arg6, %4 : f32 loc(#loc102) + %6 = arith.addf %arg7, %arg10 : f32 loc(#loc103) + %7 = arith.mulf %delta, %delta : f32 loc(#loc104) + %8 = arith.mulf %7, %arg8 : f32 loc(#loc105) + %9 = arith.mulf %8, %w2_over_w_26 : f32 loc(#loc106) + %10 = arith.addf %6, %9 : f32 loc(#loc107) + tt.reduce.return %5, %10, %new_weight : f32, f32, f32 loc(#loc70) + }) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc70) + %tmp3 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc76) + %tmp7 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc77) + %tmp9 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc78) + %tmp9_16 = tt.addptr %tmp9, %r0_base_7 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc78) + %tmp9_17 = tt.load %tmp9_16, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc79) + %tmp9_18 = arith.extf %tmp9_17 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc80) + %tmp12 = tt.load %tmp0_11, %tmp0_13, %cst_1 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr> loc(#loc81) + %tmp12_19 = arith.extf %tmp12 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc82) + %tmp23 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc83) + %tmp23_20 = tt.addptr %tmp23, %r0_base_7 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc83) + %tmp23_21 = tt.load %tmp23_20, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc84) + %tmp23_22 = arith.extf %tmp23_21 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc85) + %tmp11 = arith.addf %tmp9_18, %cst_4 : tensor<1x4096xf32> loc(#loc86) + %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc87) + %tmp14_23 = arith.subf %tmp12_19, %tmp14 : tensor<1x4096xf32> loc(#loc87) + %tmp16 = arith.divf %tmp7, %cst_3 : tensor<1x1xf32> loc(#loc88) + %tmp18 = arith.addf %tmp16, %cst_2 : tensor<1x1xf32> loc(#loc89) + %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc90) + %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc91) + %tmp20_24 = arith.mulf %tmp14_23, %tmp20 : tensor<1x4096xf32> loc(#loc91) + %tmp22 = arith.mulf %tmp11, %tmp20_24 : tensor<1x4096xf32> loc(#loc92) + %tmp24 = arith.addf %tmp22, %tmp23_22 : tensor<1x4096xf32> loc(#loc93) + %1 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc47) + %2 = tt.addptr %1, %tmp0_9 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc47) + %3 = arith.truncf %tmp24 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc48) + tt.store %2, %3, %tmp0_13 : tensor<1x4096x!tt.ptr> loc(#loc48) + tt.return loc(#loc49) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":25:21) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":23:28) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":26:27) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":34:29) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:41) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:34) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:61) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:51) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:112) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":44:62) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":46:66) +#loc16 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc17 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc18 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc19 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc21 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc23 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc28 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":48:16) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":49:16) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:34) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:41) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:94) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:52) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:114) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:35) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:42) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:95) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":61:23) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":63:24) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":65:24) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":67:24) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":68:32) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":69:24) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":71:24) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":72:24) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:29) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:53) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":51:4) +#loc56 = loc("xmask"(#loc2)) +#loc58 = loc("xoffset"(#loc4)) +#loc59 = loc("r0_base"(#loc5)) +#loc60 = loc("r0_base"(#loc6)) +#loc61 = loc("r0_mask"(#loc7)) +#loc62 = loc("tmp0"(#loc8)) +#loc63 = loc("tmp0"(#loc9)) +#loc64 = loc("tmp0"(#loc10)) +#loc65 = loc("tmp0"(#loc11)) +#loc66 = loc("tmp0"(#loc12)) +#loc67 = loc("tmp0"(#loc13)) +#loc68 = loc("tmp3_mean"(#loc14)) +#loc69 = loc("tmp3_weight"(#loc15)) +#loc70 = loc(callsite(#loc16 at #loc3)) +#loc71 = loc("delta"(#loc17)) +#loc72 = loc("new_weight"(#loc18)) +#loc73 = loc("w2_over_w"(#loc19)) +#loc74 = loc("w2_over_w"(#loc20)) +#loc75 = loc("w2_over_w"(#loc21)) +#loc76 = loc("tmp3"(#loc29)) +#loc77 = loc("tmp7"(#loc30)) +#loc78 = loc("tmp9"(#loc31)) +#loc79 = loc("tmp9"(#loc32)) +#loc80 = loc("tmp9"(#loc33)) +#loc81 = loc("tmp12"(#loc34)) +#loc82 = loc("tmp12"(#loc35)) +#loc83 = loc("tmp23"(#loc36)) +#loc84 = loc("tmp23"(#loc37)) +#loc85 = loc("tmp23"(#loc38)) +#loc86 = loc("tmp11"(#loc39)) +#loc87 = loc("tmp14"(#loc40)) +#loc88 = loc("tmp16"(#loc41)) +#loc89 = loc("tmp18"(#loc42)) +#loc90 = loc("tmp19"(#loc43)) +#loc91 = loc("tmp20"(#loc44)) +#loc92 = loc("tmp22"(#loc45)) +#loc93 = loc("tmp24"(#loc46)) +#loc94 = loc(fused[#loc63, #loc62]) +#loc95 = loc(fused[#loc65, #loc56]) +#loc96 = loc(callsite(#loc71 at #loc70)) +#loc97 = loc(callsite(#loc72 at #loc70)) +#loc98 = loc(callsite(#loc73 at #loc70)) +#loc99 = loc(callsite(#loc74 at #loc70)) +#loc100 = loc(callsite(#loc75 at #loc70)) +#loc101 = loc(callsite(#loc22 at #loc70)) +#loc102 = loc(callsite(#loc23 at #loc70)) +#loc103 = loc(callsite(#loc24 at #loc70)) +#loc104 = loc(callsite(#loc25 at #loc70)) +#loc105 = loc(callsite(#loc26 at #loc70)) +#loc106 = loc(callsite(#loc27 at #loc70)) +#loc107 = loc(callsite(#loc28 at #loc70)) diff --git a/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/__grp__triton_poi_fused_add_mul_0.json b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/__grp__triton_poi_fused_add_mul_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a8335465cb2ce94c109ae0e4260cd48a229a95bb --- /dev/null +++ b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/__grp__triton_poi_fused_add_mul_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_add_mul_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.source", "triton_poi_fused_add_mul_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.ttir", "triton_poi_fused_add_mul_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.ttgir", "triton_poi_fused_add_mul_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.llir", "triton_poi_fused_add_mul_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.ptx", "triton_poi_fused_add_mul_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.cubin", "triton_poi_fused_add_mul_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.json"}} \ No newline at end of file diff --git a/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.cubin b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..8f551c4582f5275c698f20ca806539089233795d Binary files /dev/null and b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.cubin differ diff --git a/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.json b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.json new file mode 100644 index 0000000000000000000000000000000000000000..39dc7b1046387eaab01748aafa5a3a7c2a523067 --- /dev/null +++ b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.json @@ -0,0 +1 @@ +{"hash": "6b57635de194e2a76433f0e27ed2e1fb661a4b5321d0763577e3e7d1de685256", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_add_mul_0"} \ No newline at end of file diff --git a/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.llir b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..060110a98452b589ca594dd7166488cd264d2461 --- /dev/null +++ b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.llir @@ -0,0 +1,76 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_add_mul_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = shl i32 %8, 9, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = shl nuw nsw i32 %10, 1, !dbg !9 + %12 = and i32 %11, 510, !dbg !9 + %13 = or disjoint i32 %12, %9, !dbg !10 + %14 = srem i32 %13, 4096, !dbg !11 + %15 = sext i32 %13 to i64, !dbg !12 + %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !12 + %17 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %16) #2, !dbg !13 + %18 = bitcast i32 %17 to <2 x bfloat>, !dbg !13 + %19 = sext i32 %14 to i64, !dbg !14 + %20 = getelementptr bfloat, ptr addrspace(1) %1, i64 %19, !dbg !14 + %21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !15 + %22 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l"(ptr addrspace(1) %20, i64 %21) #2, !dbg !15 + %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !15 + %24 = getelementptr bfloat, ptr addrspace(1) %2, i64 %15, !dbg !16 + %25 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %24) #2, !dbg !17 + %26 = bitcast i32 %25 to <2 x bfloat>, !dbg !17 + %27 = getelementptr bfloat, ptr addrspace(1) %3, i64 %15, !dbg !18 + %28 = fpext <2 x bfloat> %18 to <2 x float>, !dbg !19 + %29 = fpext <2 x bfloat> %23 to <2 x float>, !dbg !20 + %30 = fpext <2 x bfloat> %26 to <2 x float>, !dbg !21 + %31 = fmul <2 x float> %29, %30, !dbg !22 + %32 = fadd <2 x float> %31, %28, !dbg !23 + %33 = fptrunc <2 x float> %32 to <2 x bfloat>, !dbg !24 + %34 = bitcast <2 x bfloat> %33 to i32, !dbg !24 + tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %34, ptr addrspace(1) %27) #2, !dbg !24 + ret void, !dbg !25 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_add_mul_0", linkageName: "triton_poi_fused_add_mul_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 19, scope: !4) +!12 = !DILocation(line: 25, column: 30, scope: !4) +!13 = !DILocation(line: 25, column: 35, scope: !4) +!14 = !DILocation(line: 26, column: 30, scope: !4) +!15 = !DILocation(line: 26, column: 35, scope: !4) +!16 = !DILocation(line: 27, column: 30, scope: !4) +!17 = !DILocation(line: 27, column: 35, scope: !4) +!18 = !DILocation(line: 30, column: 25, scope: !4) +!19 = !DILocation(line: 25, column: 44, scope: !4) +!20 = !DILocation(line: 26, column: 74, scope: !4) +!21 = !DILocation(line: 27, column: 44, scope: !4) +!22 = !DILocation(line: 28, column: 18, scope: !4) +!23 = !DILocation(line: 29, column: 18, scope: !4) +!24 = !DILocation(line: 30, column: 36, scope: !4) +!25 = !DILocation(line: 30, column: 4, scope: !4) diff --git a/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.ptx b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..df4549b1249f2be8c811689b87560410aeaa6c14 --- /dev/null +++ b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.ptx @@ -0,0 +1,347 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused_add_mul_0 // -- Begin function triton_poi_fused_add_mul_0 + // @triton_poi_fused_add_mul_0 +.visible .entry triton_poi_fused_add_mul_0( + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_3, + .param .u32 triton_poi_fused_add_mul_0_param_4, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_5, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_6 +) +.reqntid 256 +{ + .reg .b16 %rs<7>; + .reg .b32 %r<24>; + .reg .b64 %rd<11>; + .loc 1 18 0 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:18:0 + +// %bb.0: + ld.param.b64 %rd6, [triton_poi_fused_add_mul_0_param_0]; + ld.param.b64 %rd7, [triton_poi_fused_add_mul_0_param_1]; +$L__tmp0: + .loc 1 20 28 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:20:28 + mov.u32 %r5, %ctaid.x; + .loc 1 20 33 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:20:33 + shl.b32 %r6, %r5, 9; + ld.param.b64 %rd8, [triton_poi_fused_add_mul_0_param_2]; + ld.param.b64 %rd9, [triton_poi_fused_add_mul_0_param_3]; + .loc 1 21 36 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:21:36 + mov.u32 %r7, %tid.x; + shl.b32 %r8, %r7, 1; + and.b32 %r9, %r8, 510; + .loc 1 21 23 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:21:23 + or.b32 %r10, %r9, %r6; + .loc 1 24 19 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:24:19 + bfe.s32 %r11, %r5, 22, 1; + shr.u32 %r12, %r11, 20; + add.s32 %r13, %r10, %r12; + and.b32 %r14, %r13, -4096; + sub.s32 %r15, %r10, %r14; + .loc 1 25 30 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:25:30 + mul.wide.s32 %rd10, %r10, 2; + add.s64 %rd1, %rd6, %rd10; + .loc 1 25 35 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:25:35 + // begin inline asm + mov.u32 %r1, 0x0; + ld.global.b32 { %r1 }, [ %rd1 + 0 ]; + // end inline asm + .loc 1 26 30 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:26:30 + mad.wide.s32 %rd2, %r15, 2, %rd7; + .loc 1 26 35 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:26:35 + // begin inline asm + mov.u64 %rd3, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd3, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r2, 0x0; + ld.global.L1::evict_last.L2::cache_hint.b32 { %r2 }, [ %rd2 + 0 ], %rd3; + // end inline asm + .loc 1 27 30 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:27:30 + add.s64 %rd4, %rd8, %rd10; + .loc 1 27 35 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:27:35 + // begin inline asm + mov.u32 %r3, 0x0; + ld.global.b32 { %r3 }, [ %rd4 + 0 ]; + // end inline asm + .loc 1 30 25 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:30:25 + add.s64 %rd5, %rd9, %rd10; + .loc 1 25 44 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:25:44 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r16, %rs2; + cvt.f32.bf16 %r17, %rs1; + .loc 1 26 74 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:26:74 + mov.b32 {%rs3, %rs4}, %r2; + cvt.f32.bf16 %r18, %rs4; + cvt.f32.bf16 %r19, %rs3; + .loc 1 27 44 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:27:44 + mov.b32 {%rs5, %rs6}, %r3; + cvt.f32.bf16 %r20, %rs6; + cvt.f32.bf16 %r21, %rs5; + .loc 1 29 18 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:29:18 + fma.rn.f32 %r22, %r19, %r21, %r17; + fma.rn.f32 %r23, %r18, %r20, %r16; + .loc 1 30 36 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:30:36 + cvt.rn.bf16x2.f32 %r4, %r23, %r22; + // begin inline asm + st.global.b32 [ %rd5 + 0 ], { %r4 }; + // end inline asm + .loc 1 30 4 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:30:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 54 +.b8 107 +.b8 97 +.b8 116 +.b8 53 +.b8 103 +.b8 55 +.b8 110 +.b8 51 +.b8 117 +.b8 117 +.b8 107 +.b8 107 +.b8 102 +.b8 119 +.b8 103 +.b8 100 +.b8 120 +.b8 102 +.b8 119 +.b8 116 +.b8 109 +.b8 120 +.b8 98 +.b8 108 +.b8 99 +.b8 109 +.b8 113 +.b8 122 +.b8 104 +.b8 98 +.b8 105 +.b8 102 +.b8 111 +.b8 53 +.b8 103 +.b8 51 +.b8 114 +.b8 98 +.b8 97 +.b8 122 +.b8 51 +.b8 100 +.b8 106 +.b8 120 +.b8 105 +.b8 105 +.b8 51 +.b8 53 +.b8 103 +.b8 105 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 54 +.b8 107 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.source b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.source new file mode 100644 index 0000000000000000000000000000000000000000..83ec1aaf83aa222188a01634f22622994cf15787 --- /dev/null +++ b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.source @@ -0,0 +1,82 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":18:0) +#loc22 = loc("in_ptr0"(#loc)) +#loc23 = loc("in_ptr1"(#loc)) +#loc24 = loc("in_ptr2"(#loc)) +#loc25 = loc("out_ptr0"(#loc)) +#loc26 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_mul_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 1048576 : i32 loc(#loc27) + %xoffset = tt.get_program_id x : i32 loc(#loc28) + %xoffset_1 = arith.constant 512 : i32 loc(#loc29) + %xoffset_2 = arith.constant 512 : i32 loc(#loc29) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc29) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc30) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc31) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc31) + %xmask = arith.constant true loc(#loc32) + %xmask_6 = arith.constant dense : tensor<512xi1> loc(#loc32) + %x0 = arith.constant 4096 : i32 loc(#loc33) + %x0_7 = arith.constant 4096 : i32 loc(#loc33) + %x0_8 = arith.constant dense<4096> : tensor<512xi32> loc(#loc33) + %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<512xi32> loc(#loc33) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc34) + %tmp0_10 = tt.addptr %tmp0, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc34) + %tmp0_11 = tt.load %tmp0_10 : tensor<512x!tt.ptr> loc(#loc35) + %tmp0_12 = arith.extf %tmp0_11 : tensor<512xbf16> to tensor<512xf32> loc(#loc36) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc37) + %tmp1_13 = tt.addptr %tmp1, %x0_9 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc37) + %tmp1_14 = tt.load %tmp1_13 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc38) + %tmp1_15 = arith.extf %tmp1_14 : tensor<512xbf16> to tensor<512xf32> loc(#loc39) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc40) + %tmp2_16 = tt.addptr %tmp2, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc40) + %tmp2_17 = tt.load %tmp2_16 : tensor<512x!tt.ptr> loc(#loc41) + %tmp2_18 = arith.extf %tmp2_17 : tensor<512xbf16> to tensor<512xf32> loc(#loc42) + %tmp3 = arith.mulf %tmp1_15, %tmp2_18 : tensor<512xf32> loc(#loc43) + %tmp4 = arith.addf %tmp0_12, %tmp3 : tensor<512xf32> loc(#loc44) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc19) + %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc19) + %2 = arith.truncf %tmp4 : tensor<512xf32> to tensor<512xbf16> loc(#loc20) + tt.store %1, %2 : tensor<512x!tt.ptr> loc(#loc20) + tt.return loc(#loc21) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":22:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":24:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:30) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:35) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:44) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:30) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:35) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:74) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:35) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:44) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":28:18) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":29:18) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:25) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:36) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:4) +#loc27 = loc("xnumel"(#loc1)) +#loc28 = loc("xoffset"(#loc2)) +#loc29 = loc("xoffset"(#loc3)) +#loc30 = loc("xindex"(#loc4)) +#loc31 = loc("xindex"(#loc5)) +#loc32 = loc("xmask"(#loc6)) +#loc33 = loc("x0"(#loc7)) +#loc34 = loc("tmp0"(#loc8)) +#loc35 = loc("tmp0"(#loc9)) +#loc36 = loc("tmp0"(#loc10)) +#loc37 = loc("tmp1"(#loc11)) +#loc38 = loc("tmp1"(#loc12)) +#loc39 = loc("tmp1"(#loc13)) +#loc40 = loc("tmp2"(#loc14)) +#loc41 = loc("tmp2"(#loc15)) +#loc42 = loc("tmp2"(#loc16)) +#loc43 = loc("tmp3"(#loc17)) +#loc44 = loc("tmp4"(#loc18)) diff --git a/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.ttgir b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..7c77cf8c43c9fbdf5d6c39b01549c5cde98b7982 --- /dev/null +++ b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.ttgir @@ -0,0 +1,74 @@ +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":18:0) +#loc21 = loc("in_ptr0"(#loc)) +#loc22 = loc("in_ptr1"(#loc)) +#loc23 = loc("in_ptr2"(#loc)) +#loc24 = loc("out_ptr0"(#loc)) +#loc25 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_add_mul_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4096> : tensor<512xi32, #blocked> loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc26) + %xoffset_0 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc27) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc28) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<512xi32, #blocked> loc(#loc29) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<512xi32, #blocked> loc(#loc29) + %x0 = arith.remsi %xindex_2, %cst : tensor<512xi32, #blocked> loc(#loc30) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc31) + %tmp0_3 = tt.addptr %tmp0, %xindex_2 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc31) + %tmp0_4 = tt.load %tmp0_3 : tensor<512x!tt.ptr, #blocked> loc(#loc32) + %tmp0_5 = arith.extf %tmp0_4 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc33) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc34) + %tmp1_6 = tt.addptr %tmp1, %x0 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc34) + %tmp1_7 = tt.load %tmp1_6 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc35) + %tmp1_8 = arith.extf %tmp1_7 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc36) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc37) + %tmp2_9 = tt.addptr %tmp2, %xindex_2 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc37) + %tmp2_10 = tt.load %tmp2_9 : tensor<512x!tt.ptr, #blocked> loc(#loc38) + %tmp2_11 = arith.extf %tmp2_10 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc39) + %tmp3 = arith.mulf %tmp1_8, %tmp2_11 : tensor<512xf32, #blocked> loc(#loc40) + %tmp4 = arith.addf %tmp0_5, %tmp3 : tensor<512xf32, #blocked> loc(#loc41) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc18) + %1 = tt.addptr %0, %xindex_2 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc18) + %2 = arith.truncf %tmp4 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> loc(#loc19) + tt.store %1, %2 : tensor<512x!tt.ptr, #blocked> loc(#loc19) + tt.return loc(#loc20) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":24:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:30) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:35) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:44) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:30) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:74) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:30) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:35) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:44) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":28:18) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":29:18) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:25) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:36) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:4) +#loc26 = loc("xoffset"(#loc2)) +#loc27 = loc("xoffset"(#loc3)) +#loc28 = loc("xindex"(#loc4)) +#loc29 = loc("xindex"(#loc5)) +#loc30 = loc("x0"(#loc6)) +#loc31 = loc("tmp0"(#loc7)) +#loc32 = loc("tmp0"(#loc8)) +#loc33 = loc("tmp0"(#loc9)) +#loc34 = loc("tmp1"(#loc10)) +#loc35 = loc("tmp1"(#loc11)) +#loc36 = loc("tmp1"(#loc12)) +#loc37 = loc("tmp2"(#loc13)) +#loc38 = loc("tmp2"(#loc14)) +#loc39 = loc("tmp2"(#loc15)) +#loc40 = loc("tmp3"(#loc16)) +#loc41 = loc("tmp4"(#loc17)) diff --git a/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.ttir b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..b13493bfbfcfabd04700d2852f8f814510f685f1 --- /dev/null +++ b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.ttir @@ -0,0 +1,73 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":18:0) +#loc21 = loc("in_ptr0"(#loc)) +#loc22 = loc("in_ptr1"(#loc)) +#loc23 = loc("in_ptr2"(#loc)) +#loc24 = loc("out_ptr0"(#loc)) +#loc25 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_mul_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %x0 = arith.constant dense<4096> : tensor<512xi32> loc(#loc26) + %c512_i32 = arith.constant 512 : i32 loc(#loc2) + %xoffset = tt.get_program_id x : i32 loc(#loc27) + %xoffset_0 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc28) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc29) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<512xi32> loc(#loc30) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<512xi32> loc(#loc30) + %x0_3 = arith.remsi %xindex_2, %x0 : tensor<512xi32> loc(#loc26) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc31) + %tmp0_4 = tt.addptr %tmp0, %xindex_2 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc31) + %tmp0_5 = tt.load %tmp0_4 : tensor<512x!tt.ptr> loc(#loc32) + %tmp0_6 = arith.extf %tmp0_5 : tensor<512xbf16> to tensor<512xf32> loc(#loc33) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc34) + %tmp1_7 = tt.addptr %tmp1, %x0_3 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc34) + %tmp1_8 = tt.load %tmp1_7 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc35) + %tmp1_9 = arith.extf %tmp1_8 : tensor<512xbf16> to tensor<512xf32> loc(#loc36) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc37) + %tmp2_10 = tt.addptr %tmp2, %xindex_2 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc37) + %tmp2_11 = tt.load %tmp2_10 : tensor<512x!tt.ptr> loc(#loc38) + %tmp2_12 = arith.extf %tmp2_11 : tensor<512xbf16> to tensor<512xf32> loc(#loc39) + %tmp3 = arith.mulf %tmp1_9, %tmp2_12 : tensor<512xf32> loc(#loc40) + %tmp4 = arith.addf %tmp0_6, %tmp3 : tensor<512xf32> loc(#loc41) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc18) + %1 = tt.addptr %0, %xindex_2 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc18) + %2 = arith.truncf %tmp4 : tensor<512xf32> to tensor<512xbf16> loc(#loc19) + tt.store %1, %2 : tensor<512x!tt.ptr> loc(#loc19) + tt.return loc(#loc20) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":24:19) +#loc2 = loc(unknown) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":20:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":20:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":21:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":21:23) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:30) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:35) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:44) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:30) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:74) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:30) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:35) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:44) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":28:18) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":29:18) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:25) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:36) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:4) +#loc26 = loc("x0"(#loc1)) +#loc27 = loc("xoffset"(#loc3)) +#loc28 = loc("xoffset"(#loc4)) +#loc29 = loc("xindex"(#loc5)) +#loc30 = loc("xindex"(#loc6)) +#loc31 = loc("tmp0"(#loc7)) +#loc32 = loc("tmp0"(#loc8)) +#loc33 = loc("tmp0"(#loc9)) +#loc34 = loc("tmp1"(#loc10)) +#loc35 = loc("tmp1"(#loc11)) +#loc36 = loc("tmp1"(#loc12)) +#loc37 = loc("tmp2"(#loc13)) +#loc38 = loc("tmp2"(#loc14)) +#loc39 = loc("tmp2"(#loc15)) +#loc40 = loc("tmp3"(#loc16)) +#loc41 = loc("tmp4"(#loc17)) diff --git a/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f9dc9eacbb50731836276a9b4751eb3a04d9dc03 --- /dev/null +++ b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_add_mul_native_layer_norm_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.source", "triton_red_fused_add_mul_native_layer_norm_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.ttir", "triton_red_fused_add_mul_native_layer_norm_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir", "triton_red_fused_add_mul_native_layer_norm_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.llir", "triton_red_fused_add_mul_native_layer_norm_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.ptx", "triton_red_fused_add_mul_native_layer_norm_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.cubin", "triton_red_fused_add_mul_native_layer_norm_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.json"}} \ No newline at end of file diff --git a/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.cubin b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..305f1ecb16764c64362fcd51b4c346bfda222f67 Binary files /dev/null and b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.cubin differ diff --git a/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.json b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8830bb8f9a0cab2edc0084648e22db1d9b3c86a4 --- /dev/null +++ b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.json @@ -0,0 +1 @@ +{"hash": "73d0b8e91c14724517b217e70af3fd3304d1dbbfda2257ca527f1a36f528beab", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 192, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_add_mul_native_layer_norm_0"} \ No newline at end of file diff --git a/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.llir b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..d6d36e19ad7396b6ae25fd22889607dd8ddb9254 --- /dev/null +++ b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.llir @@ -0,0 +1,601 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_add_mul_native_layer_norm_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 { +__nv_rsqrtf.exit: + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %12 = icmp samesign ult i32 %11, 256, !dbg !9 + %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %14 = and i32 %13, 511, !dbg !10 + %15 = and i32 %13, 31, !dbg !10 + %16 = lshr i32 %14, 5, !dbg !10 + %17 = shl nuw nsw i32 %13, 3, !dbg !10 + %18 = and i32 %17, 4088, !dbg !10 + %19 = shl i32 %11, 12, !dbg !11 + %20 = or disjoint i32 %18, %19, !dbg !12 + %21 = sext i32 %20 to i64, !dbg !13 + %22 = getelementptr bfloat, ptr addrspace(1) %0, i64 %21, !dbg !13 + %23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %24 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %22, i64 %23, i1 %12) #6, !dbg !14 + %25 = extractvalue { i32, i32, i32, i32 } %24, 0, !dbg !14 + %26 = bitcast i32 %25 to <2 x bfloat>, !dbg !14 + %27 = extractvalue { i32, i32, i32, i32 } %24, 1, !dbg !14 + %28 = bitcast i32 %27 to <2 x bfloat>, !dbg !14 + %29 = extractvalue { i32, i32, i32, i32 } %24, 2, !dbg !14 + %30 = bitcast i32 %29 to <2 x bfloat>, !dbg !14 + %31 = extractvalue { i32, i32, i32, i32 } %24, 3, !dbg !14 + %32 = bitcast i32 %31 to <2 x bfloat>, !dbg !14 + %33 = zext nneg i32 %18 to i64, !dbg !15 + %34 = getelementptr bfloat, ptr addrspace(1) %1, i64 %33, !dbg !15 + %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !16 + %36 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %34, i64 %35, i1 true) #6, !dbg !16 + %37 = extractvalue { i32, i32, i32, i32 } %36, 0, !dbg !16 + %38 = bitcast i32 %37 to <2 x bfloat>, !dbg !16 + %39 = extractvalue { i32, i32, i32, i32 } %36, 1, !dbg !16 + %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !16 + %41 = extractvalue { i32, i32, i32, i32 } %36, 2, !dbg !16 + %42 = bitcast i32 %41 to <2 x bfloat>, !dbg !16 + %43 = extractvalue { i32, i32, i32, i32 } %36, 3, !dbg !16 + %44 = bitcast i32 %43 to <2 x bfloat>, !dbg !16 + %45 = getelementptr bfloat, ptr addrspace(1) %2, i64 %21, !dbg !17 + %46 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !18 + %47 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %45, i64 %46, i1 %12) #6, !dbg !18 + %48 = extractvalue { i32, i32, i32, i32 } %47, 0, !dbg !18 + %49 = bitcast i32 %48 to <2 x bfloat>, !dbg !18 + %50 = extractvalue { i32, i32, i32, i32 } %47, 1, !dbg !18 + %51 = bitcast i32 %50 to <2 x bfloat>, !dbg !18 + %52 = extractvalue { i32, i32, i32, i32 } %47, 2, !dbg !18 + %53 = bitcast i32 %52 to <2 x bfloat>, !dbg !18 + %54 = extractvalue { i32, i32, i32, i32 } %47, 3, !dbg !18 + %55 = bitcast i32 %54 to <2 x bfloat>, !dbg !18 + %56 = select i1 %12, float 1.000000e+00, float 0.000000e+00, !dbg !19 + %57 = getelementptr bfloat, ptr addrspace(1) %5, i64 %21, !dbg !20 + %58 = fpext <2 x bfloat> %26 to <2 x float>, !dbg !21 + %59 = fpext <2 x bfloat> %38 to <2 x float>, !dbg !22 + %60 = fpext <2 x bfloat> %49 to <2 x float>, !dbg !23 + %61 = fmul <2 x float> %59, %60, !dbg !24 + %62 = fadd <2 x float> %61, %58, !dbg !25 + %63 = extractelement <2 x float> %62, i64 0, !dbg !26 + %64 = select i1 %12, float %63, float 0.000000e+00, !dbg !26 + %65 = extractelement <2 x float> %62, i64 1, !dbg !26 + %66 = select i1 %12, float %65, float 0.000000e+00, !dbg !26 + %67 = fptrunc <2 x float> %62 to <2 x bfloat>, !dbg !27 + %68 = fpext <2 x bfloat> %28 to <2 x float>, !dbg !21 + %69 = fpext <2 x bfloat> %40 to <2 x float>, !dbg !22 + %70 = fpext <2 x bfloat> %51 to <2 x float>, !dbg !23 + %71 = fmul <2 x float> %69, %70, !dbg !24 + %72 = fadd <2 x float> %71, %68, !dbg !25 + %73 = extractelement <2 x float> %72, i64 0, !dbg !26 + %74 = select i1 %12, float %73, float 0.000000e+00, !dbg !26 + %75 = extractelement <2 x float> %72, i64 1, !dbg !26 + %76 = select i1 %12, float %75, float 0.000000e+00, !dbg !26 + %77 = fptrunc <2 x float> %72 to <2 x bfloat>, !dbg !27 + %78 = fpext <2 x bfloat> %30 to <2 x float>, !dbg !21 + %79 = fpext <2 x bfloat> %42 to <2 x float>, !dbg !22 + %80 = fpext <2 x bfloat> %53 to <2 x float>, !dbg !23 + %81 = fmul <2 x float> %79, %80, !dbg !24 + %82 = fadd <2 x float> %81, %78, !dbg !25 + %83 = extractelement <2 x float> %82, i64 0, !dbg !26 + %84 = select i1 %12, float %83, float 0.000000e+00, !dbg !26 + %85 = extractelement <2 x float> %82, i64 1, !dbg !26 + %86 = select i1 %12, float %85, float 0.000000e+00, !dbg !26 + %87 = fptrunc <2 x float> %82 to <2 x bfloat>, !dbg !27 + %88 = fpext <2 x bfloat> %32 to <2 x float>, !dbg !21 + %89 = fpext <2 x bfloat> %44 to <2 x float>, !dbg !22 + %90 = fpext <2 x bfloat> %55 to <2 x float>, !dbg !23 + %91 = fmul <2 x float> %89, %90, !dbg !24 + %92 = fadd <2 x float> %91, %88, !dbg !25 + %93 = extractelement <2 x float> %92, i64 0, !dbg !26 + %94 = select i1 %12, float %93, float 0.000000e+00, !dbg !26 + %95 = extractelement <2 x float> %92, i64 1, !dbg !26 + %96 = select i1 %12, float %95, float 0.000000e+00, !dbg !26 + %97 = fptrunc <2 x float> %92 to <2 x bfloat>, !dbg !27 + %98 = bitcast <2 x bfloat> %67 to i32, !dbg !27 + %99 = bitcast <2 x bfloat> %77 to i32, !dbg !27 + %100 = bitcast <2 x bfloat> %87 to i32, !dbg !27 + %101 = bitcast <2 x bfloat> %97 to i32, !dbg !27 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %98, i32 %99, i32 %100, i32 %101, ptr addrspace(1) %57, i1 %12) #6, !dbg !27 + %102 = fsub float %66, %64, !dbg !28 + %103 = select i1 %12, float 2.000000e+00, float 0.000000e+00, !dbg !34 + %104 = fcmp oeq float %103, 0.000000e+00, !dbg !35 + %105 = tail call float @llvm.nvvm.div.full(float %56, float %103), !dbg !36 + %106 = select i1 %104, float 0.000000e+00, float %105, !dbg !37 + %107 = fmul float %106, %102, !dbg !38 + %108 = fadd float %64, %107, !dbg !39 + %109 = fmul float %102, %102, !dbg !40 + %110 = fmul float %56, %109, !dbg !41 + %111 = fmul float %106, %110, !dbg !42 + %112 = fadd float %111, 0.000000e+00, !dbg !43 + %113 = fsub float %74, %108, !dbg !28 + %114 = select i1 %12, float 3.000000e+00, float 0.000000e+00, !dbg !34 + %115 = fcmp oeq float %114, 0.000000e+00, !dbg !35 + %116 = tail call float @llvm.nvvm.div.full(float %56, float %114), !dbg !36 + %117 = select i1 %115, float 0.000000e+00, float %116, !dbg !37 + %118 = fmul float %117, %113, !dbg !38 + %119 = fadd float %108, %118, !dbg !39 + %120 = fmul float %113, %113, !dbg !40 + %121 = fmul float %103, %120, !dbg !41 + %122 = fmul float %117, %121, !dbg !42 + %123 = fadd float %112, %122, !dbg !43 + %124 = fsub float %76, %119, !dbg !28 + %125 = select i1 %12, float 4.000000e+00, float 0.000000e+00, !dbg !34 + %126 = fcmp oeq float %125, 0.000000e+00, !dbg !35 + %127 = tail call float @llvm.nvvm.div.full(float %56, float %125), !dbg !36 + %128 = select i1 %126, float 0.000000e+00, float %127, !dbg !37 + %129 = fmul float %128, %124, !dbg !38 + %130 = fadd float %119, %129, !dbg !39 + %131 = fmul float %124, %124, !dbg !40 + %132 = fmul float %114, %131, !dbg !41 + %133 = fmul float %128, %132, !dbg !42 + %134 = fadd float %123, %133, !dbg !43 + %135 = fsub float %84, %130, !dbg !28 + %136 = select i1 %12, float 5.000000e+00, float 0.000000e+00, !dbg !34 + %137 = fcmp oeq float %136, 0.000000e+00, !dbg !35 + %138 = tail call float @llvm.nvvm.div.full(float %56, float %136), !dbg !36 + %139 = select i1 %137, float 0.000000e+00, float %138, !dbg !37 + %140 = fmul float %139, %135, !dbg !38 + %141 = fadd float %130, %140, !dbg !39 + %142 = fmul float %135, %135, !dbg !40 + %143 = fmul float %125, %142, !dbg !41 + %144 = fmul float %139, %143, !dbg !42 + %145 = fadd float %134, %144, !dbg !43 + %146 = fsub float %86, %141, !dbg !28 + %147 = select i1 %12, float 6.000000e+00, float 0.000000e+00, !dbg !34 + %148 = fcmp oeq float %147, 0.000000e+00, !dbg !35 + %149 = tail call float @llvm.nvvm.div.full(float %56, float %147), !dbg !36 + %150 = select i1 %148, float 0.000000e+00, float %149, !dbg !37 + %151 = fmul float %150, %146, !dbg !38 + %152 = fadd float %141, %151, !dbg !39 + %153 = fmul float %146, %146, !dbg !40 + %154 = fmul float %136, %153, !dbg !41 + %155 = fmul float %150, %154, !dbg !42 + %156 = fadd float %145, %155, !dbg !43 + %157 = fsub float %94, %152, !dbg !28 + %158 = select i1 %12, float 7.000000e+00, float 0.000000e+00, !dbg !34 + %159 = fcmp oeq float %158, 0.000000e+00, !dbg !35 + %160 = tail call float @llvm.nvvm.div.full(float %56, float %158), !dbg !36 + %161 = select i1 %159, float 0.000000e+00, float %160, !dbg !37 + %162 = fmul float %161, %157, !dbg !38 + %163 = fadd float %152, %162, !dbg !39 + %164 = fmul float %157, %157, !dbg !40 + %165 = fmul float %147, %164, !dbg !41 + %166 = fmul float %161, %165, !dbg !42 + %167 = fadd float %156, %166, !dbg !43 + %168 = fsub float %96, %163, !dbg !28 + %169 = select i1 %12, float 8.000000e+00, float 0.000000e+00, !dbg !34 + %170 = fcmp oeq float %169, 0.000000e+00, !dbg !35 + %171 = tail call float @llvm.nvvm.div.full(float %56, float %169), !dbg !36 + %172 = select i1 %170, float 0.000000e+00, float %171, !dbg !37 + %173 = fmul float %172, %168, !dbg !38 + %174 = fadd float %163, %173, !dbg !39 + %175 = fmul float %168, %168, !dbg !40 + %176 = fmul float %158, %175, !dbg !41 + %177 = fmul float %172, %176, !dbg !42 + %178 = fadd float %167, %177, !dbg !43 + %179 = bitcast float %174 to i32, !dbg !31 + %180 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %179, i32 16, i32 31), !dbg !31 + %181 = bitcast i32 %180 to float, !dbg !31 + %182 = bitcast float %178 to i32, !dbg !31 + %183 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %182, i32 16, i32 31), !dbg !31 + %184 = bitcast i32 %183 to float, !dbg !31 + %185 = bitcast float %169 to i32, !dbg !31 + %186 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %185, i32 16, i32 31), !dbg !31 + %187 = bitcast i32 %186 to float, !dbg !31 + %188 = fsub float %181, %174, !dbg !28 + %189 = fadd float %169, %187, !dbg !34 + %190 = fcmp oeq float %189, 0.000000e+00, !dbg !35 + %191 = tail call float @llvm.nvvm.div.full(float %187, float %189), !dbg !36 + %192 = select i1 %190, float 0.000000e+00, float %191, !dbg !37 + %193 = fmul float %192, %188, !dbg !38 + %194 = fadd float %174, %193, !dbg !39 + %195 = fadd float %178, %184, !dbg !44 + %196 = fmul float %188, %188, !dbg !40 + %197 = fmul float %169, %196, !dbg !41 + %198 = fmul float %192, %197, !dbg !42 + %199 = fadd float %195, %198, !dbg !43 + %200 = bitcast float %194 to i32, !dbg !31 + %201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 8, i32 31), !dbg !31 + %202 = bitcast i32 %201 to float, !dbg !31 + %203 = bitcast float %199 to i32, !dbg !31 + %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 8, i32 31), !dbg !31 + %205 = bitcast i32 %204 to float, !dbg !31 + %206 = bitcast float %189 to i32, !dbg !31 + %207 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %206, i32 8, i32 31), !dbg !31 + %208 = bitcast i32 %207 to float, !dbg !31 + %209 = fsub float %202, %194, !dbg !28 + %210 = fadd float %189, %208, !dbg !34 + %211 = fcmp oeq float %210, 0.000000e+00, !dbg !35 + %212 = tail call float @llvm.nvvm.div.full(float %208, float %210), !dbg !36 + %213 = select i1 %211, float 0.000000e+00, float %212, !dbg !37 + %214 = fmul float %213, %209, !dbg !38 + %215 = fadd float %194, %214, !dbg !39 + %216 = fadd float %199, %205, !dbg !44 + %217 = fmul float %209, %209, !dbg !40 + %218 = fmul float %189, %217, !dbg !41 + %219 = fmul float %213, %218, !dbg !42 + %220 = fadd float %216, %219, !dbg !43 + %221 = bitcast float %215 to i32, !dbg !31 + %222 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %221, i32 4, i32 31), !dbg !31 + %223 = bitcast i32 %222 to float, !dbg !31 + %224 = bitcast float %220 to i32, !dbg !31 + %225 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %224, i32 4, i32 31), !dbg !31 + %226 = bitcast i32 %225 to float, !dbg !31 + %227 = bitcast float %210 to i32, !dbg !31 + %228 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %227, i32 4, i32 31), !dbg !31 + %229 = bitcast i32 %228 to float, !dbg !31 + %230 = fsub float %223, %215, !dbg !28 + %231 = fadd float %210, %229, !dbg !34 + %232 = fcmp oeq float %231, 0.000000e+00, !dbg !35 + %233 = tail call float @llvm.nvvm.div.full(float %229, float %231), !dbg !36 + %234 = select i1 %232, float 0.000000e+00, float %233, !dbg !37 + %235 = fmul float %234, %230, !dbg !38 + %236 = fadd float %215, %235, !dbg !39 + %237 = fadd float %220, %226, !dbg !44 + %238 = fmul float %230, %230, !dbg !40 + %239 = fmul float %210, %238, !dbg !41 + %240 = fmul float %234, %239, !dbg !42 + %241 = fadd float %237, %240, !dbg !43 + %242 = bitcast float %236 to i32, !dbg !31 + %243 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %242, i32 2, i32 31), !dbg !31 + %244 = bitcast i32 %243 to float, !dbg !31 + %245 = bitcast float %241 to i32, !dbg !31 + %246 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %245, i32 2, i32 31), !dbg !31 + %247 = bitcast i32 %246 to float, !dbg !31 + %248 = bitcast float %231 to i32, !dbg !31 + %249 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %248, i32 2, i32 31), !dbg !31 + %250 = bitcast i32 %249 to float, !dbg !31 + %251 = fsub float %244, %236, !dbg !28 + %252 = fadd float %231, %250, !dbg !34 + %253 = fcmp oeq float %252, 0.000000e+00, !dbg !35 + %254 = tail call float @llvm.nvvm.div.full(float %250, float %252), !dbg !36 + %255 = select i1 %253, float 0.000000e+00, float %254, !dbg !37 + %256 = fmul float %255, %251, !dbg !38 + %257 = fadd float %236, %256, !dbg !39 + %258 = fadd float %241, %247, !dbg !44 + %259 = fmul float %251, %251, !dbg !40 + %260 = fmul float %231, %259, !dbg !41 + %261 = fmul float %255, %260, !dbg !42 + %262 = fadd float %258, %261, !dbg !43 + %263 = bitcast float %257 to i32, !dbg !31 + %264 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %263, i32 1, i32 31), !dbg !31 + %265 = bitcast i32 %264 to float, !dbg !31 + %266 = bitcast float %262 to i32, !dbg !31 + %267 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %266, i32 1, i32 31), !dbg !31 + %268 = bitcast i32 %267 to float, !dbg !31 + %269 = bitcast float %252 to i32, !dbg !31 + %270 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %269, i32 1, i32 31), !dbg !31 + %271 = bitcast i32 %270 to float, !dbg !31 + %272 = fsub float %265, %257, !dbg !28 + %273 = fadd float %252, %271, !dbg !34 + %274 = fcmp oeq float %273, 0.000000e+00, !dbg !35 + %275 = tail call float @llvm.nvvm.div.full(float %271, float %273), !dbg !36 + %276 = select i1 %274, float 0.000000e+00, float %275, !dbg !37 + %277 = fmul float %276, %272, !dbg !38 + %278 = fadd float %257, %277, !dbg !39 + %279 = fadd float %262, %268, !dbg !44 + %280 = fmul float %272, %272, !dbg !40 + %281 = fmul float %252, %280, !dbg !41 + %282 = fmul float %276, %281, !dbg !42 + %283 = fadd float %279, %282, !dbg !43 + %284 = icmp eq i32 %15, 0, !dbg !31 + %285 = getelementptr float, ptr addrspace(3) @global_smem, i32 %16, !dbg !31 + %286 = bitcast float %278 to <1 x i32>, !dbg !31 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %285, <1 x i32> %286, i1 %284) #6, !dbg !31 + %287 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %16, !dbg !31 + %288 = bitcast float %283 to <1 x i32>, !dbg !31 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %287, <1 x i32> %288, i1 %284) #6, !dbg !31 + %289 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %16, !dbg !31 + %290 = bitcast float %273 to <1 x i32>, !dbg !31 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %289, <1 x i32> %290, i1 %284) #6, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !31 + %291 = icmp samesign ult i32 %14, 16, !dbg !31 + %292 = getelementptr float, ptr addrspace(3) @global_smem, i32 %14, !dbg !31 + %293 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %292, i1 %291) #6, !dbg !31 + %294 = bitcast i32 %293 to float, !dbg !31 + %295 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %14, !dbg !31 + %296 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %295, i1 %291) #6, !dbg !31 + %297 = bitcast i32 %296 to float, !dbg !31 + %298 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %14, !dbg !31 + %299 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %298, i1 %291) #6, !dbg !31 + %300 = bitcast i32 %299 to float, !dbg !31 + %301 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %293, i32 8, i32 31), !dbg !31 + %302 = bitcast i32 %301 to float, !dbg !31 + %303 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %296, i32 8, i32 31), !dbg !31 + %304 = bitcast i32 %303 to float, !dbg !31 + %305 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %299, i32 8, i32 31), !dbg !31 + %306 = bitcast i32 %305 to float, !dbg !31 + %307 = fsub float %302, %294, !dbg !28 + %308 = fadd float %300, %306, !dbg !34 + %309 = fcmp oeq float %308, 0.000000e+00, !dbg !35 + %310 = tail call float @llvm.nvvm.div.full(float %306, float %308), !dbg !36 + %311 = select i1 %309, float 0.000000e+00, float %310, !dbg !37 + %312 = fmul float %307, %311, !dbg !38 + %313 = fadd float %312, %294, !dbg !39 + %314 = fadd float %297, %304, !dbg !44 + %315 = fmul float %307, %307, !dbg !40 + %316 = fmul float %315, %300, !dbg !41 + %317 = fmul float %316, %311, !dbg !42 + %318 = fadd float %314, %317, !dbg !43 + %319 = bitcast float %313 to i32, !dbg !31 + %320 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %319, i32 4, i32 31), !dbg !31 + %321 = bitcast i32 %320 to float, !dbg !31 + %322 = bitcast float %318 to i32, !dbg !31 + %323 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %322, i32 4, i32 31), !dbg !31 + %324 = bitcast i32 %323 to float, !dbg !31 + %325 = bitcast float %308 to i32, !dbg !31 + %326 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %325, i32 4, i32 31), !dbg !31 + %327 = bitcast i32 %326 to float, !dbg !31 + %328 = fsub float %321, %313, !dbg !28 + %329 = fadd float %308, %327, !dbg !34 + %330 = fcmp oeq float %329, 0.000000e+00, !dbg !35 + %331 = tail call float @llvm.nvvm.div.full(float %327, float %329), !dbg !36 + %332 = select i1 %330, float 0.000000e+00, float %331, !dbg !37 + %333 = fmul float %328, %332, !dbg !38 + %334 = fadd float %313, %333, !dbg !39 + %335 = fadd float %318, %324, !dbg !44 + %336 = fmul float %328, %328, !dbg !40 + %337 = fmul float %308, %336, !dbg !41 + %338 = fmul float %332, %337, !dbg !42 + %339 = fadd float %335, %338, !dbg !43 + %340 = bitcast float %334 to i32, !dbg !31 + %341 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %340, i32 2, i32 31), !dbg !31 + %342 = bitcast i32 %341 to float, !dbg !31 + %343 = bitcast float %339 to i32, !dbg !31 + %344 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %343, i32 2, i32 31), !dbg !31 + %345 = bitcast i32 %344 to float, !dbg !31 + %346 = bitcast float %329 to i32, !dbg !31 + %347 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %346, i32 2, i32 31), !dbg !31 + %348 = bitcast i32 %347 to float, !dbg !31 + %349 = fsub float %342, %334, !dbg !28 + %350 = fadd float %329, %348, !dbg !34 + %351 = fcmp oeq float %350, 0.000000e+00, !dbg !35 + %352 = tail call float @llvm.nvvm.div.full(float %348, float %350), !dbg !36 + %353 = select i1 %351, float 0.000000e+00, float %352, !dbg !37 + %354 = fmul float %349, %353, !dbg !38 + %355 = fadd float %334, %354, !dbg !39 + %356 = fadd float %339, %345, !dbg !44 + %357 = fmul float %349, %349, !dbg !40 + %358 = fmul float %329, %357, !dbg !41 + %359 = fmul float %353, %358, !dbg !42 + %360 = fadd float %356, %359, !dbg !43 + %361 = bitcast float %355 to i32, !dbg !31 + %362 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %361, i32 1, i32 31), !dbg !31 + %363 = bitcast i32 %362 to float, !dbg !31 + %364 = bitcast float %360 to i32, !dbg !31 + %365 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %364, i32 1, i32 31), !dbg !31 + %366 = bitcast i32 %365 to float, !dbg !31 + %367 = bitcast float %350 to i32, !dbg !31 + %368 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %367, i32 1, i32 31), !dbg !31 + %369 = bitcast i32 %368 to float, !dbg !31 + %370 = fsub float %363, %355, !dbg !28 + %371 = fadd float %350, %369, !dbg !34 + %372 = fcmp oeq float %371, 0.000000e+00, !dbg !35 + %373 = tail call float @llvm.nvvm.div.full(float %369, float %371), !dbg !36 + %374 = select i1 %372, float 0.000000e+00, float %373, !dbg !37 + %375 = fmul float %370, %374, !dbg !38 + %376 = fadd float %355, %375, !dbg !39 + %377 = fadd float %360, %366, !dbg !44 + %378 = fmul float %370, %370, !dbg !40 + %379 = fmul float %350, %378, !dbg !41 + %380 = fmul float %374, %379, !dbg !42 + %381 = fadd float %377, %380, !dbg !43 + %382 = and i32 %13, 15, !dbg !31 + %383 = icmp eq i32 %382, 0, !dbg !31 + %384 = and i1 %291, %383, !dbg !31 + %385 = bitcast float %376 to <1 x i32>, !dbg !31 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %292, <1 x i32> %385, i1 %384) #6, !dbg !31 + %386 = bitcast float %381 to <1 x i32>, !dbg !31 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %295, <1 x i32> %386, i1 %384) #6, !dbg !31 + %387 = bitcast float %371 to <1 x i32>, !dbg !31 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %298, <1 x i32> %387, i1 %384) #6, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !31 + %388 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !31 + %389 = load float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !31 + %390 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !45 + %391 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %57, i64 %390, i1 %12) #6, !dbg !45 + %392 = getelementptr bfloat, ptr addrspace(1) %3, i64 %33, !dbg !46 + %393 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !47 + %394 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %392, i64 %393, i1 true) #6, !dbg !47 + %395 = getelementptr bfloat, ptr addrspace(1) %4, i64 %33, !dbg !48 + %396 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !49 + %397 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %395, i64 %396, i1 true) #6, !dbg !49 + %398 = tail call float @llvm.nvvm.div.full(float %389, float 4.096000e+03), !dbg !50 + %399 = fadd float %398, 0x3EB0C6F7A0000000, !dbg !51 + %400 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %401 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %402 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %403 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %404 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %405 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %406 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %407 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i19 = icmp eq i32 %407, 0, !dbg !52 + br i1 %.not.i19, label %410, label %408, !dbg !52 + +408: ; preds = %__nv_rsqrtf.exit + %409 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %399), !dbg !52 + br label %__nv_rsqrtf.exit21, !dbg !52 + +410: ; preds = %__nv_rsqrtf.exit + %411 = tail call float @llvm.nvvm.rsqrt.approx.f(float %399), !dbg !52 + br label %__nv_rsqrtf.exit21, !dbg !52 + +__nv_rsqrtf.exit21: ; preds = %408, %410 + %.0.i20 = phi float [ %409, %408 ], [ %411, %410 ], !dbg !52 + %412 = extractvalue { i32, i32, i32, i32 } %391, 3, !dbg !45 + %413 = bitcast i32 %412 to <2 x bfloat>, !dbg !45 + %414 = extractvalue { i32, i32, i32, i32 } %391, 2, !dbg !45 + %415 = bitcast i32 %414 to <2 x bfloat>, !dbg !45 + %416 = extractvalue { i32, i32, i32, i32 } %391, 1, !dbg !45 + %417 = bitcast i32 %416 to <2 x bfloat>, !dbg !45 + %418 = extractvalue { i32, i32, i32, i32 } %391, 0, !dbg !45 + %419 = bitcast i32 %418 to <2 x bfloat>, !dbg !45 + %420 = extractvalue { i32, i32, i32, i32 } %397, 3, !dbg !49 + %421 = bitcast i32 %420 to <2 x bfloat>, !dbg !49 + %422 = extractvalue { i32, i32, i32, i32 } %397, 2, !dbg !49 + %423 = bitcast i32 %422 to <2 x bfloat>, !dbg !49 + %424 = extractvalue { i32, i32, i32, i32 } %397, 1, !dbg !49 + %425 = bitcast i32 %424 to <2 x bfloat>, !dbg !49 + %426 = extractvalue { i32, i32, i32, i32 } %397, 0, !dbg !49 + %427 = bitcast i32 %426 to <2 x bfloat>, !dbg !49 + %428 = extractvalue { i32, i32, i32, i32 } %394, 3, !dbg !47 + %429 = bitcast i32 %428 to <2 x bfloat>, !dbg !47 + %430 = extractvalue { i32, i32, i32, i32 } %394, 2, !dbg !47 + %431 = bitcast i32 %430 to <2 x bfloat>, !dbg !47 + %432 = extractvalue { i32, i32, i32, i32 } %394, 1, !dbg !47 + %433 = bitcast i32 %432 to <2 x bfloat>, !dbg !47 + %434 = extractvalue { i32, i32, i32, i32 } %394, 0, !dbg !47 + %435 = bitcast i32 %434 to <2 x bfloat>, !dbg !47 + %436 = getelementptr bfloat, ptr addrspace(1) %6, i64 %21, !dbg !53 + %437 = fpext <2 x bfloat> %419 to <2 x float>, !dbg !54 + %438 = insertelement <2 x float> poison, float %388, i64 0, !dbg !55 + %439 = shufflevector <2 x float> %438, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !55 + %440 = fsub <2 x float> %437, %439, !dbg !55 + %441 = fpext <2 x bfloat> %427 to <2 x float>, !dbg !56 + %442 = fpext <2 x bfloat> %435 to <2 x float>, !dbg !57 + %443 = insertelement <2 x float> poison, float %.0.i20, i64 0, !dbg !58 + %444 = shufflevector <2 x float> %443, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !58 + %445 = fmul <2 x float> %440, %444, !dbg !58 + %446 = fadd <2 x float> %442, splat (float 1.000000e+00), !dbg !59 + %447 = fmul <2 x float> %446, %445, !dbg !60 + %448 = fadd <2 x float> %447, %441, !dbg !61 + %449 = fptrunc <2 x float> %448 to <2 x bfloat>, !dbg !62 + %450 = fpext <2 x bfloat> %417 to <2 x float>, !dbg !54 + %451 = fsub <2 x float> %450, %439, !dbg !55 + %452 = fpext <2 x bfloat> %425 to <2 x float>, !dbg !56 + %453 = fpext <2 x bfloat> %433 to <2 x float>, !dbg !57 + %454 = fmul <2 x float> %451, %444, !dbg !58 + %455 = fadd <2 x float> %453, splat (float 1.000000e+00), !dbg !59 + %456 = fmul <2 x float> %455, %454, !dbg !60 + %457 = fadd <2 x float> %456, %452, !dbg !61 + %458 = fptrunc <2 x float> %457 to <2 x bfloat>, !dbg !62 + %459 = fpext <2 x bfloat> %415 to <2 x float>, !dbg !54 + %460 = fsub <2 x float> %459, %439, !dbg !55 + %461 = fpext <2 x bfloat> %423 to <2 x float>, !dbg !56 + %462 = fpext <2 x bfloat> %431 to <2 x float>, !dbg !57 + %463 = fmul <2 x float> %460, %444, !dbg !58 + %464 = fadd <2 x float> %462, splat (float 1.000000e+00), !dbg !59 + %465 = fmul <2 x float> %464, %463, !dbg !60 + %466 = fadd <2 x float> %465, %461, !dbg !61 + %467 = fptrunc <2 x float> %466 to <2 x bfloat>, !dbg !62 + %468 = fpext <2 x bfloat> %413 to <2 x float>, !dbg !54 + %469 = fsub <2 x float> %468, %439, !dbg !55 + %470 = fpext <2 x bfloat> %421 to <2 x float>, !dbg !56 + %471 = fpext <2 x bfloat> %429 to <2 x float>, !dbg !57 + %472 = fmul <2 x float> %469, %444, !dbg !58 + %473 = fadd <2 x float> %471, splat (float 1.000000e+00), !dbg !59 + %474 = fmul <2 x float> %473, %472, !dbg !60 + %475 = fadd <2 x float> %474, %470, !dbg !61 + %476 = fptrunc <2 x float> %475 to <2 x bfloat>, !dbg !62 + %477 = bitcast <2 x bfloat> %449 to i32, !dbg !62 + %478 = bitcast <2 x bfloat> %458 to i32, !dbg !62 + %479 = bitcast <2 x bfloat> %467 to i32, !dbg !62 + %480 = bitcast <2 x bfloat> %476 to i32, !dbg !62 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %477, i32 %478, i32 %479, i32 %480, ptr addrspace(1) %436, i1 %12) #6, !dbg !62 + ret void, !dbg !63 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused_add_mul_native_layer_norm_0", linkageName: "triton_red_fused_add_mul_native_layer_norm_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 25, column: 21, scope: !5) +!10 = !DILocation(line: 26, column: 37, scope: !5) +!11 = !DILocation(line: 38, column: 46, scope: !5) +!12 = !DILocation(line: 38, column: 41, scope: !5) +!13 = !DILocation(line: 38, column: 34, scope: !5) +!14 = !DILocation(line: 38, column: 51, scope: !5) +!15 = !DILocation(line: 39, column: 34, scope: !5) +!16 = !DILocation(line: 39, column: 41, scope: !5) +!17 = !DILocation(line: 40, column: 34, scope: !5) +!18 = !DILocation(line: 40, column: 51, scope: !5) +!19 = !DILocation(line: 50, column: 66, scope: !5) +!20 = !DILocation(line: 51, column: 29, scope: !5) +!21 = !DILocation(line: 38, column: 113, scope: !5) +!22 = !DILocation(line: 39, column: 94, scope: !5) +!23 = !DILocation(line: 40, column: 113, scope: !5) +!24 = !DILocation(line: 41, column: 22, scope: !5) +!25 = !DILocation(line: 42, column: 22, scope: !5) +!26 = !DILocation(line: 48, column: 62, scope: !5) +!27 = !DILocation(line: 51, column: 52, scope: !5) +!28 = !DILocation(line: 231, column: 21, scope: !29, inlinedAt: !31) +!29 = distinct !DILexicalBlockFile(scope: !5, file: !30, discriminator: 0) +!30 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime") +!31 = !DILocation(line: 243, column: 46, scope: !29, inlinedAt: !32) +!32 = !DILocation(line: 52, column: 80, scope: !33) +!33 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!34 = !DILocation(line: 232, column: 28, scope: !29, inlinedAt: !31) +!35 = !DILocation(line: 233, column: 39, scope: !29, inlinedAt: !31) +!36 = !DILocation(line: 233, column: 60, scope: !29, inlinedAt: !31) +!37 = !DILocation(line: 233, column: 49, scope: !29, inlinedAt: !31) +!38 = !DILocation(line: 235, column: 25, scope: !29, inlinedAt: !31) +!39 = !DILocation(line: 235, column: 17, scope: !29, inlinedAt: !31) +!40 = !DILocation(line: 236, column: 30, scope: !29, inlinedAt: !31) +!41 = !DILocation(line: 236, column: 38, scope: !29, inlinedAt: !31) +!42 = !DILocation(line: 236, column: 49, scope: !29, inlinedAt: !31) +!43 = !DILocation(line: 236, column: 22, scope: !29, inlinedAt: !31) +!44 = !DILocation(line: 236, column: 15, scope: !29, inlinedAt: !31) +!45 = !DILocation(line: 62, column: 53, scope: !5) +!46 = !DILocation(line: 63, column: 35, scope: !5) +!47 = !DILocation(line: 63, column: 42, scope: !5) +!48 = !DILocation(line: 64, column: 35, scope: !5) +!49 = !DILocation(line: 64, column: 42, scope: !5) +!50 = !DILocation(line: 68, column: 25, scope: !5) +!51 = !DILocation(line: 70, column: 24, scope: !5) +!52 = !DILocation(line: 71, column: 32, scope: !5) +!53 = !DILocation(line: 78, column: 29, scope: !5) +!54 = !DILocation(line: 62, column: 115, scope: !5) +!55 = !DILocation(line: 66, column: 24, scope: !5) +!56 = !DILocation(line: 64, column: 95, scope: !5) +!57 = !DILocation(line: 63, column: 95, scope: !5) +!58 = !DILocation(line: 72, column: 24, scope: !5) +!59 = !DILocation(line: 75, column: 24, scope: !5) +!60 = !DILocation(line: 76, column: 24, scope: !5) +!61 = !DILocation(line: 77, column: 24, scope: !5) +!62 = !DILocation(line: 78, column: 53, scope: !5) +!63 = !DILocation(line: 56, column: 4, scope: !5) diff --git a/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.ptx b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..b9b7da2e3299043e74f389ae7bfc1dd265132584 --- /dev/null +++ b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.ptx @@ -0,0 +1,1129 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused_add_mul_native_layer_norm_0 // -- Begin function triton_red_fused_add_mul_native_layer_norm_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused_add_mul_native_layer_norm_0 +.visible .entry triton_red_fused_add_mul_native_layer_norm_0( + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_6, + .param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_7, + .param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_8, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_9, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_10 +) +.reqntid 512 +{ + .reg .pred %p<23>; + .reg .b16 %rs<49>; + .reg .b32 %r<323>; + .reg .b64 %rd<23>; + .loc 1 18 0 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:18:0 +$L__func_begin0: + .loc 1 18 0 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:18:0 + +// %bb.0: // %__nv_rsqrtf.exit + ld.param.b64 %rd14, [triton_red_fused_add_mul_native_layer_norm_0_param_0]; + ld.param.b64 %rd15, [triton_red_fused_add_mul_native_layer_norm_0_param_1]; +$L__tmp0: + .loc 1 23 28 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:23:28 + mov.u32 %r49, %ctaid.x; + .loc 1 25 21 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:25:21 + setp.lt.u32 %p1, %r49, 256; + ld.param.b64 %rd16, [triton_red_fused_add_mul_native_layer_norm_0_param_2]; + ld.param.b64 %rd17, [triton_red_fused_add_mul_native_layer_norm_0_param_3]; + .loc 1 26 37 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:26:37 + mov.u32 %r50, %tid.x; + and.b32 %r51, %r50, 511; + ld.param.b64 %rd18, [triton_red_fused_add_mul_native_layer_norm_0_param_4]; + and.b32 %r52, %r50, 31; + ld.param.b64 %rd19, [triton_red_fused_add_mul_native_layer_norm_0_param_5]; + ld.param.b64 %rd20, [triton_red_fused_add_mul_native_layer_norm_0_param_6]; + shl.b32 %r53, %r50, 3; + and.b32 %r54, %r53, 4088; + .loc 1 38 46 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:46 + shl.b32 %r55, %r49, 12; + .loc 1 38 41 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:41 + or.b32 %r56, %r54, %r55; + .loc 1 38 34 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:34 + mul.wide.s32 %rd21, %r56, 2; + add.s64 %rd1, %rd14, %rd21; + .loc 1 38 51 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:51 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r5, 0; + // begin inline asm + mov.u32 %r1, %r5; + mov.u32 %r2, %r5; + mov.u32 %r3, %r5; + mov.u32 %r4, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2; + // end inline asm + .loc 1 39 34 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:34 + mul.wide.u32 %rd22, %r54, 2; + add.s64 %rd3, %rd15, %rd22; + .loc 1 39 41 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:41 + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0; + // end inline asm + mov.pred %p2, -1; + // begin inline asm + mov.u32 %r6, %r5; + mov.u32 %r7, %r5; + mov.u32 %r8, %r5; + mov.u32 %r9, %r5; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd3 + 0 ], %rd4; + // end inline asm + .loc 1 40 34 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:34 + add.s64 %rd5, %rd16, %rd21; + .loc 1 40 51 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:51 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r10, %r5; + mov.u32 %r11, %r5; + mov.u32 %r12, %r5; + mov.u32 %r13, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd5 + 0 ], %rd6; + // end inline asm + .loc 1 50 66 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:50:66 + selp.f32 %r57, 0f3F800000, 0f00000000, %p1; + .loc 1 51 29 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:51:29 + add.s64 %rd7, %rd19, %rd21; + .loc 1 38 113 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:113 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r58, %rs1; + cvt.f32.bf16 %r59, %rs2; + .loc 1 39 94 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:94 + mov.b32 {%rs3, %rs4}, %r6; + cvt.f32.bf16 %r60, %rs3; + cvt.f32.bf16 %r61, %rs4; + .loc 1 40 113 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:113 + mov.b32 {%rs5, %rs6}, %r10; + cvt.f32.bf16 %r62, %rs5; + cvt.f32.bf16 %r63, %rs6; + .loc 1 42 22 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:42:22 + fma.rn.f32 %r64, %r61, %r63, %r59; + fma.rn.f32 %r65, %r60, %r62, %r58; + .loc 1 48 62 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:48:62 + selp.f32 %r66, %r65, 0f00000000, %p1; + selp.f32 %r67, %r64, 0f00000000, %p1; + .loc 1 51 52 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:51:52 + cvt.rn.bf16x2.f32 %r14, %r64, %r65; + .loc 1 38 113 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:113 + mov.b32 {%rs7, %rs8}, %r2; + cvt.f32.bf16 %r68, %rs7; + cvt.f32.bf16 %r69, %rs8; + .loc 1 39 94 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:94 + mov.b32 {%rs9, %rs10}, %r7; + cvt.f32.bf16 %r70, %rs9; + cvt.f32.bf16 %r71, %rs10; + .loc 1 40 113 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:113 + mov.b32 {%rs11, %rs12}, %r11; + cvt.f32.bf16 %r72, %rs11; + cvt.f32.bf16 %r73, %rs12; + .loc 1 42 22 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:42:22 + fma.rn.f32 %r74, %r71, %r73, %r69; + fma.rn.f32 %r75, %r70, %r72, %r68; + .loc 1 48 62 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:48:62 + selp.f32 %r76, %r75, 0f00000000, %p1; + selp.f32 %r77, %r74, 0f00000000, %p1; + .loc 1 51 52 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:51:52 + cvt.rn.bf16x2.f32 %r15, %r74, %r75; + .loc 1 38 113 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:113 + mov.b32 {%rs13, %rs14}, %r3; + cvt.f32.bf16 %r78, %rs13; + cvt.f32.bf16 %r79, %rs14; + .loc 1 39 94 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:94 + mov.b32 {%rs15, %rs16}, %r8; + cvt.f32.bf16 %r80, %rs15; + cvt.f32.bf16 %r81, %rs16; + .loc 1 40 113 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:113 + mov.b32 {%rs17, %rs18}, %r12; + cvt.f32.bf16 %r82, %rs17; + cvt.f32.bf16 %r83, %rs18; + .loc 1 42 22 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:42:22 + fma.rn.f32 %r84, %r81, %r83, %r79; + fma.rn.f32 %r85, %r80, %r82, %r78; + .loc 1 48 62 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:48:62 + selp.f32 %r86, %r85, 0f00000000, %p1; + selp.f32 %r87, %r84, 0f00000000, %p1; + .loc 1 51 52 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:51:52 + cvt.rn.bf16x2.f32 %r16, %r84, %r85; + .loc 1 38 113 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:113 + mov.b32 {%rs19, %rs20}, %r4; + cvt.f32.bf16 %r88, %rs19; + cvt.f32.bf16 %r89, %rs20; + .loc 1 39 94 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:94 + mov.b32 {%rs21, %rs22}, %r9; + cvt.f32.bf16 %r90, %rs21; + cvt.f32.bf16 %r91, %rs22; + .loc 1 40 113 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:113 + mov.b32 {%rs23, %rs24}, %r13; + cvt.f32.bf16 %r92, %rs23; + cvt.f32.bf16 %r93, %rs24; + .loc 1 42 22 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:42:22 + fma.rn.f32 %r94, %r91, %r93, %r89; + fma.rn.f32 %r95, %r90, %r92, %r88; + .loc 1 48 62 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:48:62 + selp.f32 %r96, %r95, 0f00000000, %p1; + selp.f32 %r97, %r94, 0f00000000, %p1; + .loc 1 51 52 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:51:52 + cvt.rn.bf16x2.f32 %r17, %r94, %r95; + // begin inline asm + @%p1 st.global.v4.b32 [ %rd7 + 0 ], { %r14, %r15, %r16, %r17 }; + // end inline asm +$L__tmp1: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r98, %r67, %r66; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r99, 0f40000000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p6, %r99, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r100, %r57, %r99; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r101, 0f00000000, %r100, %p6; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r102, %r101, %r98, %r66; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r103, %r98, %r98; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r104, %r57, %r103; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r105, %r101, %r104, 0f00000000; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r106, %r76, %r102; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r107, 0f40400000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p7, %r107, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r108, %r57, %r107; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r109, 0f00000000, %r108, %p7; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r110, %r109, %r106, %r102; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r111, %r106, %r106; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r112, %r99, %r111; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r113, %r109, %r112, %r105; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r114, %r77, %r110; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r115, 0f40800000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p8, %r115, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r116, %r57, %r115; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r117, 0f00000000, %r116, %p8; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r118, %r117, %r114, %r110; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r119, %r114, %r114; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r120, %r107, %r119; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r121, %r117, %r120, %r113; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r122, %r86, %r118; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r123, 0f40A00000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p9, %r123, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r124, %r57, %r123; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r125, 0f00000000, %r124, %p9; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r126, %r125, %r122, %r118; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r127, %r122, %r122; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r128, %r115, %r127; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r129, %r125, %r128, %r121; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r130, %r87, %r126; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r131, 0f40C00000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p10, %r131, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r132, %r57, %r131; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r133, 0f00000000, %r132, %p10; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r134, %r133, %r130, %r126; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r135, %r130, %r130; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r136, %r123, %r135; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r137, %r133, %r136, %r129; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r138, %r96, %r134; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r139, 0f40E00000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p11, %r139, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r140, %r57, %r139; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r141, 0f00000000, %r140, %p11; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r142, %r141, %r138, %r134; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r143, %r138, %r138; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r144, %r131, %r143; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r145, %r141, %r144, %r137; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r146, %r97, %r142; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r147, 0f41000000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p12, %r147, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r148, %r57, %r147; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r149, 0f00000000, %r148, %p12; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r150, %r149, %r146, %r142; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r151, %r146, %r146; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r152, %r139, %r151; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r153, %r149, %r152, %r145; +$L__tmp2: + .loc 2 243 46 // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] + shfl.sync.bfly.b32 %r154, %r150, 16, 31, -1; + shfl.sync.bfly.b32 %r155, %r153, 16, 31, -1; + shfl.sync.bfly.b32 %r156, %r147, 16, 31, -1; +$L__tmp3: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r157, %r154, %r150; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r158, %r147, %r156; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p13, %r158, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r159, %r156, %r158; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r160, 0f00000000, %r159, %p13; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r161, %r160, %r157, %r150; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r162, %r153, %r155; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r163, %r157, %r157; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r164, %r147, %r163; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r165, %r160, %r164, %r162; +$L__tmp4: + .loc 2 243 46 // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] + shfl.sync.bfly.b32 %r166, %r161, 8, 31, -1; + shfl.sync.bfly.b32 %r167, %r165, 8, 31, -1; + shfl.sync.bfly.b32 %r168, %r158, 8, 31, -1; +$L__tmp5: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r169, %r166, %r161; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r170, %r158, %r168; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p14, %r170, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r171, %r168, %r170; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r172, 0f00000000, %r171, %p14; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r173, %r172, %r169, %r161; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r174, %r165, %r167; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r175, %r169, %r169; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r176, %r158, %r175; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r177, %r172, %r176, %r174; +$L__tmp6: + .loc 2 243 46 // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] + shfl.sync.bfly.b32 %r178, %r173, 4, 31, -1; + shfl.sync.bfly.b32 %r179, %r177, 4, 31, -1; + shfl.sync.bfly.b32 %r180, %r170, 4, 31, -1; +$L__tmp7: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r181, %r178, %r173; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r182, %r170, %r180; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p15, %r182, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r183, %r180, %r182; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r184, 0f00000000, %r183, %p15; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r185, %r184, %r181, %r173; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r186, %r177, %r179; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r187, %r181, %r181; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r188, %r170, %r187; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r189, %r184, %r188, %r186; +$L__tmp8: + .loc 2 243 46 // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] + shfl.sync.bfly.b32 %r190, %r185, 2, 31, -1; + shfl.sync.bfly.b32 %r191, %r189, 2, 31, -1; + shfl.sync.bfly.b32 %r192, %r182, 2, 31, -1; +$L__tmp9: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r193, %r190, %r185; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r194, %r182, %r192; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p16, %r194, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r195, %r192, %r194; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r196, 0f00000000, %r195, %p16; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r197, %r196, %r193, %r185; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r198, %r189, %r191; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r199, %r193, %r193; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r200, %r182, %r199; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r201, %r196, %r200, %r198; +$L__tmp10: + .loc 2 243 46 // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] + shfl.sync.bfly.b32 %r202, %r197, 1, 31, -1; + shfl.sync.bfly.b32 %r203, %r201, 1, 31, -1; + shfl.sync.bfly.b32 %r204, %r194, 1, 31, -1; +$L__tmp11: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r205, %r202, %r197; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r23, %r194, %r204; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p17, %r23, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r206, %r204, %r23; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r207, 0f00000000, %r206, %p17; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r19, %r207, %r205, %r197; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r208, %r201, %r203; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r209, %r205, %r205; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r210, %r194, %r209; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r21, %r207, %r210, %r208; +$L__tmp12: + .loc 2 243 46 // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] + setp.eq.b32 %p3, %r52, 0; + shr.u32 %r211, %r50, 3; + and.b32 %r212, %r211, 60; + mov.b32 %r213, global_smem; + add.s32 %r18, %r213, %r212; + // begin inline asm + @%p3 st.shared.b32 [ %r18 + 0 ], %r19; + // end inline asm + add.s32 %r20, %r18, 64; + // begin inline asm + @%p3 st.shared.b32 [ %r20 + 0 ], %r21; + // end inline asm + add.s32 %r22, %r18, 128; + // begin inline asm + @%p3 st.shared.b32 [ %r22 + 0 ], %r23; + // end inline asm + bar.sync 0; + setp.lt.u32 %p4, %r51, 16; + shl.b32 %r214, %r51, 2; + add.s32 %r25, %r213, %r214; + // begin inline asm + @%p4 ld.shared.b32 %r24, [ %r25 + 0 ]; + // end inline asm + add.s32 %r27, %r25, 64; + // begin inline asm + @%p4 ld.shared.b32 %r26, [ %r27 + 0 ]; + // end inline asm + add.s32 %r29, %r25, 128; + // begin inline asm + @%p4 ld.shared.b32 %r28, [ %r29 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r215, %r24, 8, 31, -1; + shfl.sync.bfly.b32 %r216, %r26, 8, 31, -1; + shfl.sync.bfly.b32 %r217, %r28, 8, 31, -1; +$L__tmp13: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r218, %r215, %r24; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r219, %r28, %r217; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p18, %r219, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r220, %r217, %r219; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r221, 0f00000000, %r220, %p18; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r222, %r218, %r221, %r24; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r223, %r26, %r216; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r224, %r218, %r218; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r225, %r224, %r28; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r226, %r225, %r221, %r223; +$L__tmp14: + .loc 2 243 46 // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] + shfl.sync.bfly.b32 %r227, %r222, 4, 31, -1; + shfl.sync.bfly.b32 %r228, %r226, 4, 31, -1; + shfl.sync.bfly.b32 %r229, %r219, 4, 31, -1; +$L__tmp15: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r230, %r227, %r222; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r231, %r219, %r229; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p19, %r231, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r232, %r229, %r231; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r233, 0f00000000, %r232, %p19; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r234, %r230, %r233, %r222; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r235, %r226, %r228; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r236, %r230, %r230; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r237, %r219, %r236; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r238, %r233, %r237, %r235; +$L__tmp16: + .loc 2 243 46 // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] + shfl.sync.bfly.b32 %r239, %r234, 2, 31, -1; + shfl.sync.bfly.b32 %r240, %r238, 2, 31, -1; + shfl.sync.bfly.b32 %r241, %r231, 2, 31, -1; +$L__tmp17: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r242, %r239, %r234; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r243, %r231, %r241; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p20, %r243, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r244, %r241, %r243; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r245, 0f00000000, %r244, %p20; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r246, %r242, %r245, %r234; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r247, %r238, %r240; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r248, %r242, %r242; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r249, %r231, %r248; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r250, %r245, %r249, %r247; +$L__tmp18: + .loc 2 243 46 // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] + shfl.sync.bfly.b32 %r251, %r246, 1, 31, -1; + shfl.sync.bfly.b32 %r252, %r250, 1, 31, -1; + shfl.sync.bfly.b32 %r253, %r243, 1, 31, -1; +$L__tmp19: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + sub.f32 %r254, %r251, %r246; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r32, %r243, %r253; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + setp.eq.f32 %p21, %r32, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + div.full.f32 %r255, %r253, %r32; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + selp.f32 %r256, 0f00000000, %r255, %p21; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r30, %r254, %r256, %r246; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + add.f32 %r257, %r250, %r252; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r258, %r254, %r254; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + mul.f32 %r259, %r243, %r258; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ] + fma.rn.f32 %r31, %r256, %r259, %r257; +$L__tmp20: + .loc 2 243 46 // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] + and.b32 %r260, %r50, 15; + setp.eq.b32 %p22, %r260, 0; + and.pred %p5, %p4, %p22; + // begin inline asm + @%p5 st.shared.b32 [ %r25 + 0 ], %r30; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r27 + 0 ], %r31; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r29 + 0 ], %r32; + // end inline asm + bar.sync 0; + ld.shared.b32 %r261, [global_smem]; + ld.shared.b32 %r262, [global_smem+64]; +$L__tmp21: + .loc 1 62 53 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:62:53 + // begin inline asm + mov.u64 %rd8, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd8, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r33, %r5; + mov.u32 %r34, %r5; + mov.u32 %r35, %r5; + mov.u32 %r36, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd7 + 0 ], %rd8; + // end inline asm + .loc 1 63 35 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:35 + add.s64 %rd9, %rd17, %rd22; + .loc 1 63 42 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:42 + // begin inline asm + mov.u64 %rd10, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd10, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r37, %r5; + mov.u32 %r38, %r5; + mov.u32 %r39, %r5; + mov.u32 %r40, %r5; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r37, %r38, %r39, %r40 }, [ %rd9 + 0 ], %rd10; + // end inline asm + .loc 1 64 35 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:35 + add.s64 %rd11, %rd18, %rd22; + .loc 1 64 42 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:42 + // begin inline asm + mov.u64 %rd12, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd12, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r41, %r5; + mov.u32 %r42, %r5; + mov.u32 %r43, %r5; + mov.u32 %r44, %r5; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r41, %r42, %r43, %r44 }, [ %rd11 + 0 ], %rd12; + // end inline asm + mov.b32 %r263, 0f45800000; + .loc 1 68 25 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:68:25 + div.full.f32 %r264, %r262, %r263; + .loc 1 70 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:70:24 + add.f32 %r265, %r264, 0f358637BD; + .loc 1 71 32 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:71:32 + rsqrt.approx.ftz.f32 %r266, %r265; + .loc 1 78 29 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:78:29 + add.s64 %rd13, %rd20, %rd21; + .loc 1 62 115 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:62:115 + mov.b32 {%rs25, %rs26}, %r33; + cvt.f32.bf16 %r267, %rs26; + cvt.f32.bf16 %r268, %rs25; + .loc 1 66 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:66:24 + sub.f32 %r269, %r268, %r261; + sub.f32 %r270, %r267, %r261; + .loc 1 64 95 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:95 + mov.b32 {%rs27, %rs28}, %r41; + cvt.f32.bf16 %r271, %rs28; + cvt.f32.bf16 %r272, %rs27; + .loc 1 63 95 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:95 + mov.b32 {%rs29, %rs30}, %r37; + cvt.f32.bf16 %r273, %rs29; + cvt.f32.bf16 %r274, %rs30; + .loc 1 72 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:72:24 + mul.f32 %r275, %r270, %r266; + mul.f32 %r276, %r269, %r266; + .loc 1 75 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:75:24 + add.f32 %r277, %r274, 0f3F800000; + add.f32 %r278, %r273, 0f3F800000; + .loc 1 77 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:77:24 + fma.rn.f32 %r279, %r278, %r276, %r272; + fma.rn.f32 %r280, %r277, %r275, %r271; + .loc 1 78 53 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:78:53 + cvt.rn.bf16x2.f32 %r45, %r280, %r279; + .loc 1 62 115 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:62:115 + mov.b32 {%rs31, %rs32}, %r34; + cvt.f32.bf16 %r281, %rs32; + cvt.f32.bf16 %r282, %rs31; + .loc 1 66 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:66:24 + sub.f32 %r283, %r282, %r261; + sub.f32 %r284, %r281, %r261; + .loc 1 64 95 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:95 + mov.b32 {%rs33, %rs34}, %r42; + cvt.f32.bf16 %r285, %rs34; + cvt.f32.bf16 %r286, %rs33; + .loc 1 63 95 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:95 + mov.b32 {%rs35, %rs36}, %r38; + cvt.f32.bf16 %r287, %rs35; + cvt.f32.bf16 %r288, %rs36; + .loc 1 72 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:72:24 + mul.f32 %r289, %r284, %r266; + mul.f32 %r290, %r283, %r266; + .loc 1 75 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:75:24 + add.f32 %r291, %r288, 0f3F800000; + add.f32 %r292, %r287, 0f3F800000; + .loc 1 77 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:77:24 + fma.rn.f32 %r293, %r292, %r290, %r286; + fma.rn.f32 %r294, %r291, %r289, %r285; + .loc 1 78 53 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:78:53 + cvt.rn.bf16x2.f32 %r46, %r294, %r293; + .loc 1 62 115 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:62:115 + mov.b32 {%rs37, %rs38}, %r35; + cvt.f32.bf16 %r295, %rs38; + cvt.f32.bf16 %r296, %rs37; + .loc 1 66 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:66:24 + sub.f32 %r297, %r296, %r261; + sub.f32 %r298, %r295, %r261; + .loc 1 64 95 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:95 + mov.b32 {%rs39, %rs40}, %r43; + cvt.f32.bf16 %r299, %rs40; + cvt.f32.bf16 %r300, %rs39; + .loc 1 63 95 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:95 + mov.b32 {%rs41, %rs42}, %r39; + cvt.f32.bf16 %r301, %rs41; + cvt.f32.bf16 %r302, %rs42; + .loc 1 72 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:72:24 + mul.f32 %r303, %r298, %r266; + mul.f32 %r304, %r297, %r266; + .loc 1 75 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:75:24 + add.f32 %r305, %r302, 0f3F800000; + add.f32 %r306, %r301, 0f3F800000; + .loc 1 77 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:77:24 + fma.rn.f32 %r307, %r306, %r304, %r300; + fma.rn.f32 %r308, %r305, %r303, %r299; + .loc 1 78 53 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:78:53 + cvt.rn.bf16x2.f32 %r47, %r308, %r307; + .loc 1 62 115 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:62:115 + mov.b32 {%rs43, %rs44}, %r36; + cvt.f32.bf16 %r309, %rs44; + cvt.f32.bf16 %r310, %rs43; + .loc 1 66 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:66:24 + sub.f32 %r311, %r310, %r261; + sub.f32 %r312, %r309, %r261; + .loc 1 64 95 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:95 + mov.b32 {%rs45, %rs46}, %r44; + cvt.f32.bf16 %r313, %rs46; + cvt.f32.bf16 %r314, %rs45; + .loc 1 63 95 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:95 + mov.b32 {%rs47, %rs48}, %r40; + cvt.f32.bf16 %r315, %rs47; + cvt.f32.bf16 %r316, %rs48; + .loc 1 72 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:72:24 + mul.f32 %r317, %r312, %r266; + mul.f32 %r318, %r311, %r266; + .loc 1 75 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:75:24 + add.f32 %r319, %r316, 0f3F800000; + add.f32 %r320, %r315, 0f3F800000; + .loc 1 77 24 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:77:24 + fma.rn.f32 %r321, %r320, %r318, %r314; + fma.rn.f32 %r322, %r319, %r317, %r313; + .loc 1 78 53 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:78:53 + cvt.rn.bf16x2.f32 %r48, %r322, %r321; + // begin inline asm + @%p1 st.global.v4.b32 [ %rd13 + 0 ], { %r45, %r46, %r47, %r48 }; + // end inline asm + .loc 1 56 4 // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:56:4 + ret; +$L__tmp22: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 343 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x150 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 97 +.b8 51 +.b8 109 +.b8 101 +.b8 110 +.b8 108 +.b8 102 +.b8 117 +.b8 108 +.b8 100 +.b8 116 +.b8 104 +.b8 103 +.b8 109 +.b8 110 +.b8 99 +.b8 102 +.b8 112 +.b8 106 +.b8 107 +.b8 52 +.b8 53 +.b8 50 +.b8 120 +.b8 107 +.b8 114 +.b8 111 +.b8 115 +.b8 55 +.b8 105 +.b8 100 +.b8 114 +.b8 109 +.b8 105 +.b8 108 +.b8 54 +.b8 112 +.b8 99 +.b8 111 +.b8 101 +.b8 105 +.b8 103 +.b8 114 +.b8 97 +.b8 121 +.b8 109 +.b8 99 +.b8 103 +.b8 52 +.b8 101 +.b8 54 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 97 +.b8 51 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x2f DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 100 +.b8 100 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 110 +.b8 97 +.b8 116 +.b8 105 +.b8 118 +.b8 101 +.b8 95 +.b8 108 +.b8 97 +.b8 121 +.b8 101 +.b8 114 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x113:0x47 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x128:0x31 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp21 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 52 // DW_AT_call_line +.b8 80 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x140:0x18 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp20 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 243 // DW_AT_call_line +.b8 46 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.source b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.source new file mode 100644 index 0000000000000000000000000000000000000000..96dc836a52c39d65411d4fbc041c8eda06e50f38 --- /dev/null +++ b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.source @@ -0,0 +1,486 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":18:0) +#loc88 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":216:0) +#loc101 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":133:0) +#loc105 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":242:0) +#loc107 = loc(unknown) +#loc110 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":230:0) +#loc125 = loc("in_ptr0"(#loc)) +#loc126 = loc("in_ptr1"(#loc)) +#loc127 = loc("in_ptr2"(#loc)) +#loc128 = loc("in_ptr3"(#loc)) +#loc129 = loc("in_ptr4"(#loc)) +#loc130 = loc("out_ptr0"(#loc)) +#loc131 = loc("out_ptr3"(#loc)) +#loc132 = loc("xnumel"(#loc)) +#loc133 = loc("r0_numel"(#loc)) +#loc201 = loc("value"(#loc88)) +#loc202 = loc("mean"(#loc88)) +#loc203 = loc("m2"(#loc88)) +#loc204 = loc("weight"(#loc88)) +#loc205 = loc("first_iteration"(#loc88)) +#loc215 = loc("input"(#loc101)) +#loc216 = loc("mean"(#loc105)) +#loc217 = loc("m2"(#loc105)) +#loc218 = loc("weight"(#loc105)) +#loc219 = loc("mean_1"(#loc110)) +#loc220 = loc("m2_1"(#loc110)) +#loc221 = loc("weight_1"(#loc110)) +#loc222 = loc("mean_2"(#loc110)) +#loc223 = loc("m2_2"(#loc110)) +#loc224 = loc("weight_2"(#loc110)) +#loc231 = loc("new_mean"(#loc201)) +module { + tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 256 : i32 loc(#loc134) + %r0_numel_1 = arith.constant 4096 : i32 loc(#loc135) + %xoffset = tt.get_program_id x : i32 loc(#loc136) + %xoffset_2 = arith.constant 1 : i32 loc(#loc137) + %xoffset_3 = arith.constant 1 : i32 loc(#loc137) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc137) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc138) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc139) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc140) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc140) + %xmask = arith.constant dense<256> : tensor<1x1xi32> loc(#loc141) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc141) + %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32> loc(#loc142) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32> -> tensor<1x4096xi32> loc(#loc143) + %tmp7_mean = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc144) + %tmp7_m2 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc145) + %tmp7_weight = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc146) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c4096_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %tmp7_weight_10:3 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%tmp7_mean_13 = %tmp7_mean, %tmp7_m2_14 = %tmp7_m2, %tmp7_weight_15 = %tmp7_weight) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4096xi32> loc(#loc148) + %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x4096xi32> loc(#loc148) + %r0_mask = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc149) + %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x4096xi32> loc(#loc149) + %tmp0 = arith.constant 4096 : i32 loc(#loc150) + %tmp0_18 = arith.constant 4096 : i32 loc(#loc150) + %tmp0_19 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc150) + %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc150) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc151) + %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x4096xi32> loc(#loc151) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc152) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc152) + %tmp0_25 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc153) + %tmp0_26 = arith.andi %r0_mask_17, %tmp0_25 : tensor<1x4096xi1> loc(#loc153) + %tmp0_27 = arith.constant 0.000000e+00 : f32 loc(#loc154) + %tmp0_28 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc154) + %tmp0_29 = arith.truncf %tmp0_28 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc154) + %tmp0_30 = tt.load %tmp0_24, %tmp0_26, %tmp0_29 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr> loc(#loc154) + %tmp0_31 = arith.extf %tmp0_30 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc155) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc156) + %tmp1_32 = tt.addptr %tmp1, %r0_index_16 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc156) + %tmp1_33 = arith.constant 0.000000e+00 : f32 loc(#loc157) + %tmp1_34 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc157) + %tmp1_35 = arith.truncf %tmp1_34 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc157) + %tmp1_36 = tt.load %tmp1_32, %r0_mask_17, %tmp1_35 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc157) + %tmp1_37 = arith.extf %tmp1_36 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc158) + %tmp2 = arith.constant 4096 : i32 loc(#loc159) + %tmp2_38 = arith.constant 4096 : i32 loc(#loc159) + %tmp2_39 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc159) + %tmp2_40 = arith.muli %tmp2_39, %xindex_7 : tensor<1x1xi32> loc(#loc159) + %tmp2_41 = tt.broadcast %tmp2_40 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc160) + %tmp2_42 = arith.addi %r0_index_16, %tmp2_41 : tensor<1x4096xi32> loc(#loc160) + %tmp2_43 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc161) + %tmp2_44 = tt.addptr %tmp2_43, %tmp2_42 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc161) + %tmp2_45 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc162) + %tmp2_46 = arith.andi %r0_mask_17, %tmp2_45 : tensor<1x4096xi1> loc(#loc162) + %tmp2_47 = arith.constant 0.000000e+00 : f32 loc(#loc163) + %tmp2_48 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc163) + %tmp2_49 = arith.truncf %tmp2_48 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc163) + %tmp2_50 = tt.load %tmp2_44, %tmp2_46, %tmp2_49 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr> loc(#loc163) + %tmp2_51 = arith.extf %tmp2_50 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc164) + %tmp3 = arith.mulf %tmp1_37, %tmp2_51 : tensor<1x4096xf32> loc(#loc165) + %tmp4 = arith.addf %tmp0_31, %tmp3 : tensor<1x4096xf32> loc(#loc166) + %c0_i32_52 = arith.constant 0 : i32 loc(#loc34) + %9 = arith.cmpi eq, %r0_offset, %c0_i32_52 : i32 loc(#loc34) + %10:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_u1__(%tmp4, %tmp7_mean_13, %tmp7_m2_14, %tmp7_weight_15, %9) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>, i1) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) loc(#loc35) + %tmp7_mean_53 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc167) + %tmp7_mean_54 = arith.andi %r0_mask_17, %tmp7_mean_53 : tensor<1x4096xi1> loc(#loc167) + %tmp7_mean_55 = arith.select %tmp7_mean_54, %10#0, %tmp7_mean_13 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc168) + %tmp7_m2_56 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc169) + %tmp7_m2_57 = arith.andi %r0_mask_17, %tmp7_m2_56 : tensor<1x4096xi1> loc(#loc169) + %tmp7_m2_58 = arith.select %tmp7_m2_57, %10#1, %tmp7_m2_14 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc170) + %tmp7_weight_59 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc171) + %tmp7_weight_60 = arith.andi %r0_mask_17, %tmp7_weight_59 : tensor<1x4096xi1> loc(#loc171) + %tmp7_weight_61 = arith.select %tmp7_weight_60, %10#2, %tmp7_weight_15 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc172) + %c4096_i32_62 = arith.constant 4096 : i32 loc(#loc42) + %c4096_i32_63 = arith.constant 4096 : i32 loc(#loc42) + %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc42) + %11 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc42) + %12 = tt.broadcast %11 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc43) + %13 = arith.addi %r0_index_16, %12 : tensor<1x4096xi32> loc(#loc43) + %14 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc44) + %15 = tt.addptr %14, %13 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc44) + %16 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc45) + %17 = arith.andi %r0_mask_17, %16 : tensor<1x4096xi1> loc(#loc45) + %18 = arith.truncf %tmp4 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc46) + tt.store %15, %18, %17 : tensor<1x4096x!tt.ptr> loc(#loc46) + scf.yield %tmp7_mean_55, %tmp7_m2_58, %tmp7_weight_61 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc47) + } loc(#loc237) + %4:3 = tt.call @"torch._inductor.runtime.triton_helpers.welford__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S__(3,)cconstexpr_1_"(%tmp7_weight_10#0, %tmp7_weight_10#1, %tmp7_weight_10#2) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc48) + %tmp7 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc173) + %tmp11 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc174) + %tmp12 = tt.expand_dims %4#2 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc175) + %c0_i32_11 = arith.constant 0 : i32 loc(#loc52) + %c4096_i32_12 = arith.constant 4096 : i32 loc(#loc52) + %5 = arith.bitcast %c0_i32_11 : i32 to i32 loc(#loc52) + %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc52) + %7 = arith.bitcast %c4096_i32_12 : i32 to i32 loc(#loc52) + %8 = ub.poison : i32 loc(#loc52) + scf.for %r0_offset = %5 to %6 step %7 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4096xi32> loc(#loc176) + %r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x4096xi32> loc(#loc176) + %r0_mask = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc177) + %r0_mask_14 = arith.cmpi slt, %r0_index_13, %r0_mask : tensor<1x4096xi32> loc(#loc177) + %tmp13 = arith.constant 4096 : i32 loc(#loc178) + %tmp13_15 = arith.constant 4096 : i32 loc(#loc178) + %tmp13_16 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc178) + %tmp13_17 = arith.muli %tmp13_16, %xindex_7 : tensor<1x1xi32> loc(#loc178) + %tmp13_18 = tt.broadcast %tmp13_17 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc179) + %tmp13_19 = arith.addi %r0_index_13, %tmp13_18 : tensor<1x4096xi32> loc(#loc179) + %tmp13_20 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc180) + %tmp13_21 = tt.addptr %tmp13_20, %tmp13_19 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc180) + %tmp13_22 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc181) + %tmp13_23 = arith.andi %r0_mask_14, %tmp13_22 : tensor<1x4096xi1> loc(#loc181) + %tmp13_24 = arith.constant 0.000000e+00 : f32 loc(#loc182) + %tmp13_25 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc182) + %tmp13_26 = arith.truncf %tmp13_25 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc182) + %tmp13_27 = tt.load %tmp13_21, %tmp13_23, %tmp13_26 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr> loc(#loc182) + %tmp13_28 = arith.extf %tmp13_27 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc183) + %tmp23 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc184) + %tmp23_29 = tt.addptr %tmp23, %r0_index_13 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc184) + %tmp23_30 = arith.constant 0.000000e+00 : f32 loc(#loc185) + %tmp23_31 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc185) + %tmp23_32 = arith.truncf %tmp23_31 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc185) + %tmp23_33 = tt.load %tmp23_29, %r0_mask_14, %tmp23_32 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc185) + %tmp23_34 = arith.extf %tmp23_33 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc186) + %tmp27 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc187) + %tmp27_35 = tt.addptr %tmp27, %r0_index_13 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc187) + %tmp27_36 = arith.constant 0.000000e+00 : f32 loc(#loc188) + %tmp27_37 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc188) + %tmp27_38 = arith.truncf %tmp27_37 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc188) + %tmp27_39 = tt.load %tmp27_35, %r0_mask_14, %tmp27_38 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc188) + %tmp27_40 = arith.extf %tmp27_39 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc189) + %tmp15 = tt.broadcast %tmp7 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc190) + %tmp15_41 = arith.subf %tmp13_28, %tmp15 : tensor<1x4096xf32> loc(#loc190) + %tmp16 = arith.constant 4.096000e+03 : f32 loc(#loc191) + %tmp17 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc192) + %tmp17_42 = arith.divf %tmp11, %tmp17 : tensor<1x1xf32> loc(#loc192) + %tmp18 = arith.constant 9.99999997E-7 : f32 loc(#loc193) + %tmp19 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc194) + %tmp19_43 = arith.addf %tmp17_42, %tmp19 : tensor<1x1xf32> loc(#loc194) + %tmp20 = tt.extern_elementwise %tmp19_43 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc195) + %tmp21 = tt.broadcast %tmp20 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc196) + %tmp21_44 = arith.mulf %tmp15_41, %tmp21 : tensor<1x4096xf32> loc(#loc196) + %tmp24 = arith.constant 1.000000e+00 : f32 loc(#loc197) + %tmp25 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc198) + %tmp25_45 = arith.addf %tmp23_34, %tmp25 : tensor<1x4096xf32> loc(#loc198) + %tmp26 = arith.mulf %tmp21_44, %tmp25_45 : tensor<1x4096xf32> loc(#loc199) + %tmp28 = arith.addf %tmp26, %tmp27_40 : tensor<1x4096xf32> loc(#loc200) + %c4096_i32_46 = arith.constant 4096 : i32 loc(#loc78) + %c4096_i32_47 = arith.constant 4096 : i32 loc(#loc78) + %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc78) + %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc78) + %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc79) + %11 = arith.addi %r0_index_13, %10 : tensor<1x4096xi32> loc(#loc79) + %12 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc80) + %13 = tt.addptr %12, %11 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc80) + %14 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc81) + %15 = arith.andi %r0_mask_14, %14 : tensor<1x4096xi1> loc(#loc81) + %16 = arith.truncf %tmp28 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc82) + tt.store %13, %16, %15 : tensor<1x4096x!tt.ptr> loc(#loc82) + } loc(#loc52) + tt.return loc(#loc83) + } loc(#loc) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() -> tensor<1x4096xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc85) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc85) + tt.return %cst_0 : tensor<1x4096xf32> loc(#loc86) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x4096xf32> loc(#loc87) + tt.return %0 : tensor<1x4096xf32> loc(#loc87) + } loc(#loc84) + tt.func private @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_u1__(%new_mean: tensor<1x4096xf32> loc("new_mean"(#loc201)), %mean: tensor<1x4096xf32> loc("mean"(#loc88)), %m2: tensor<1x4096xf32> loc("m2"(#loc88)), %weight: tensor<1x4096xf32> loc("weight"(#loc88)), %first_iteration: i1 loc("first_iteration"(#loc88))) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) attributes {noinline = false} { + %0:3 = scf.if %first_iteration -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) { + %new_weight = arith.constant 1.000000e+00 : f32 loc(#loc206) + %new_weight_0 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc232) + %new_m2 = tt.call @triton.language.standard.zeros_like__fp32S1_4096S__(%m2) : (tensor<1x4096xf32>) -> tensor<1x4096xf32> loc(#loc233) + scf.yield %new_m2, %new_mean, %new_weight_0 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc233) + } else { + %delta = arith.subf %new_mean, %mean : tensor<1x4096xf32> loc(#loc208) + %new_weight = arith.constant 1 : i32 loc(#loc209) + %new_weight_0 = arith.constant 1.000000e+00 : f32 loc(#loc209) + %new_weight_1 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc209) + %new_weight_2 = arith.addf %weight, %new_weight_1 : tensor<1x4096xf32> loc(#loc234) + %new_mean_3 = arith.divf %delta, %new_weight_2 : tensor<1x4096xf32> loc(#loc210) + %new_mean_4 = arith.addf %mean, %new_mean_3 : tensor<1x4096xf32> loc(#loc235) + %new_m2 = arith.subf %new_mean, %new_mean_4 : tensor<1x4096xf32> loc(#loc212) + %new_m2_5 = arith.mulf %delta, %new_m2 : tensor<1x4096xf32> loc(#loc213) + %new_m2_6 = arith.addf %m2, %new_m2_5 : tensor<1x4096xf32> loc(#loc236) + scf.yield %new_m2_6, %new_mean_4, %new_weight_2 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc214) + } loc(#loc89) + tt.return %0#1, %0#0, %0#2 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc99) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x4096xf32> loc(#loc100) + %2 = ub.poison : tensor<1x4096xf32> loc(#loc100) + %3 = ub.poison : tensor<1x4096xf32> loc(#loc100) + tt.return %1, %2, %3 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc100) + } loc(#loc88) + tt.func private @triton.language.standard.zeros_like__fp32S1_4096S__(%input: tensor<1x4096xf32> loc("input"(#loc101))) -> tensor<1x4096xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc102) + tt.return %0 : tensor<1x4096xf32> loc(#loc103) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x4096xf32> loc(#loc104) + tt.return %1 : tensor<1x4096xf32> loc(#loc104) + } loc(#loc101) + tt.func private @"torch._inductor.runtime.triton_helpers.welford__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S__(3,)cconstexpr_1_"(%mean: tensor<1x4096xf32> loc("mean"(#loc105)), %m2: tensor<1x4096xf32> loc("m2"(#loc105)), %weight: tensor<1x4096xf32> loc("weight"(#loc105))) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} { + %0:3 = "tt.reduce"(%mean, %m2, %weight) <{axis = 1 : i32}> ({ + ^bb0(%arg3: f32 loc(unknown), %arg4: f32 loc(unknown), %arg5: f32 loc(unknown), %arg6: f32 loc(unknown), %arg7: f32 loc(unknown), %arg8: f32 loc(unknown)): + %4:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (f32, f32, f32, f32, f32, f32) -> (f32, f32, f32) loc(#loc106) + tt.reduce.return %4#0, %4#1, %4#2 : f32, f32, f32 loc(#loc106) + }) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc106) + tt.return %0#0, %0#1, %0#2 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc108) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc109) + %2 = ub.poison : tensor<1xf32> loc(#loc109) + %3 = ub.poison : tensor<1xf32> loc(#loc109) + tt.return %1, %2, %3 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc109) + } loc(#loc105) + tt.func private @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%mean_1: f32 loc("mean_1"(#loc110)), %m2_1: f32 loc("m2_1"(#loc110)), %weight_1: f32 loc("weight_1"(#loc110)), %mean_2: f32 loc("mean_2"(#loc110)), %m2_2: f32 loc("m2_2"(#loc110)), %weight_2: f32 loc("weight_2"(#loc110))) -> (f32, f32, f32) attributes {noinline = false} { + %delta = arith.subf %mean_2, %mean_1 : f32 loc(#loc225) + %new_weight = arith.addf %weight_1, %weight_2 : f32 loc(#loc226) + %w2_over_w = arith.constant 0.000000e+00 : f32 loc(#loc227) + %w2_over_w_0 = arith.cmpf oeq, %new_weight, %w2_over_w : f32 loc(#loc227) + %w2_over_w_1 = arith.divf %weight_2, %new_weight : f32 loc(#loc228) + %w2_over_w_2 = arith.constant 0.000000e+00 : f32 loc(#loc229) + %w2_over_w_3 = arith.constant 0.000000e+00 : f32 loc(#loc229) + %w2_over_w_4 = arith.select %w2_over_w_0, %w2_over_w_3, %w2_over_w_1 : f32 loc(#loc229) + %0 = arith.mulf %delta, %w2_over_w_4 : f32 loc(#loc116) + %1 = arith.addf %mean_1, %0 : f32 loc(#loc117) + %2 = arith.addf %m2_1, %m2_2 : f32 loc(#loc118) + %3 = arith.mulf %delta, %delta : f32 loc(#loc119) + %4 = arith.mulf %3, %weight_1 : f32 loc(#loc120) + %5 = arith.mulf %4, %w2_over_w_4 : f32 loc(#loc121) + %6 = arith.addf %2, %5 : f32 loc(#loc122) + tt.return %1, %6, %new_weight : f32, f32, f32 loc(#loc123) + ^bb1: // no predecessors + %7 = ub.poison : f32 loc(#loc124) + %8 = ub.poison : f32 loc(#loc124) + %9 = ub.poison : f32 loc(#loc124) + tt.return %7, %8, %9 : f32, f32, f32 loc(#loc124) + } loc(#loc110) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":25:21) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":29:45) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":30:43) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":31:47) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:46) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:34) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:61) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:51) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:113) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:34) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:41) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:94) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:46) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:41) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:34) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:61) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:51) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:113) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":41:22) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":42:22) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":46:62) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":46:51) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":48:39) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":48:62) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":49:37) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":49:58) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":50:41) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":50:66) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:41) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:36) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:29) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:62) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:52) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:8) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":52:80) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":53:16) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":54:17) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":55:18) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":56:43) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":57:31) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":58:29) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:48) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:43) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:36) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:63) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:53) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:115) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:35) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:42) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:95) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:35) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:42) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:95) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":66:24) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":67:16) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":68:25) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":69:16) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":70:24) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":71:32) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":72:24) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":74:16) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":75:24) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":76:24) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":77:24) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:41) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:36) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:29) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:63) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:53) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":56:4) +#loc84 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":120:0) +#loc85 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:31) +#loc86 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:11) +#loc87 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:4) +#loc89 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7) +#loc90 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":218:46) +#loc91 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31) +#loc92 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24) +#loc93 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30) +#loc94 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34) +#loc95 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26) +#loc96 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39) +#loc97 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31) +#loc98 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22) +#loc99 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:11) +#loc100 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:4) +#loc102 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:30) +#loc103 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:11) +#loc104 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:4) +#loc106 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc108 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:11) +#loc109 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:4) +#loc111 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc112 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc113 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc114 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc115 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc116 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc117 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc118 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc119 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc120 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc121 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc122 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc123 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:11) +#loc124 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:4) +#loc134 = loc("xnumel"(#loc1)) +#loc135 = loc("r0_numel"(#loc2)) +#loc136 = loc("xoffset"(#loc3)) +#loc137 = loc("xoffset"(#loc4)) +#loc138 = loc("xindex"(#loc5)) +#loc139 = loc("xindex"(#loc6)) +#loc140 = loc("xindex"(#loc7)) +#loc141 = loc("xmask"(#loc8)) +#loc142 = loc("r0_base"(#loc9)) +#loc143 = loc("r0_base"(#loc10)) +#loc144 = loc("tmp7_mean"(#loc11)) +#loc145 = loc("tmp7_m2"(#loc12)) +#loc146 = loc("tmp7_weight"(#loc13)) +#loc147 = loc("tmp7_mean"(#loc14)) +#loc148 = loc("r0_index"(#loc15)) +#loc149 = loc("r0_mask"(#loc16)) +#loc150 = loc("tmp0"(#loc17)) +#loc151 = loc("tmp0"(#loc18)) +#loc152 = loc("tmp0"(#loc19)) +#loc153 = loc("tmp0"(#loc20)) +#loc154 = loc("tmp0"(#loc21)) +#loc155 = loc("tmp0"(#loc22)) +#loc156 = loc("tmp1"(#loc23)) +#loc157 = loc("tmp1"(#loc24)) +#loc158 = loc("tmp1"(#loc25)) +#loc159 = loc("tmp2"(#loc26)) +#loc160 = loc("tmp2"(#loc27)) +#loc161 = loc("tmp2"(#loc28)) +#loc162 = loc("tmp2"(#loc29)) +#loc163 = loc("tmp2"(#loc30)) +#loc164 = loc("tmp2"(#loc31)) +#loc165 = loc("tmp3"(#loc32)) +#loc166 = loc("tmp4"(#loc33)) +#loc167 = loc("tmp7_mean"(#loc36)) +#loc168 = loc("tmp7_mean"(#loc37)) +#loc169 = loc("tmp7_m2"(#loc38)) +#loc170 = loc("tmp7_m2"(#loc39)) +#loc171 = loc("tmp7_weight"(#loc40)) +#loc172 = loc("tmp7_weight"(#loc41)) +#loc173 = loc("tmp7"(#loc49)) +#loc174 = loc("tmp11"(#loc50)) +#loc175 = loc("tmp12"(#loc51)) +#loc176 = loc("r0_index"(#loc53)) +#loc177 = loc("r0_mask"(#loc54)) +#loc178 = loc("tmp13"(#loc55)) +#loc179 = loc("tmp13"(#loc56)) +#loc180 = loc("tmp13"(#loc57)) +#loc181 = loc("tmp13"(#loc58)) +#loc182 = loc("tmp13"(#loc59)) +#loc183 = loc("tmp13"(#loc60)) +#loc184 = loc("tmp23"(#loc61)) +#loc185 = loc("tmp23"(#loc62)) +#loc186 = loc("tmp23"(#loc63)) +#loc187 = loc("tmp27"(#loc64)) +#loc188 = loc("tmp27"(#loc65)) +#loc189 = loc("tmp27"(#loc66)) +#loc190 = loc("tmp15"(#loc67)) +#loc191 = loc("tmp16"(#loc68)) +#loc192 = loc("tmp17"(#loc69)) +#loc193 = loc("tmp18"(#loc70)) +#loc194 = loc("tmp19"(#loc71)) +#loc195 = loc("tmp20"(#loc72)) +#loc196 = loc("tmp21"(#loc73)) +#loc197 = loc("tmp24"(#loc74)) +#loc198 = loc("tmp25"(#loc75)) +#loc199 = loc("tmp26"(#loc76)) +#loc200 = loc("tmp28"(#loc77)) +#loc206 = loc("new_weight"(#loc90)) +#loc207 = loc("new_m2"(#loc91)) +#loc208 = loc("delta"(#loc92)) +#loc209 = loc("new_weight"(#loc93)) +#loc210 = loc("new_mean"(#loc94)) +#loc211 = loc("new_mean"(#loc95)) +#loc212 = loc("new_m2"(#loc96)) +#loc213 = loc("new_m2"(#loc97)) +#loc214 = loc("new_m2"(#loc98)) +#loc225 = loc("delta"(#loc111)) +#loc226 = loc("new_weight"(#loc112)) +#loc227 = loc("w2_over_w"(#loc113)) +#loc228 = loc("w2_over_w"(#loc114)) +#loc229 = loc("w2_over_w"(#loc115)) +#loc230 = loc("tmp7_m2"(#loc147)) +#loc232 = loc("new_weight"(#loc206)) +#loc233 = loc("new_m2"(#loc207)) +#loc234 = loc("new_weight"(#loc209)) +#loc235 = loc("new_mean"(#loc211)) +#loc236 = loc("new_m2"(#loc214)) +#loc237 = loc("tmp7_weight"(#loc230)) diff --git a/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..70f0de11d71b911cb835e273e5d7d12e26cbb0e5 --- /dev/null +++ b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir @@ -0,0 +1,214 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":18:0) +#loc1 = loc(unknown) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":52:80) +#loc59 = loc("in_ptr0"(#loc)) +#loc60 = loc("in_ptr1"(#loc)) +#loc61 = loc("in_ptr2"(#loc)) +#loc62 = loc("in_ptr3"(#loc)) +#loc63 = loc("in_ptr4"(#loc)) +#loc64 = loc("out_ptr0"(#loc)) +#loc65 = loc("out_ptr3"(#loc)) +#loc66 = loc("xnumel"(#loc)) +#loc67 = loc("r0_numel"(#loc)) +#loc89 = loc(callsite(#loc1 at #loc25)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4096> : tensor<1x4096xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xbf16, #blocked> loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c256_i32 = arith.constant 256 : i32 loc(#loc1) + %cst_1 = arith.constant 0.000000e+00 : f32 loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<4.096000e+03> : tensor<1x1xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc68) + %xmask = arith.cmpi slt, %xoffset, %c256_i32 : i32 loc(#loc69) + %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc70) + %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4096xi32, #blocked> loc(#loc70) + %r0_mask = arith.cmpi slt, %r0_base_6, %cst : tensor<1x4096xi32, #blocked> loc(#loc71) + %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc72) + %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x4096xi32, #blocked> loc(#loc113) + %tmp0_8 = arith.addi %r0_base_6, %tmp0_7 : tensor<1x4096xi32, #blocked> loc(#loc73) + %tmp0_9 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc74) + %tmp0_10 = tt.addptr %tmp0_9, %tmp0_8 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc74) + %tmp0_11 = tt.splat %xmask : i1 -> tensor<1x4096xi1, #blocked> loc(#loc114) + %tmp0_12 = arith.andi %r0_mask, %tmp0_11 : tensor<1x4096xi1, #blocked> loc(#loc75) + %tmp0_13 = tt.load %tmp0_10, %tmp0_12, %cst_0 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr, #blocked> loc(#loc76) + %tmp0_14 = arith.extf %tmp0_13 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc77) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc78) + %tmp1_15 = tt.addptr %tmp1, %r0_base_6 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc78) + %tmp1_16 = tt.load %tmp1_15, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr, #blocked> loc(#loc79) + %tmp1_17 = arith.extf %tmp1_16 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc80) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc81) + %tmp2_18 = tt.addptr %tmp2, %tmp0_8 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc81) + %tmp2_19 = tt.load %tmp2_18, %tmp0_12, %cst_0 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr, #blocked> loc(#loc82) + %tmp2_20 = arith.extf %tmp2_19 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc83) + %tmp3 = arith.mulf %tmp1_17, %tmp2_20 : tensor<1x4096xf32, #blocked> loc(#loc84) + %tmp4 = arith.addf %tmp0_14, %tmp3 : tensor<1x4096xf32, #blocked> loc(#loc85) + %tmp7_mean = arith.select %tmp0_12, %tmp4, %cst_2 : tensor<1x4096xi1, #blocked>, tensor<1x4096xf32, #blocked> loc(#loc86) + %tmp7_weight = arith.select %tmp0_12, %cst_3, %cst_2 : tensor<1x4096xi1, #blocked>, tensor<1x4096xf32, #blocked> loc(#loc87) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc22) + %1 = tt.addptr %0, %tmp0_8 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc22) + %2 = arith.truncf %tmp4 : tensor<1x4096xf32, #blocked> to tensor<1x4096xbf16, #blocked> loc(#loc23) + tt.store %1, %2, %tmp0_12 : tensor<1x4096x!tt.ptr, #blocked> loc(#loc23) + %3:3 = "tt.reduce"(%tmp7_mean, %cst_2, %tmp7_weight) <{axis = 1 : i32}> ({ + ^bb0(%arg9: f32 loc(callsite(#loc1 at #loc25)), %arg10: f32 loc(callsite(#loc1 at #loc25)), %arg11: f32 loc(callsite(#loc1 at #loc25)), %arg12: f32 loc(callsite(#loc1 at #loc25)), %arg13: f32 loc(callsite(#loc1 at #loc25)), %arg14: f32 loc(callsite(#loc1 at #loc25))): + %delta = arith.subf %arg12, %arg9 : f32 loc(#loc115) + %new_weight = arith.addf %arg11, %arg14 : f32 loc(#loc116) + %w2_over_w = arith.cmpf oeq, %new_weight, %cst_1 : f32 loc(#loc117) + %w2_over_w_30 = arith.divf %arg14, %new_weight : f32 loc(#loc118) + %w2_over_w_31 = arith.select %w2_over_w, %cst_1, %w2_over_w_30 : f32 loc(#loc119) + %7 = arith.mulf %delta, %w2_over_w_31 : f32 loc(#loc120) + %8 = arith.addf %arg9, %7 : f32 loc(#loc121) + %9 = arith.addf %arg10, %arg13 : f32 loc(#loc122) + %10 = arith.mulf %delta, %delta : f32 loc(#loc123) + %11 = arith.mulf %10, %arg11 : f32 loc(#loc124) + %12 = arith.mulf %11, %w2_over_w_31 : f32 loc(#loc125) + %13 = arith.addf %9, %12 : f32 loc(#loc126) + tt.reduce.return %8, %13, %new_weight : f32, f32, f32 loc(#loc88) + }) : (tensor<1x4096xf32, #blocked>, tensor<1x4096xf32, #blocked>, tensor<1x4096xf32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc88) + %tmp7 = tt.expand_dims %3#0 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc95) + %tmp11 = tt.expand_dims %3#1 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc96) + %tmp13 = tt.load %1, %tmp0_12, %cst_0 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr, #blocked> loc(#loc97) + %tmp13_21 = arith.extf %tmp13 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc98) + %tmp23 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc99) + %tmp23_22 = tt.addptr %tmp23, %r0_base_6 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc99) + %tmp23_23 = tt.load %tmp23_22, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr, #blocked> loc(#loc100) + %tmp23_24 = arith.extf %tmp23_23 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc101) + %tmp27 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc102) + %tmp27_25 = tt.addptr %tmp27, %r0_base_6 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc102) + %tmp27_26 = tt.load %tmp27_25, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr, #blocked> loc(#loc103) + %tmp27_27 = arith.extf %tmp27_26 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc104) + %tmp15 = tt.broadcast %tmp7 : tensor<1x1xf32, #blocked> -> tensor<1x4096xf32, #blocked> loc(#loc105) + %tmp15_28 = arith.subf %tmp13_21, %tmp15 : tensor<1x4096xf32, #blocked> loc(#loc105) + %tmp17 = arith.divf %tmp11, %cst_5 : tensor<1x1xf32, #blocked> loc(#loc106) + %tmp19 = arith.addf %tmp17, %cst_4 : tensor<1x1xf32, #blocked> loc(#loc107) + %tmp20 = tt.extern_elementwise %tmp19 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked>) -> tensor<1x1xf32, #blocked> loc(#loc108) + %tmp21 = tt.broadcast %tmp20 : tensor<1x1xf32, #blocked> -> tensor<1x4096xf32, #blocked> loc(#loc109) + %tmp21_29 = arith.mulf %tmp15_28, %tmp21 : tensor<1x4096xf32, #blocked> loc(#loc109) + %tmp25 = arith.addf %tmp23_24, %cst_3 : tensor<1x4096xf32, #blocked> loc(#loc110) + %tmp26 = arith.mulf %tmp21_29, %tmp25 : tensor<1x4096xf32, #blocked> loc(#loc111) + %tmp28 = arith.addf %tmp26, %tmp27_27 : tensor<1x4096xf32, #blocked> loc(#loc112) + %4 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc56) + %5 = tt.addptr %4, %tmp0_8 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc56) + %6 = arith.truncf %tmp28 : tensor<1x4096xf32, #blocked> to tensor<1x4096xbf16, #blocked> loc(#loc57) + tt.store %5, %6, %tmp0_12 : tensor<1x4096x!tt.ptr, #blocked> loc(#loc57) + tt.return loc(#loc58) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":25:21) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":26:37) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":34:29) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:46) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:41) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:34) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:61) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:51) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:113) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:34) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:41) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:94) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:34) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:51) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:113) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":41:22) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":42:22) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":48:62) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":50:66) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:29) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:52) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc28 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc29 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc30 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc35 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc37 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":53:16) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":54:17) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:53) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:115) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:35) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:42) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:95) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:35) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:42) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:95) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":66:24) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":68:25) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":70:24) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":71:32) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":72:24) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":75:24) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":76:24) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":77:24) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:29) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:53) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":56:4) +#loc68 = loc("xoffset"(#loc2)) +#loc69 = loc("xmask"(#loc3)) +#loc70 = loc("r0_base"(#loc4)) +#loc71 = loc("r0_mask"(#loc5)) +#loc72 = loc("tmp0"(#loc6)) +#loc73 = loc("tmp0"(#loc7)) +#loc74 = loc("tmp0"(#loc8)) +#loc75 = loc("tmp0"(#loc9)) +#loc76 = loc("tmp0"(#loc10)) +#loc77 = loc("tmp0"(#loc11)) +#loc78 = loc("tmp1"(#loc12)) +#loc79 = loc("tmp1"(#loc13)) +#loc80 = loc("tmp1"(#loc14)) +#loc81 = loc("tmp2"(#loc15)) +#loc82 = loc("tmp2"(#loc16)) +#loc83 = loc("tmp2"(#loc17)) +#loc84 = loc("tmp3"(#loc18)) +#loc85 = loc("tmp4"(#loc19)) +#loc86 = loc("tmp7_mean"(#loc20)) +#loc87 = loc("tmp7_weight"(#loc21)) +#loc88 = loc(callsite(#loc24 at #loc25)) +#loc90 = loc("delta"(#loc26)) +#loc91 = loc("new_weight"(#loc27)) +#loc92 = loc("w2_over_w"(#loc28)) +#loc93 = loc("w2_over_w"(#loc29)) +#loc94 = loc("w2_over_w"(#loc30)) +#loc95 = loc("tmp7"(#loc38)) +#loc96 = loc("tmp11"(#loc39)) +#loc97 = loc("tmp13"(#loc40)) +#loc98 = loc("tmp13"(#loc41)) +#loc99 = loc("tmp23"(#loc42)) +#loc100 = loc("tmp23"(#loc43)) +#loc101 = loc("tmp23"(#loc44)) +#loc102 = loc("tmp27"(#loc45)) +#loc103 = loc("tmp27"(#loc46)) +#loc104 = loc("tmp27"(#loc47)) +#loc105 = loc("tmp15"(#loc48)) +#loc106 = loc("tmp17"(#loc49)) +#loc107 = loc("tmp19"(#loc50)) +#loc108 = loc("tmp20"(#loc51)) +#loc109 = loc("tmp21"(#loc52)) +#loc110 = loc("tmp25"(#loc53)) +#loc111 = loc("tmp26"(#loc54)) +#loc112 = loc("tmp28"(#loc55)) +#loc113 = loc(fused[#loc73, #loc72]) +#loc114 = loc(fused[#loc75, #loc69]) +#loc115 = loc(callsite(#loc90 at #loc88)) +#loc116 = loc(callsite(#loc91 at #loc88)) +#loc117 = loc(callsite(#loc92 at #loc88)) +#loc118 = loc(callsite(#loc93 at #loc88)) +#loc119 = loc(callsite(#loc94 at #loc88)) +#loc120 = loc(callsite(#loc31 at #loc88)) +#loc121 = loc(callsite(#loc32 at #loc88)) +#loc122 = loc(callsite(#loc33 at #loc88)) +#loc123 = loc(callsite(#loc34 at #loc88)) +#loc124 = loc(callsite(#loc35 at #loc88)) +#loc125 = loc(callsite(#loc36 at #loc88)) +#loc126 = loc(callsite(#loc37 at #loc88)) diff --git a/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.ttir b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..78ff99218f52da14d1eb8bff343a9096cf0e656f --- /dev/null +++ b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.ttir @@ -0,0 +1,215 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":18:0) +#loc1 = loc(unknown) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":52:80) +#loc60 = loc("in_ptr0"(#loc)) +#loc61 = loc("in_ptr1"(#loc)) +#loc62 = loc("in_ptr2"(#loc)) +#loc63 = loc("in_ptr3"(#loc)) +#loc64 = loc("in_ptr4"(#loc)) +#loc65 = loc("out_ptr0"(#loc)) +#loc66 = loc("out_ptr3"(#loc)) +#loc67 = loc("xnumel"(#loc)) +#loc68 = loc("r0_numel"(#loc)) +#loc70 = loc(callsite(#loc1 at #loc3)) +module { + tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %xmask = arith.constant 256 : i32 loc(#loc69) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc70) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x4096xbf16> loc(#loc1) + %cst_2 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc1) + %cst_3 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc1) + %cst_4 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc1) + %cst_5 = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc71) + %xmask_6 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc69) + %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32> loc(#loc72) + %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32> -> tensor<1x4096xi32> loc(#loc73) + %r0_mask = arith.cmpi slt, %r0_base_7, %cst_5 : tensor<1x4096xi32> loc(#loc74) + %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc75) + %tmp0_8 = tt.splat %tmp0 : i32 -> tensor<1x4096xi32> loc(#loc115) + %tmp0_9 = arith.addi %r0_base_7, %tmp0_8 : tensor<1x4096xi32> loc(#loc76) + %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc77) + %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc77) + %tmp0_12 = tt.splat %xmask_6 : i1 -> tensor<1x4096xi1> loc(#loc116) + %tmp0_13 = arith.andi %r0_mask, %tmp0_12 : tensor<1x4096xi1> loc(#loc78) + %tmp0_14 = tt.load %tmp0_11, %tmp0_13, %cst_1 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr> loc(#loc79) + %tmp0_15 = arith.extf %tmp0_14 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc80) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc81) + %tmp1_16 = tt.addptr %tmp1, %r0_base_7 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc81) + %tmp1_17 = tt.load %tmp1_16, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc82) + %tmp1_18 = arith.extf %tmp1_17 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc83) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc84) + %tmp2_19 = tt.addptr %tmp2, %tmp0_9 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc84) + %tmp2_20 = tt.load %tmp2_19, %tmp0_13, %cst_1 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr> loc(#loc85) + %tmp2_21 = arith.extf %tmp2_20 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc86) + %tmp3 = arith.mulf %tmp1_18, %tmp2_21 : tensor<1x4096xf32> loc(#loc87) + %tmp4 = arith.addf %tmp0_15, %tmp3 : tensor<1x4096xf32> loc(#loc88) + %tmp7_mean = arith.select %tmp0_13, %tmp4, %cst_0 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc89) + %tmp7_weight = arith.select %tmp0_13, %cst_2, %cst_0 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc90) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc24) + %1 = tt.addptr %0, %tmp0_9 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc24) + %2 = arith.truncf %tmp4 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc25) + tt.store %1, %2, %tmp0_13 : tensor<1x4096x!tt.ptr> loc(#loc25) + %3:3 = "tt.reduce"(%tmp7_mean, %cst_0, %tmp7_weight) <{axis = 1 : i32}> ({ + ^bb0(%arg9: f32 loc(callsite(#loc1 at #loc3)), %arg10: f32 loc(callsite(#loc1 at #loc3)), %arg11: f32 loc(callsite(#loc1 at #loc3)), %arg12: f32 loc(callsite(#loc1 at #loc3)), %arg13: f32 loc(callsite(#loc1 at #loc3)), %arg14: f32 loc(callsite(#loc1 at #loc3))): + %delta = arith.subf %arg12, %arg9 : f32 loc(#loc117) + %new_weight = arith.addf %arg11, %arg14 : f32 loc(#loc118) + %w2_over_w = arith.cmpf oeq, %new_weight, %cst : f32 loc(#loc119) + %w2_over_w_31 = arith.divf %arg14, %new_weight : f32 loc(#loc120) + %w2_over_w_32 = arith.select %w2_over_w, %cst, %w2_over_w_31 : f32 loc(#loc121) + %7 = arith.mulf %delta, %w2_over_w_32 : f32 loc(#loc122) + %8 = arith.addf %arg9, %7 : f32 loc(#loc123) + %9 = arith.addf %arg10, %arg13 : f32 loc(#loc124) + %10 = arith.mulf %delta, %delta : f32 loc(#loc125) + %11 = arith.mulf %10, %arg11 : f32 loc(#loc126) + %12 = arith.mulf %11, %w2_over_w_32 : f32 loc(#loc127) + %13 = arith.addf %9, %12 : f32 loc(#loc128) + tt.reduce.return %8, %13, %new_weight : f32, f32, f32 loc(#loc91) + }) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc91) + %tmp7 = tt.expand_dims %3#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc97) + %tmp11 = tt.expand_dims %3#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc98) + %tmp13 = tt.load %1, %tmp0_13, %cst_1 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr> loc(#loc99) + %tmp13_22 = arith.extf %tmp13 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc100) + %tmp23 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc101) + %tmp23_23 = tt.addptr %tmp23, %r0_base_7 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc101) + %tmp23_24 = tt.load %tmp23_23, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc102) + %tmp23_25 = arith.extf %tmp23_24 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc103) + %tmp27 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc104) + %tmp27_26 = tt.addptr %tmp27, %r0_base_7 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc104) + %tmp27_27 = tt.load %tmp27_26, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc105) + %tmp27_28 = arith.extf %tmp27_27 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc106) + %tmp15 = tt.broadcast %tmp7 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc107) + %tmp15_29 = arith.subf %tmp13_22, %tmp15 : tensor<1x4096xf32> loc(#loc107) + %tmp17 = arith.divf %tmp11, %cst_4 : tensor<1x1xf32> loc(#loc108) + %tmp19 = arith.addf %tmp17, %cst_3 : tensor<1x1xf32> loc(#loc109) + %tmp20 = tt.extern_elementwise %tmp19 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc110) + %tmp21 = tt.broadcast %tmp20 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc111) + %tmp21_30 = arith.mulf %tmp15_29, %tmp21 : tensor<1x4096xf32> loc(#loc111) + %tmp25 = arith.addf %tmp23_25, %cst_2 : tensor<1x4096xf32> loc(#loc112) + %tmp26 = arith.mulf %tmp21_30, %tmp25 : tensor<1x4096xf32> loc(#loc113) + %tmp28 = arith.addf %tmp26, %tmp27_28 : tensor<1x4096xf32> loc(#loc114) + %4 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc57) + %5 = tt.addptr %4, %tmp0_9 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc57) + %6 = arith.truncf %tmp28 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc58) + tt.store %5, %6, %tmp0_13 : tensor<1x4096x!tt.ptr> loc(#loc58) + tt.return loc(#loc59) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":25:21) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":23:28) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":26:27) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":34:29) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:41) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:34) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:61) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:51) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:113) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:34) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:41) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:94) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:34) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:51) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:113) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":41:22) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":42:22) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":48:62) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":50:66) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:29) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:52) +#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc28 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc29 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc30 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc35 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc37 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc38 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":53:16) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":54:17) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:53) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:115) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:35) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:42) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:95) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:35) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:42) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:95) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":66:24) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":68:25) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":70:24) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":71:32) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":72:24) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":75:24) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":76:24) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":77:24) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:29) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:53) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":56:4) +#loc69 = loc("xmask"(#loc2)) +#loc71 = loc("xoffset"(#loc4)) +#loc72 = loc("r0_base"(#loc5)) +#loc73 = loc("r0_base"(#loc6)) +#loc74 = loc("r0_mask"(#loc7)) +#loc75 = loc("tmp0"(#loc8)) +#loc76 = loc("tmp0"(#loc9)) +#loc77 = loc("tmp0"(#loc10)) +#loc78 = loc("tmp0"(#loc11)) +#loc79 = loc("tmp0"(#loc12)) +#loc80 = loc("tmp0"(#loc13)) +#loc81 = loc("tmp1"(#loc14)) +#loc82 = loc("tmp1"(#loc15)) +#loc83 = loc("tmp1"(#loc16)) +#loc84 = loc("tmp2"(#loc17)) +#loc85 = loc("tmp2"(#loc18)) +#loc86 = loc("tmp2"(#loc19)) +#loc87 = loc("tmp3"(#loc20)) +#loc88 = loc("tmp4"(#loc21)) +#loc89 = loc("tmp7_mean"(#loc22)) +#loc90 = loc("tmp7_weight"(#loc23)) +#loc91 = loc(callsite(#loc26 at #loc3)) +#loc92 = loc("delta"(#loc27)) +#loc93 = loc("new_weight"(#loc28)) +#loc94 = loc("w2_over_w"(#loc29)) +#loc95 = loc("w2_over_w"(#loc30)) +#loc96 = loc("w2_over_w"(#loc31)) +#loc97 = loc("tmp7"(#loc39)) +#loc98 = loc("tmp11"(#loc40)) +#loc99 = loc("tmp13"(#loc41)) +#loc100 = loc("tmp13"(#loc42)) +#loc101 = loc("tmp23"(#loc43)) +#loc102 = loc("tmp23"(#loc44)) +#loc103 = loc("tmp23"(#loc45)) +#loc104 = loc("tmp27"(#loc46)) +#loc105 = loc("tmp27"(#loc47)) +#loc106 = loc("tmp27"(#loc48)) +#loc107 = loc("tmp15"(#loc49)) +#loc108 = loc("tmp17"(#loc50)) +#loc109 = loc("tmp19"(#loc51)) +#loc110 = loc("tmp20"(#loc52)) +#loc111 = loc("tmp21"(#loc53)) +#loc112 = loc("tmp25"(#loc54)) +#loc113 = loc("tmp26"(#loc55)) +#loc114 = loc("tmp28"(#loc56)) +#loc115 = loc(fused[#loc76, #loc75]) +#loc116 = loc(fused[#loc78, #loc69]) +#loc117 = loc(callsite(#loc92 at #loc91)) +#loc118 = loc(callsite(#loc93 at #loc91)) +#loc119 = loc(callsite(#loc94 at #loc91)) +#loc120 = loc(callsite(#loc95 at #loc91)) +#loc121 = loc(callsite(#loc96 at #loc91)) +#loc122 = loc(callsite(#loc32 at #loc91)) +#loc123 = loc(callsite(#loc33 at #loc91)) +#loc124 = loc(callsite(#loc34 at #loc91)) +#loc125 = loc(callsite(#loc35 at #loc91)) +#loc126 = loc(callsite(#loc36 at #loc91)) +#loc127 = loc(callsite(#loc37 at #loc91)) +#loc128 = loc(callsite(#loc38 at #loc91)) diff --git a/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/__grp__triton_poi_fused_cat_view_4.json b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/__grp__triton_poi_fused_cat_view_4.json new file mode 100644 index 0000000000000000000000000000000000000000..535fea929e6f108ff12a2b7229f6032435f2d312 --- /dev/null +++ b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/__grp__triton_poi_fused_cat_view_4.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_cat_view_4.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.source", "triton_poi_fused_cat_view_4.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.ttir", "triton_poi_fused_cat_view_4.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.ttgir", "triton_poi_fused_cat_view_4.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.llir", "triton_poi_fused_cat_view_4.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.ptx", "triton_poi_fused_cat_view_4.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.cubin", "triton_poi_fused_cat_view_4.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.json"}} \ No newline at end of file diff --git a/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.cubin b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.cubin new file mode 100644 index 0000000000000000000000000000000000000000..be3aa0a43b09568d5bcbcc9b3a8b0e4c7481840f Binary files /dev/null and b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.cubin differ diff --git a/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.json b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1d554bf0116ae62fb21c0786ef83bb947682d328 --- /dev/null +++ b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.json @@ -0,0 +1 @@ +{"hash": "7fa39ebc053374648fb4e98226c5501eba23260cf80ea4c76849e04e26c0a273", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_cat_view_4"} \ No newline at end of file diff --git a/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.llir b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.llir new file mode 100644 index 0000000000000000000000000000000000000000..09b1ccdff4ed4f4792a9c1c5857d337b2e3f1f2a --- /dev/null +++ b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.llir @@ -0,0 +1,78 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_cat_view_4(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = shl i32 %7, 9, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = shl nuw nsw i32 %9, 1, !dbg !9 + %11 = and i32 %10, 510, !dbg !9 + %12 = or disjoint i32 %11, %8, !dbg !10 + %13 = sdiv i32 %12, 4096, !dbg !11 + %14 = icmp slt i32 %12, 1048576, !dbg !12 + %15 = shl i32 %13, 13, !dbg !13 + %16 = add i32 %15, %12, !dbg !13 + %17 = sext i32 %16 to i64, !dbg !14 + %18 = getelementptr bfloat, ptr addrspace(1) %0, i64 %17, !dbg !14 + %19 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$3 ld.global.b32 { $0 }, [ $2 + 0 ];", "=r,r,l,b"(i32 0, ptr addrspace(1) %18, i1 %14) #2, !dbg !15 + %20 = bitcast i32 %19 to <2 x bfloat>, !dbg !15 + %21 = extractelement <2 x bfloat> %20, i64 0, !dbg !15 + %22 = extractelement <2 x bfloat> %20, i64 1, !dbg !15 + %23 = icmp sgt i32 %12, 1048575, !dbg !16 + %24 = add i32 %16, -3145728, !dbg !17 + %25 = sext i32 %24 to i64, !dbg !18 + %26 = getelementptr bfloat, ptr addrspace(1) %1, i64 %25, !dbg !18 + %27 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$3 ld.global.b32 { $0 }, [ $2 + 0 ];", "=r,r,l,b"(i32 0, ptr addrspace(1) %26, i1 %23) #2, !dbg !19 + %28 = bitcast i32 %27 to <2 x bfloat>, !dbg !19 + %29 = extractelement <2 x bfloat> %28, i64 0, !dbg !19 + %30 = extractelement <2 x bfloat> %28, i64 1, !dbg !19 + %.v = select i1 %14, bfloat %21, bfloat %29, !dbg !20 + %.v1 = select i1 %14, bfloat %22, bfloat %30, !dbg !20 + %31 = sext i32 %12 to i64, !dbg !21 + %32 = getelementptr bfloat, ptr addrspace(1) %2, i64 %31, !dbg !21 + %33 = insertelement <2 x bfloat> poison, bfloat %.v, i64 0, !dbg !22 + %34 = insertelement <2 x bfloat> %33, bfloat %.v1, i64 1, !dbg !22 + %35 = bitcast <2 x bfloat> %34 to i32, !dbg !22 + tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %35, ptr addrspace(1) %32) #2, !dbg !22 + ret void, !dbg !23 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_cat_view_4", linkageName: "triton_poi_fused_cat_view_4", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 23, column: 19, scope: !4) +!12 = !DILocation(line: 30, column: 18, scope: !4) +!13 = !DILocation(line: 31, column: 35, scope: !4) +!14 = !DILocation(line: 31, column: 30, scope: !4) +!15 = !DILocation(line: 31, column: 48, scope: !4) +!16 = !DILocation(line: 32, column: 19, scope: !4) +!17 = !DILocation(line: 35, column: 35, scope: !4) +!18 = !DILocation(line: 35, column: 30, scope: !4) +!19 = !DILocation(line: 35, column: 57, scope: !4) +!20 = !DILocation(line: 36, column: 33, scope: !4) +!21 = !DILocation(line: 37, column: 25, scope: !4) +!22 = !DILocation(line: 37, column: 37, scope: !4) +!23 = !DILocation(line: 37, column: 4, scope: !4) diff --git a/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.ptx b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.ptx new file mode 100644 index 0000000000000000000000000000000000000000..c299bd739e634461ae81517b38d84f70a3cbc859 --- /dev/null +++ b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.ptx @@ -0,0 +1,333 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused_cat_view_4 // -- Begin function triton_poi_fused_cat_view_4 + // @triton_poi_fused_cat_view_4 +.visible .entry triton_poi_fused_cat_view_4( + .param .u64 .ptr .global .align 1 triton_poi_fused_cat_view_4_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_cat_view_4_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_cat_view_4_param_2, + .param .u32 triton_poi_fused_cat_view_4_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused_cat_view_4_param_4, + .param .u64 .ptr .global .align 1 triton_poi_fused_cat_view_4_param_5 +) +.reqntid 256 +{ + .reg .pred %p<3>; + .reg .b16 %rs<7>; + .reg .b32 %r<18>; + .reg .b64 %rd<7>; + .loc 1 18 0 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:18:0 +$L__func_begin0: + .loc 1 18 0 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:18:0 + +// %bb.0: + ld.param.b64 %rd4, [triton_poi_fused_cat_view_4_param_0]; + ld.param.b64 %rd5, [triton_poi_fused_cat_view_4_param_1]; +$L__tmp0: + .loc 1 20 28 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:20:28 + mov.u32 %r5, %ctaid.x; + .loc 1 20 33 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:20:33 + shl.b32 %r6, %r5, 9; + ld.param.b64 %rd6, [triton_poi_fused_cat_view_4_param_2]; + .loc 1 21 36 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:21:36 + mov.u32 %r7, %tid.x; + shl.b32 %r8, %r7, 1; + and.b32 %r9, %r8, 510; + .loc 1 21 23 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:21:23 + or.b32 %r10, %r9, %r6; + .loc 1 23 19 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:23:19 + bfe.s32 %r11, %r5, 22, 1; + shr.u32 %r12, %r11, 20; + add.s32 %r13, %r10, %r12; + .loc 1 30 18 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:30:18 + setp.lt.s32 %p1, %r10, 1048576; + .loc 1 31 35 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:31:35 + shl.b32 %r14, %r13, 1; + and.b32 %r15, %r14, -8192; + add.s32 %r16, %r15, %r10; + .loc 1 31 30 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:31:30 + mad.wide.s32 %rd1, %r16, 2, %rd4; + mov.b32 %r2, 0; + .loc 1 31 48 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:31:48 + // begin inline asm + mov.u32 %r1, %r2; + @%p1 ld.global.b32 { %r1 }, [ %rd1 + 0 ]; + // end inline asm + mov.b32 {%rs1, %rs2}, %r1; + .loc 1 32 19 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:32:19 + setp.gt.s32 %p2, %r10, 1048575; + .loc 1 35 35 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:35:35 + add.s32 %r17, %r16, -3145728; + .loc 1 35 30 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:35:30 + mad.wide.s32 %rd2, %r17, 2, %rd5; + .loc 1 35 57 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:35:57 + // begin inline asm + mov.u32 %r3, %r2; + @%p2 ld.global.b32 { %r3 }, [ %rd2 + 0 ]; + // end inline asm + mov.b32 {%rs3, %rs4}, %r3; + .loc 1 36 33 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:36:33 + selp.b16 %rs5, %rs1, %rs3, %p1; + selp.b16 %rs6, %rs2, %rs4, %p1; + .loc 1 37 25 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:37:25 + mad.wide.s32 %rd3, %r10, 2, %rd6; + .loc 1 37 37 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:37:37 + mov.b32 %r4, {%rs5, %rs6}; + // begin inline asm + st.global.b32 [ %rd3 + 0 ], { %r4 }; + // end inline asm + .loc 1 37 4 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:37:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 108 +.b8 112 +.b8 102 +.b8 52 +.b8 108 +.b8 111 +.b8 111 +.b8 104 +.b8 102 +.b8 115 +.b8 103 +.b8 119 +.b8 113 +.b8 104 +.b8 50 +.b8 103 +.b8 105 +.b8 50 +.b8 120 +.b8 111 +.b8 118 +.b8 111 +.b8 100 +.b8 112 +.b8 109 +.b8 55 +.b8 104 +.b8 122 +.b8 118 +.b8 53 +.b8 117 +.b8 50 +.b8 114 +.b8 118 +.b8 110 +.b8 103 +.b8 98 +.b8 55 +.b8 99 +.b8 104 +.b8 106 +.b8 103 +.b8 121 +.b8 119 +.b8 120 +.b8 53 +.b8 53 +.b8 103 +.b8 116 +.b8 117 +.b8 100 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 108 +.b8 112 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.source b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.source new file mode 100644 index 0000000000000000000000000000000000000000..4042d84ea3a4509075a3ebcc3579da142486ce37 --- /dev/null +++ b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.source @@ -0,0 +1,136 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":18:0) +#loc31 = loc("in_ptr0"(#loc)) +#loc32 = loc("in_ptr1"(#loc)) +#loc33 = loc("out_ptr0"(#loc)) +#loc34 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_cat_view_4(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 9437184 : i32 loc(#loc35) + %xoffset = tt.get_program_id x : i32 loc(#loc36) + %xoffset_1 = arith.constant 512 : i32 loc(#loc37) + %xoffset_2 = arith.constant 512 : i32 loc(#loc37) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc37) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc38) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc39) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc39) + %xmask = arith.constant true loc(#loc40) + %xmask_6 = arith.constant dense : tensor<512xi1> loc(#loc40) + %x1 = arith.constant 4096 : i32 loc(#loc41) + %x1_7 = arith.constant 4096 : i32 loc(#loc41) + %x1_8 = arith.constant dense<4096> : tensor<512xi32> loc(#loc41) + %x1_9 = arith.divsi %xindex_5, %x1_8 : tensor<512xi32> loc(#loc41) + %x0 = arith.constant 4096 : i32 loc(#loc42) + %x0_10 = arith.constant 4096 : i32 loc(#loc42) + %x0_11 = arith.constant dense<4096> : tensor<512xi32> loc(#loc42) + %x0_12 = arith.remsi %xindex_5, %x0_11 : tensor<512xi32> loc(#loc42) + %tmp1 = arith.constant 0 : i64 loc(#loc43) + %tmp1_13 = arith.constant dense<0> : tensor<1xi64> loc(#loc43) + %tmp2 = arith.extsi %x1_9 : tensor<512xi32> to tensor<512xi64> loc(#loc44) + %tmp2_14 = arith.constant dense<0> : tensor<512xi64> loc(#loc44) + %tmp2_15 = arith.cmpi sge, %tmp2, %tmp2_14 : tensor<512xi64> loc(#loc44) + %tmp3 = arith.constant 256 : i64 loc(#loc45) + %tmp3_16 = arith.constant dense<256> : tensor<1xi64> loc(#loc45) + %tmp4 = arith.extsi %x1_9 : tensor<512xi32> to tensor<512xi64> loc(#loc46) + %tmp4_17 = arith.constant dense<256> : tensor<512xi64> loc(#loc46) + %tmp4_18 = arith.cmpi slt, %tmp4, %tmp4_17 : tensor<512xi64> loc(#loc46) + %tmp5 = arith.constant 12288 : i32 loc(#loc47) + %tmp5_19 = arith.constant 12288 : i32 loc(#loc47) + %tmp5_20 = arith.constant dense<12288> : tensor<512xi32> loc(#loc47) + %tmp5_21 = arith.muli %tmp5_20, %x1_9 : tensor<512xi32> loc(#loc47) + %tmp5_22 = arith.addi %x0_12, %tmp5_21 : tensor<512xi32> loc(#loc48) + %tmp5_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc49) + %tmp5_24 = tt.addptr %tmp5_23, %tmp5_22 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc49) + %tmp5_25 = arith.constant 0.000000e+00 : f32 loc(#loc50) + %tmp5_26 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc50) + %tmp5_27 = arith.truncf %tmp5_26 : tensor<512xf32> to tensor<512xbf16> loc(#loc50) + %tmp5_28 = tt.load %tmp5_24, %tmp4_18, %tmp5_27 : tensor<512x!tt.ptr> loc(#loc50) + %tmp5_29 = arith.extf %tmp5_28 : tensor<512xbf16> to tensor<512xf32> loc(#loc51) + %tmp6 = arith.extsi %x1_9 : tensor<512xi32> to tensor<512xi64> loc(#loc52) + %tmp6_30 = arith.constant dense<256> : tensor<512xi64> loc(#loc52) + %tmp6_31 = arith.cmpi sge, %tmp6, %tmp6_30 : tensor<512xi64> loc(#loc52) + %tmp7 = arith.constant 2304 : i64 loc(#loc53) + %tmp7_32 = arith.constant dense<2304> : tensor<1xi64> loc(#loc53) + %tmp8 = arith.extsi %x1_9 : tensor<512xi32> to tensor<512xi64> loc(#loc54) + %tmp8_33 = arith.constant dense<2304> : tensor<512xi64> loc(#loc54) + %tmp8_34 = arith.cmpi slt, %tmp8, %tmp8_33 : tensor<512xi64> loc(#loc54) + %tmp9 = arith.constant -256 : i32 loc(#loc55) + %tmp9_35 = arith.constant -256 : i32 loc(#loc55) + %tmp9_36 = arith.constant dense<-256> : tensor<512xi32> loc(#loc55) + %tmp9_37 = arith.addi %tmp9_36, %x1_9 : tensor<512xi32> loc(#loc55) + %tmp9_38 = arith.constant 12288 : i32 loc(#loc56) + %tmp9_39 = arith.constant 12288 : i32 loc(#loc56) + %tmp9_40 = arith.constant dense<12288> : tensor<512xi32> loc(#loc56) + %tmp9_41 = arith.muli %tmp9_40, %tmp9_37 : tensor<512xi32> loc(#loc56) + %tmp9_42 = arith.addi %x0_12, %tmp9_41 : tensor<512xi32> loc(#loc57) + %tmp9_43 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc58) + %tmp9_44 = tt.addptr %tmp9_43, %tmp9_42 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc58) + %tmp9_45 = arith.constant 0.000000e+00 : f32 loc(#loc59) + %tmp9_46 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc59) + %tmp9_47 = arith.truncf %tmp9_46 : tensor<512xf32> to tensor<512xbf16> loc(#loc59) + %tmp9_48 = tt.load %tmp9_44, %tmp6_31, %tmp9_47 : tensor<512x!tt.ptr> loc(#loc59) + %tmp9_49 = arith.extf %tmp9_48 : tensor<512xbf16> to tensor<512xf32> loc(#loc60) + %tmp10 = arith.select %tmp4_18, %tmp5_29, %tmp9_49 : tensor<512xi1>, tensor<512xf32> loc(#loc61) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc28) + %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc28) + %2 = arith.truncf %tmp10 : tensor<512xf32> to tensor<512xbf16> loc(#loc29) + tt.store %1, %2 : tensor<512x!tt.ptr> loc(#loc29) + tt.return loc(#loc30) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":22:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":23:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":24:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":27:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":28:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":29:29) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":30:18) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:42) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:35) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:30) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:48) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:68) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":32:19) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":33:30) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":34:18) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:51) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:42) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:35) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:30) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:57) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:77) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":36:33) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:25) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:37) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:4) +#loc35 = loc("xnumel"(#loc1)) +#loc36 = loc("xoffset"(#loc2)) +#loc37 = loc("xoffset"(#loc3)) +#loc38 = loc("xindex"(#loc4)) +#loc39 = loc("xindex"(#loc5)) +#loc40 = loc("xmask"(#loc6)) +#loc41 = loc("x1"(#loc7)) +#loc42 = loc("x0"(#loc8)) +#loc43 = loc("tmp1"(#loc9)) +#loc44 = loc("tmp2"(#loc10)) +#loc45 = loc("tmp3"(#loc11)) +#loc46 = loc("tmp4"(#loc12)) +#loc47 = loc("tmp5"(#loc13)) +#loc48 = loc("tmp5"(#loc14)) +#loc49 = loc("tmp5"(#loc15)) +#loc50 = loc("tmp5"(#loc16)) +#loc51 = loc("tmp5"(#loc17)) +#loc52 = loc("tmp6"(#loc18)) +#loc53 = loc("tmp7"(#loc19)) +#loc54 = loc("tmp8"(#loc20)) +#loc55 = loc("tmp9"(#loc21)) +#loc56 = loc("tmp9"(#loc22)) +#loc57 = loc("tmp9"(#loc23)) +#loc58 = loc("tmp9"(#loc24)) +#loc59 = loc("tmp9"(#loc25)) +#loc60 = loc("tmp9"(#loc26)) +#loc61 = loc("tmp10"(#loc27)) diff --git a/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.ttgir b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..a81c383be57b5e9541991b93ba25615a10898d49 --- /dev/null +++ b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.ttgir @@ -0,0 +1,89 @@ +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":18:0) +#loc25 = loc("in_ptr0"(#loc)) +#loc26 = loc("in_ptr1"(#loc)) +#loc27 = loc("out_ptr0"(#loc)) +#loc28 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_cat_view_4(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4096> : tensor<512xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<256> : tensor<512xi64, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<12288> : tensor<512xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<-256> : tensor<512xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<512xbf16, #blocked> loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc29) + %xoffset_4 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc30) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc31) + %xindex_5 = tt.splat %xoffset_4 : i32 -> tensor<512xi32, #blocked> loc(#loc32) + %xindex_6 = arith.addi %xindex_5, %xindex : tensor<512xi32, #blocked> loc(#loc32) + %x1 = arith.divsi %xindex_6, %cst : tensor<512xi32, #blocked> loc(#loc33) + %x0 = arith.remsi %xindex_6, %cst : tensor<512xi32, #blocked> loc(#loc34) + %tmp4 = arith.extsi %x1 : tensor<512xi32, #blocked> to tensor<512xi64, #blocked> loc(#loc35) + %tmp4_7 = arith.cmpi slt, %tmp4, %cst_0 : tensor<512xi64, #blocked> loc(#loc35) + %tmp5 = arith.muli %x1, %cst_1 : tensor<512xi32, #blocked> loc(#loc36) + %tmp5_8 = arith.addi %x0, %tmp5 : tensor<512xi32, #blocked> loc(#loc37) + %tmp5_9 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc38) + %tmp5_10 = tt.addptr %tmp5_9, %tmp5_8 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc38) + %tmp5_11 = tt.load %tmp5_10, %tmp4_7, %cst_3 : tensor<512x!tt.ptr, #blocked> loc(#loc39) + %tmp5_12 = arith.extf %tmp5_11 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc40) + %tmp6 = arith.cmpi sge, %tmp4, %cst_0 : tensor<512xi64, #blocked> loc(#loc41) + %tmp9 = arith.addi %x1, %cst_2 : tensor<512xi32, #blocked> loc(#loc42) + %tmp9_13 = arith.muli %tmp9, %cst_1 : tensor<512xi32, #blocked> loc(#loc43) + %tmp9_14 = arith.addi %x0, %tmp9_13 : tensor<512xi32, #blocked> loc(#loc44) + %tmp9_15 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc45) + %tmp9_16 = tt.addptr %tmp9_15, %tmp9_14 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc45) + %tmp9_17 = tt.load %tmp9_16, %tmp6, %cst_3 : tensor<512x!tt.ptr, #blocked> loc(#loc46) + %tmp9_18 = arith.extf %tmp9_17 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc47) + %tmp10 = arith.select %tmp4_7, %tmp5_12, %tmp9_18 : tensor<512xi1, #blocked>, tensor<512xf32, #blocked> loc(#loc48) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc22) + %1 = tt.addptr %0, %xindex_6 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc22) + %2 = arith.truncf %tmp10 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> loc(#loc23) + tt.store %1, %2 : tensor<512x!tt.ptr, #blocked> loc(#loc23) + tt.return loc(#loc24) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":23:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":24:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":30:18) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:42) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:35) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:30) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:48) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:68) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":32:19) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:51) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:42) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:35) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:30) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:57) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:77) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":36:33) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:25) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:37) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:4) +#loc29 = loc("xoffset"(#loc2)) +#loc30 = loc("xoffset"(#loc3)) +#loc31 = loc("xindex"(#loc4)) +#loc32 = loc("xindex"(#loc5)) +#loc33 = loc("x1"(#loc6)) +#loc34 = loc("x0"(#loc7)) +#loc35 = loc("tmp4"(#loc8)) +#loc36 = loc("tmp5"(#loc9)) +#loc37 = loc("tmp5"(#loc10)) +#loc38 = loc("tmp5"(#loc11)) +#loc39 = loc("tmp5"(#loc12)) +#loc40 = loc("tmp5"(#loc13)) +#loc41 = loc("tmp6"(#loc14)) +#loc42 = loc("tmp9"(#loc15)) +#loc43 = loc("tmp9"(#loc16)) +#loc44 = loc("tmp9"(#loc17)) +#loc45 = loc("tmp9"(#loc18)) +#loc46 = loc("tmp9"(#loc19)) +#loc47 = loc("tmp9"(#loc20)) +#loc48 = loc("tmp10"(#loc21)) diff --git a/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.ttir b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.ttir new file mode 100644 index 0000000000000000000000000000000000000000..629683a28b165c750083b69450949845a7625d43 --- /dev/null +++ b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.ttir @@ -0,0 +1,88 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":18:0) +#loc25 = loc("in_ptr0"(#loc)) +#loc26 = loc("in_ptr1"(#loc)) +#loc27 = loc("out_ptr0"(#loc)) +#loc28 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_cat_view_4(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<512xbf16> loc(#loc1) + %tmp9 = arith.constant dense<-256> : tensor<512xi32> loc(#loc29) + %cst_0 = arith.constant dense<12288> : tensor<512xi32> loc(#loc1) + %cst_1 = arith.constant dense<256> : tensor<512xi64> loc(#loc1) + %cst_2 = arith.constant dense<4096> : tensor<512xi32> loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc30) + %xoffset_3 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc31) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc32) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc33) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc33) + %x1 = arith.divsi %xindex_5, %cst_2 : tensor<512xi32> loc(#loc34) + %x0 = arith.remsi %xindex_5, %cst_2 : tensor<512xi32> loc(#loc35) + %tmp4 = arith.extsi %x1 : tensor<512xi32> to tensor<512xi64> loc(#loc36) + %tmp4_6 = arith.cmpi slt, %tmp4, %cst_1 : tensor<512xi64> loc(#loc36) + %tmp5 = arith.muli %x1, %cst_0 : tensor<512xi32> loc(#loc37) + %tmp5_7 = arith.addi %x0, %tmp5 : tensor<512xi32> loc(#loc38) + %tmp5_8 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc39) + %tmp5_9 = tt.addptr %tmp5_8, %tmp5_7 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc39) + %tmp5_10 = tt.load %tmp5_9, %tmp4_6, %cst : tensor<512x!tt.ptr> loc(#loc40) + %tmp5_11 = arith.extf %tmp5_10 : tensor<512xbf16> to tensor<512xf32> loc(#loc41) + %tmp6 = arith.cmpi sge, %tmp4, %cst_1 : tensor<512xi64> loc(#loc42) + %tmp9_12 = arith.addi %x1, %tmp9 : tensor<512xi32> loc(#loc29) + %tmp9_13 = arith.muli %tmp9_12, %cst_0 : tensor<512xi32> loc(#loc43) + %tmp9_14 = arith.addi %x0, %tmp9_13 : tensor<512xi32> loc(#loc44) + %tmp9_15 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc45) + %tmp9_16 = tt.addptr %tmp9_15, %tmp9_14 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc45) + %tmp9_17 = tt.load %tmp9_16, %tmp6, %cst : tensor<512x!tt.ptr> loc(#loc46) + %tmp9_18 = arith.extf %tmp9_17 : tensor<512xbf16> to tensor<512xf32> loc(#loc47) + %tmp10 = arith.select %tmp4_6, %tmp5_11, %tmp9_18 : tensor<512xi1>, tensor<512xf32> loc(#loc48) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc22) + %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc22) + %2 = arith.truncf %tmp10 : tensor<512xf32> to tensor<512xbf16> loc(#loc23) + tt.store %1, %2 : tensor<512x!tt.ptr> loc(#loc23) + tt.return loc(#loc24) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:51) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":20:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":20:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":21:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":21:23) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":23:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":24:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":30:18) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:42) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:30) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:48) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:68) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":32:19) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:42) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:35) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:30) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:57) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:77) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":36:33) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:25) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:37) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:4) +#loc29 = loc("tmp9"(#loc2)) +#loc30 = loc("xoffset"(#loc3)) +#loc31 = loc("xoffset"(#loc4)) +#loc32 = loc("xindex"(#loc5)) +#loc33 = loc("xindex"(#loc6)) +#loc34 = loc("x1"(#loc7)) +#loc35 = loc("x0"(#loc8)) +#loc36 = loc("tmp4"(#loc9)) +#loc37 = loc("tmp5"(#loc10)) +#loc38 = loc("tmp5"(#loc11)) +#loc39 = loc("tmp5"(#loc12)) +#loc40 = loc("tmp5"(#loc13)) +#loc41 = loc("tmp5"(#loc14)) +#loc42 = loc("tmp6"(#loc15)) +#loc43 = loc("tmp9"(#loc16)) +#loc44 = loc("tmp9"(#loc17)) +#loc45 = loc("tmp9"(#loc18)) +#loc46 = loc("tmp9"(#loc19)) +#loc47 = loc("tmp9"(#loc20)) +#loc48 = loc("tmp10"(#loc21)) diff --git a/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/__grp__triton_poi_fused_cat_mul_silu_split_view_0.json b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/__grp__triton_poi_fused_cat_mul_silu_split_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ed8c1a2d44f72bcc12c94509c45a6836d2ff86bf --- /dev/null +++ b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/__grp__triton_poi_fused_cat_mul_silu_split_view_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_cat_mul_silu_split_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.source", "triton_poi_fused_cat_mul_silu_split_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.ttir", "triton_poi_fused_cat_mul_silu_split_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.ttgir", "triton_poi_fused_cat_mul_silu_split_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.llir", "triton_poi_fused_cat_mul_silu_split_view_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.ptx", "triton_poi_fused_cat_mul_silu_split_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.cubin", "triton_poi_fused_cat_mul_silu_split_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.json"}} \ No newline at end of file diff --git a/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.cubin b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..ddde3eee4ae81fa5d8530a0ba2158aacbc3d80b0 Binary files /dev/null and b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.cubin differ diff --git a/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.json b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1d98d1988c3b378918cdcdff2bfd2e33f1b2c1d9 --- /dev/null +++ b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.json @@ -0,0 +1 @@ +{"hash": "7bdbc95616d50aaabed40fce2720f0625cdd901e7dfe546d0f608c07842e9b59", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_cat_mul_silu_split_view_0"} \ No newline at end of file diff --git a/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.llir b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..8432e6158276ce4227f285f7a6b1da71b752fff3 --- /dev/null +++ b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.llir @@ -0,0 +1,130 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_cat_mul_silu_split_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = shl i32 %7, 9, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = shl nuw nsw i32 %9, 1, !dbg !9 + %11 = and i32 %10, 510, !dbg !9 + %12 = or disjoint i32 %11, %8, !dbg !10 + %13 = sdiv i32 %12, 16384, !dbg !11 + %14 = mul i32 %13, 16384, !dbg !12 + %.decomposed = sub i32 %12, %14, !dbg !12 + %15 = icmp slt i32 %.decomposed, 4096, !dbg !13 + %16 = shl nsw i32 %13, 12, !dbg !14 + %17 = add nsw i32 %16, %.decomposed, !dbg !15 + %18 = sext i32 %17 to i64, !dbg !16 + %19 = getelementptr bfloat, ptr addrspace(1) %0, i64 %18, !dbg !16 + %20 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !17 + %21 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %19, i64 %20, i1 %15) #3, !dbg !17 + %22 = bitcast i32 %21 to <2 x bfloat>, !dbg !17 + %23 = icmp sgt i32 %.decomposed, 4095, !dbg !18 + %24 = mul i32 %13, 36864, !dbg !19 + %25 = add nsw i32 %.decomposed, -4096, !dbg !20 + %26 = add i32 %24, %25, !dbg !21 + %27 = sext i32 %26 to i64, !dbg !22 + %28 = getelementptr bfloat, ptr addrspace(1) %1, i64 %27, !dbg !22 + %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !23 + %30 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %28, i64 %29, i1 %23) #3, !dbg !23 + %31 = bitcast i32 %30 to <2 x bfloat>, !dbg !23 + %32 = add i32 %26, 12288, !dbg !24 + %33 = sext i32 %32 to i64, !dbg !25 + %34 = getelementptr bfloat, ptr addrspace(1) %1, i64 %33, !dbg !25 + %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !26 + %36 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %34, i64 %35, i1 %23) #3, !dbg !26 + %37 = bitcast i32 %36 to <2 x bfloat>, !dbg !26 + %38 = sext i32 %12 to i64, !dbg !27 + %39 = getelementptr bfloat, ptr addrspace(1) %2, i64 %38, !dbg !27 + %40 = fpext <2 x bfloat> %31 to <2 x float>, !dbg !28 + %41 = extractelement <2 x float> %40, i64 0, !dbg !29 + %42 = fsub float 0.000000e+00, %41, !dbg !29 + %43 = extractelement <2 x float> %40, i64 1, !dbg !29 + %44 = fsub float 0.000000e+00, %43, !dbg !29 + %45 = fmul float %42, 0x3FF7154760000000, !dbg !34 + %46 = tail call float @llvm.nvvm.ex2.approx.f(float %45), !dbg !34 + %47 = fmul float %44, 0x3FF7154760000000, !dbg !34 + %48 = tail call float @llvm.nvvm.ex2.approx.f(float %47), !dbg !34 + %49 = fadd float %46, 1.000000e+00, !dbg !35 + %50 = fadd float %48, 1.000000e+00, !dbg !35 + %51 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %49), !dbg !36 + %52 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %50), !dbg !36 + %53 = insertelement <2 x float> poison, float %51, i64 0, !dbg !37 + %54 = insertelement <2 x float> %53, float %52, i64 1, !dbg !37 + %55 = fmul <2 x float> %54, %40, !dbg !37 + %56 = fpext <2 x bfloat> %37 to <2 x float>, !dbg !38 + %57 = fmul <2 x float> %55, %56, !dbg !39 + %58 = fptrunc <2 x float> %57 to <2 x bfloat>, !dbg !40 + %59 = insertelement <2 x i1> poison, i1 %15, i64 0, !dbg !41 + %60 = shufflevector <2 x i1> %59, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !41 + %61 = select <2 x i1> %60, <2 x bfloat> %22, <2 x bfloat> %58, !dbg !41 + %62 = bitcast <2 x bfloat> %61 to i32, !dbg !40 + tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %62, ptr addrspace(1) %39) #3, !dbg !40 + ret void, !dbg !42 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.f(float) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #3 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_cat_mul_silu_split_view_0", linkageName: "triton_poi_fused_cat_mul_silu_split_view_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 19, scope: !4) +!12 = !DILocation(line: 23, column: 19, scope: !4) +!13 = !DILocation(line: 30, column: 18, scope: !4) +!14 = !DILocation(line: 31, column: 35, scope: !4) +!15 = !DILocation(line: 31, column: 41, scope: !4) +!16 = !DILocation(line: 31, column: 30, scope: !4) +!17 = !DILocation(line: 31, column: 47, scope: !4) +!18 = !DILocation(line: 32, column: 19, scope: !4) +!19 = !DILocation(line: 35, column: 36, scope: !4) +!20 = !DILocation(line: 35, column: 52, scope: !4) +!21 = !DILocation(line: 35, column: 42, scope: !4) +!22 = !DILocation(line: 35, column: 30, scope: !4) +!23 = !DILocation(line: 35, column: 58, scope: !4) +!24 = !DILocation(line: 40, column: 51, scope: !4) +!25 = !DILocation(line: 40, column: 31, scope: !4) +!26 = !DILocation(line: 40, column: 67, scope: !4) +!27 = !DILocation(line: 45, column: 25, scope: !4) +!28 = !DILocation(line: 35, column: 108, scope: !4) +!29 = !DILocation(line: 50, column: 30, scope: !30, inlinedAt: !32) +!30 = distinct !DILexicalBlockFile(scope: !4, file: !31, discriminator: 0) +!31 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!32 = !DILocation(line: 37, column: 23, scope: !33) +!33 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0) +!34 = !DILocation(line: 50, column: 29, scope: !30, inlinedAt: !32) +!35 = !DILocation(line: 50, column: 20, scope: !30, inlinedAt: !32) +!36 = !DILocation(line: 50, column: 16, scope: !30, inlinedAt: !32) +!37 = !DILocation(line: 38, column: 20, scope: !4) +!38 = !DILocation(line: 40, column: 117, scope: !4) +!39 = !DILocation(line: 41, column: 20, scope: !4) +!40 = !DILocation(line: 45, column: 37, scope: !4) +!41 = !DILocation(line: 44, column: 33, scope: !4) +!42 = !DILocation(line: 45, column: 4, scope: !4) diff --git a/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.ptx b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..3af8173d951718ac5a86e10ab68c1a35ae420b42 --- /dev/null +++ b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.ptx @@ -0,0 +1,490 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused_cat_mul_silu_split_view_0 // -- Begin function triton_poi_fused_cat_mul_silu_split_view_0 + // @triton_poi_fused_cat_mul_silu_split_view_0 +.visible .entry triton_poi_fused_cat_mul_silu_split_view_0( + .param .u64 .ptr .global .align 1 triton_poi_fused_cat_mul_silu_split_view_0_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_cat_mul_silu_split_view_0_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_cat_mul_silu_split_view_0_param_2, + .param .u32 triton_poi_fused_cat_mul_silu_split_view_0_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused_cat_mul_silu_split_view_0_param_4, + .param .u64 .ptr .global .align 1 triton_poi_fused_cat_mul_silu_split_view_0_param_5 +) +.reqntid 256 +{ + .reg .pred %p<3>; + .reg .b16 %rs<11>; + .reg .b32 %r<43>; + .reg .b64 %rd<11>; + .loc 1 18 0 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:18:0 + +// %bb.0: + ld.param.b64 %rd8, [triton_poi_fused_cat_mul_silu_split_view_0_param_0]; + ld.param.b64 %rd9, [triton_poi_fused_cat_mul_silu_split_view_0_param_1]; +$L__tmp0: + .loc 1 20 28 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:20:28 + mov.u32 %r6, %ctaid.x; + .loc 1 20 33 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:20:33 + shl.b32 %r7, %r6, 9; + ld.param.b64 %rd10, [triton_poi_fused_cat_mul_silu_split_view_0_param_2]; + .loc 1 21 36 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:21:36 + mov.u32 %r8, %tid.x; + shl.b32 %r9, %r8, 1; + and.b32 %r10, %r9, 510; + .loc 1 21 23 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:21:23 + or.b32 %r11, %r10, %r7; + .loc 1 24 19 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:24:19 + bfe.s32 %r12, %r6, 22, 1; + shr.u32 %r13, %r12, 18; + add.s32 %r14, %r11, %r13; + shr.s32 %r15, %r14, 14; + .loc 1 23 19 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:23:19 + and.b32 %r16, %r14, -16384; + sub.s32 %r17, %r11, %r16; + .loc 1 30 18 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:30:18 + setp.lt.s32 %p1, %r17, 4096; + .loc 1 31 35 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:31:35 + shl.b32 %r18, %r15, 12; + .loc 1 31 41 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:31:41 + add.s32 %r19, %r18, %r17; + .loc 1 31 30 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:31:30 + mad.wide.s32 %rd1, %r19, 2, %rd8; + .loc 1 31 47 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:31:47 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r2, 0; + // begin inline asm + mov.u32 %r1, %r2; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b32 { %r1 }, [ %rd1 + 0 ], %rd2; + // end inline asm + .loc 1 32 19 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:32:19 + setp.gt.s32 %p2, %r17, 4095; + .loc 1 35 52 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:52 + mad.lo.s32 %r20, %r15, 36864, %r17; + .loc 1 35 42 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:42 + add.s32 %r21, %r20, -4096; + .loc 1 35 30 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:30 + mad.wide.s32 %rd3, %r21, 2, %rd9; + .loc 1 35 58 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:58 + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r3, %r2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r3 }, [ %rd3 + 0 ], %rd4; + // end inline asm + .loc 1 40 51 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:40:51 + add.s32 %r22, %r20, 8192; + .loc 1 40 31 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:40:31 + mad.wide.s32 %rd5, %r22, 2, %rd9; + .loc 1 40 67 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:40:67 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r4, %r2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r4 }, [ %rd5 + 0 ], %rd6; + // end inline asm + .loc 1 45 25 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:45:25 + mad.wide.s32 %rd7, %r11, 2, %rd10; + .loc 1 35 108 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:108 + mov.b32 {%rs1, %rs2}, %r3; + cvt.f32.bf16 %r23, %rs2; + cvt.f32.bf16 %r24, %rs1; + mov.b32 %r25, 0f00000000; +$L__tmp1: + .loc 2 50 30 // standard.py:50:30 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ] + sub.f32 %r26, %r25, %r24; + sub.f32 %r27, %r25, %r23; + .loc 2 50 29 // standard.py:50:29 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ] + mul.f32 %r28, %r26, 0f3FB8AA3B; + ex2.approx.f32 %r29, %r28; + mul.f32 %r30, %r27, 0f3FB8AA3B; + ex2.approx.f32 %r31, %r30; + .loc 2 50 20 // standard.py:50:20 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ] + add.f32 %r32, %r29, 0f3F800000; + add.f32 %r33, %r31, 0f3F800000; + mov.b32 %r34, 0f3F800000; + .loc 2 50 16 // standard.py:50:16 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ] + div.full.f32 %r35, %r34, %r32; + div.full.f32 %r36, %r34, %r33; +$L__tmp2: + .loc 1 38 20 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:38:20 + mul.f32 %r37, %r35, %r24; + mul.f32 %r38, %r36, %r23; + .loc 1 40 117 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:40:117 + mov.b32 {%rs3, %rs4}, %r4; + cvt.f32.bf16 %r39, %rs3; + cvt.f32.bf16 %r40, %rs4; + .loc 1 41 20 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:41:20 + mul.f32 %r41, %r38, %r40; + mul.f32 %r42, %r37, %r39; + .loc 1 45 37 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:45:37 + cvt.rn.bf16.f32 %rs5, %r42; + cvt.rn.bf16.f32 %rs6, %r41; + .loc 1 44 33 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:44:33 + mov.b32 {%rs7, %rs8}, %r1; + selp.b16 %rs9, %rs8, %rs6, %p1; + selp.b16 %rs10, %rs7, %rs5, %p1; + mov.b32 %r5, {%rs10, %rs9}; + .loc 1 45 37 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:45:37 + // begin inline asm + st.global.b32 [ %rd7 + 0 ], { %r5 }; + // end inline asm + .loc 1 45 4 // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:45:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 316 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x135 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 51 +.b8 105 +.b8 109 +.b8 121 +.b8 102 +.b8 105 +.b8 98 +.b8 99 +.b8 113 +.b8 51 +.b8 122 +.b8 119 +.b8 114 +.b8 99 +.b8 53 +.b8 103 +.b8 118 +.b8 102 +.b8 115 +.b8 99 +.b8 118 +.b8 112 +.b8 115 +.b8 97 +.b8 120 +.b8 100 +.b8 122 +.b8 106 +.b8 105 +.b8 106 +.b8 121 +.b8 109 +.b8 114 +.b8 110 +.b8 116 +.b8 50 +.b8 108 +.b8 102 +.b8 97 +.b8 104 +.b8 116 +.b8 114 +.b8 106 +.b8 109 +.b8 114 +.b8 98 +.b8 116 +.b8 108 +.b8 109 +.b8 104 +.b8 101 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 51 +.b8 105 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x2d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 111 +.b8 105 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 116 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 115 +.b8 105 +.b8 108 +.b8 117 +.b8 95 +.b8 115 +.b8 112 +.b8 108 +.b8 105 +.b8 116 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x111:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x126:0x18 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 23 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.source b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.source new file mode 100644 index 0000000000000000000000000000000000000000..5630dd3cd3cb1e3f423a83a7709bef62f41aca7c --- /dev/null +++ b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.source @@ -0,0 +1,212 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":18:0) +#loc43 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":49:0) +#loc50 = loc("in_ptr0"(#loc)) +#loc51 = loc("in_ptr1"(#loc)) +#loc52 = loc("out_ptr0"(#loc)) +#loc53 = loc("xnumel"(#loc)) +#loc93 = loc("x"(#loc43)) +module { + tt.func public @triton_poi_fused_cat_mul_silu_split_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 37748736 : i32 loc(#loc54) + %xoffset = tt.get_program_id x : i32 loc(#loc55) + %xoffset_1 = arith.constant 512 : i32 loc(#loc56) + %xoffset_2 = arith.constant 512 : i32 loc(#loc56) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc56) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc57) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc58) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc58) + %xmask = arith.constant true loc(#loc59) + %xmask_6 = arith.constant dense : tensor<512xi1> loc(#loc59) + %x0 = arith.constant 16384 : i32 loc(#loc60) + %x0_7 = arith.constant 16384 : i32 loc(#loc60) + %x0_8 = arith.constant dense<16384> : tensor<512xi32> loc(#loc60) + %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<512xi32> loc(#loc60) + %x1 = arith.constant 16384 : i32 loc(#loc61) + %x1_10 = arith.constant 16384 : i32 loc(#loc61) + %x1_11 = arith.constant dense<16384> : tensor<512xi32> loc(#loc61) + %x1_12 = arith.divsi %xindex_5, %x1_11 : tensor<512xi32> loc(#loc61) + %tmp1 = arith.constant 0 : i64 loc(#loc62) + %tmp1_13 = arith.constant dense<0> : tensor<1xi64> loc(#loc62) + %tmp2 = arith.extsi %x0_9 : tensor<512xi32> to tensor<512xi64> loc(#loc63) + %tmp2_14 = arith.constant dense<0> : tensor<512xi64> loc(#loc63) + %tmp2_15 = arith.cmpi sge, %tmp2, %tmp2_14 : tensor<512xi64> loc(#loc63) + %tmp3 = arith.constant 4096 : i64 loc(#loc64) + %tmp3_16 = arith.constant dense<4096> : tensor<1xi64> loc(#loc64) + %tmp4 = arith.extsi %x0_9 : tensor<512xi32> to tensor<512xi64> loc(#loc65) + %tmp4_17 = arith.constant dense<4096> : tensor<512xi64> loc(#loc65) + %tmp4_18 = arith.cmpi slt, %tmp4, %tmp4_17 : tensor<512xi64> loc(#loc65) + %tmp5 = arith.constant 4096 : i32 loc(#loc66) + %tmp5_19 = arith.constant 4096 : i32 loc(#loc66) + %tmp5_20 = arith.constant dense<4096> : tensor<512xi32> loc(#loc66) + %tmp5_21 = arith.muli %tmp5_20, %x1_12 : tensor<512xi32> loc(#loc66) + %tmp5_22 = arith.addi %tmp5_21, %x0_9 : tensor<512xi32> loc(#loc67) + %tmp5_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc68) + %tmp5_24 = tt.addptr %tmp5_23, %tmp5_22 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc68) + %tmp5_25 = arith.constant 0.000000e+00 : f32 loc(#loc69) + %tmp5_26 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc69) + %tmp5_27 = arith.truncf %tmp5_26 : tensor<512xf32> to tensor<512xbf16> loc(#loc69) + %tmp5_28 = tt.load %tmp5_24, %tmp4_18, %tmp5_27 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc69) + %tmp5_29 = arith.extf %tmp5_28 : tensor<512xbf16> to tensor<512xf32> loc(#loc70) + %tmp6 = arith.extsi %x0_9 : tensor<512xi32> to tensor<512xi64> loc(#loc71) + %tmp6_30 = arith.constant dense<4096> : tensor<512xi64> loc(#loc71) + %tmp6_31 = arith.cmpi sge, %tmp6, %tmp6_30 : tensor<512xi64> loc(#loc71) + %tmp7 = arith.constant 16384 : i64 loc(#loc72) + %tmp7_32 = arith.constant dense<16384> : tensor<1xi64> loc(#loc72) + %tmp8 = arith.extsi %x0_9 : tensor<512xi32> to tensor<512xi64> loc(#loc73) + %tmp8_33 = arith.constant dense<16384> : tensor<512xi64> loc(#loc73) + %tmp8_34 = arith.cmpi slt, %tmp8, %tmp8_33 : tensor<512xi64> loc(#loc73) + %tmp9 = arith.constant 36864 : i32 loc(#loc74) + %tmp9_35 = arith.constant 36864 : i32 loc(#loc74) + %tmp9_36 = arith.constant dense<36864> : tensor<512xi32> loc(#loc74) + %tmp9_37 = arith.muli %tmp9_36, %x1_12 : tensor<512xi32> loc(#loc74) + %tmp9_38 = arith.constant -4096 : i32 loc(#loc75) + %tmp9_39 = arith.constant -4096 : i32 loc(#loc75) + %tmp9_40 = arith.constant dense<-4096> : tensor<512xi32> loc(#loc75) + %tmp9_41 = arith.addi %tmp9_40, %x0_9 : tensor<512xi32> loc(#loc75) + %tmp9_42 = arith.addi %tmp9_37, %tmp9_41 : tensor<512xi32> loc(#loc76) + %tmp9_43 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc77) + %tmp9_44 = tt.addptr %tmp9_43, %tmp9_42 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc77) + %tmp9_45 = arith.constant 0.000000e+00 : f32 loc(#loc78) + %tmp9_46 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc78) + %tmp9_47 = arith.truncf %tmp9_46 : tensor<512xf32> to tensor<512xbf16> loc(#loc78) + %tmp9_48 = tt.load %tmp9_44, %tmp6_31, %tmp9_47 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc78) + %tmp9_49 = arith.extf %tmp9_48 : tensor<512xbf16> to tensor<512xf32> loc(#loc79) + %tmp11 = tt.call @triton.language.standard.sigmoid__fp32S512S__(%tmp9_49) : (tensor<512xf32>) -> tensor<512xf32> loc(#loc80) + %tmp12 = arith.mulf %tmp9_49, %tmp11 : tensor<512xf32> loc(#loc81) + %tmp14 = arith.constant 36864 : i32 loc(#loc82) + %tmp14_50 = arith.constant 36864 : i32 loc(#loc82) + %tmp14_51 = arith.constant dense<36864> : tensor<512xi32> loc(#loc82) + %tmp14_52 = arith.muli %tmp14_51, %x1_12 : tensor<512xi32> loc(#loc82) + %tmp14_53 = arith.constant 12288 : i32 loc(#loc83) + %tmp14_54 = arith.constant 12288 : i32 loc(#loc83) + %tmp14_55 = arith.constant dense<12288> : tensor<512xi32> loc(#loc83) + %tmp14_56 = arith.addi %tmp14_55, %tmp14_52 : tensor<512xi32> loc(#loc83) + %tmp14_57 = arith.constant -4096 : i32 loc(#loc84) + %tmp14_58 = arith.constant -4096 : i32 loc(#loc84) + %tmp14_59 = arith.constant dense<-4096> : tensor<512xi32> loc(#loc84) + %tmp14_60 = arith.addi %tmp14_59, %x0_9 : tensor<512xi32> loc(#loc84) + %tmp14_61 = arith.addi %tmp14_56, %tmp14_60 : tensor<512xi32> loc(#loc85) + %tmp14_62 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc86) + %tmp14_63 = tt.addptr %tmp14_62, %tmp14_61 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc86) + %tmp14_64 = arith.constant 0.000000e+00 : f32 loc(#loc87) + %tmp14_65 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc87) + %tmp14_66 = arith.truncf %tmp14_65 : tensor<512xf32> to tensor<512xbf16> loc(#loc87) + %tmp14_67 = tt.load %tmp14_63, %tmp6_31, %tmp14_66 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc87) + %tmp14_68 = arith.extf %tmp14_67 : tensor<512xbf16> to tensor<512xf32> loc(#loc88) + %tmp15 = arith.mulf %tmp12, %tmp14_68 : tensor<512xf32> loc(#loc89) + %tmp16 = arith.constant 0.000000e+00 : f32 loc(#loc90) + %tmp16_69 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc90) + %tmp17 = arith.select %tmp6_31, %tmp15, %tmp16_69 : tensor<512xi1>, tensor<512xf32> loc(#loc91) + %tmp18 = arith.select %tmp4_18, %tmp5_29, %tmp17 : tensor<512xi1>, tensor<512xf32> loc(#loc92) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc40) + %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc40) + %2 = arith.truncf %tmp18 : tensor<512xf32> to tensor<512xbf16> loc(#loc41) + tt.store %1, %2 : tensor<512x!tt.ptr> loc(#loc41) + tt.return loc(#loc42) + } loc(#loc) + tt.func private @triton.language.standard.sigmoid__fp32S512S__(%x: tensor<512xf32> loc("x"(#loc43))) -> tensor<512xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc44) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc44) + %0 = arith.subf %cst_0, %x : tensor<512xf32> loc(#loc44) + %1 = math.exp %0 : tensor<512xf32> loc(#loc45) + %c1_i32 = arith.constant 1 : i32 loc(#loc46) + %cst_1 = arith.constant 1.000000e+00 : f32 loc(#loc46) + %cst_2 = arith.constant dense<1.000000e+00> : tensor<512xf32> loc(#loc46) + %2 = arith.addf %cst_2, %1 : tensor<512xf32> loc(#loc46) + %c1_i32_3 = arith.constant 1 : i32 loc(#loc47) + %cst_4 = arith.constant 1.000000e+00 : f32 loc(#loc47) + %cst_5 = arith.constant dense<1.000000e+00> : tensor<512xf32> loc(#loc47) + %3 = arith.divf %cst_5, %2 : tensor<512xf32> loc(#loc47) + tt.return %3 : tensor<512xf32> loc(#loc48) + ^bb1: // no predecessors + %4 = ub.poison : tensor<512xf32> loc(#loc49) + tt.return %4 : tensor<512xf32> loc(#loc49) + } loc(#loc43) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":22:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":23:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":24:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":27:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":28:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":29:30) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":30:18) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:35) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:41) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:30) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:47) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:97) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":32:19) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":33:31) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":34:18) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:36) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:52) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:42) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:30) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:58) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:108) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":37:23) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":38:20) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:45) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:39) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:61) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:51) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:31) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:67) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:117) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":41:20) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":42:38) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":43:34) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":44:33) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:25) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:37) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:4) +#loc44 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30) +#loc45 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29) +#loc46 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20) +#loc47 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16) +#loc48 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:11) +#loc49 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:4) +#loc54 = loc("xnumel"(#loc1)) +#loc55 = loc("xoffset"(#loc2)) +#loc56 = loc("xoffset"(#loc3)) +#loc57 = loc("xindex"(#loc4)) +#loc58 = loc("xindex"(#loc5)) +#loc59 = loc("xmask"(#loc6)) +#loc60 = loc("x0"(#loc7)) +#loc61 = loc("x1"(#loc8)) +#loc62 = loc("tmp1"(#loc9)) +#loc63 = loc("tmp2"(#loc10)) +#loc64 = loc("tmp3"(#loc11)) +#loc65 = loc("tmp4"(#loc12)) +#loc66 = loc("tmp5"(#loc13)) +#loc67 = loc("tmp5"(#loc14)) +#loc68 = loc("tmp5"(#loc15)) +#loc69 = loc("tmp5"(#loc16)) +#loc70 = loc("tmp5"(#loc17)) +#loc71 = loc("tmp6"(#loc18)) +#loc72 = loc("tmp7"(#loc19)) +#loc73 = loc("tmp8"(#loc20)) +#loc74 = loc("tmp9"(#loc21)) +#loc75 = loc("tmp9"(#loc22)) +#loc76 = loc("tmp9"(#loc23)) +#loc77 = loc("tmp9"(#loc24)) +#loc78 = loc("tmp9"(#loc25)) +#loc79 = loc("tmp9"(#loc26)) +#loc80 = loc("tmp11"(#loc27)) +#loc81 = loc("tmp12"(#loc28)) +#loc82 = loc("tmp14"(#loc29)) +#loc83 = loc("tmp14"(#loc30)) +#loc84 = loc("tmp14"(#loc31)) +#loc85 = loc("tmp14"(#loc32)) +#loc86 = loc("tmp14"(#loc33)) +#loc87 = loc("tmp14"(#loc34)) +#loc88 = loc("tmp14"(#loc35)) +#loc89 = loc("tmp15"(#loc36)) +#loc90 = loc("tmp16"(#loc37)) +#loc91 = loc("tmp17"(#loc38)) +#loc92 = loc("tmp18"(#loc39)) diff --git a/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.ttgir b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..2e44a0a90fa098dc93b4ab679a3fa89613988448 --- /dev/null +++ b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.ttgir @@ -0,0 +1,131 @@ +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":18:0) +#loc38 = loc("in_ptr0"(#loc)) +#loc39 = loc("in_ptr1"(#loc)) +#loc40 = loc("out_ptr0"(#loc)) +#loc41 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_cat_mul_silu_split_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<12288> : tensor<512xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<-4096> : tensor<512xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<36864> : tensor<512xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<4096> : tensor<512xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<4096> : tensor<512xi64, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<16384> : tensor<512xi32, #blocked> loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %cst_5 = arith.constant dense<0.000000e+00> : tensor<512xbf16, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<1.000000e+00> : tensor<512xf32, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc42) + %xoffset_8 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc43) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc44) + %xindex_9 = tt.splat %xoffset_8 : i32 -> tensor<512xi32, #blocked> loc(#loc45) + %xindex_10 = arith.addi %xindex_9, %xindex : tensor<512xi32, #blocked> loc(#loc45) + %x0 = arith.remsi %xindex_10, %cst_4 : tensor<512xi32, #blocked> loc(#loc46) + %x1 = arith.divsi %xindex_10, %cst_4 : tensor<512xi32, #blocked> loc(#loc47) + %tmp4 = arith.extsi %x0 : tensor<512xi32, #blocked> to tensor<512xi64, #blocked> loc(#loc48) + %tmp4_11 = arith.cmpi slt, %tmp4, %cst_3 : tensor<512xi64, #blocked> loc(#loc48) + %tmp5 = arith.muli %x1, %cst_2 : tensor<512xi32, #blocked> loc(#loc49) + %tmp5_12 = arith.addi %tmp5, %x0 : tensor<512xi32, #blocked> loc(#loc50) + %tmp5_13 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc51) + %tmp5_14 = tt.addptr %tmp5_13, %tmp5_12 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc51) + %tmp5_15 = tt.load %tmp5_14, %tmp4_11, %cst_5 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc52) + %tmp5_16 = arith.extf %tmp5_15 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc53) + %tmp6 = arith.cmpi sge, %tmp4, %cst_3 : tensor<512xi64, #blocked> loc(#loc54) + %tmp9 = arith.muli %x1, %cst_1 : tensor<512xi32, #blocked> loc(#loc55) + %tmp9_17 = arith.addi %x0, %cst_0 : tensor<512xi32, #blocked> loc(#loc56) + %tmp9_18 = arith.addi %tmp9, %tmp9_17 : tensor<512xi32, #blocked> loc(#loc57) + %tmp9_19 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc58) + %tmp9_20 = tt.addptr %tmp9_19, %tmp9_18 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc58) + %tmp9_21 = tt.load %tmp9_20, %tmp6, %cst_5 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc59) + %tmp9_22 = arith.extf %tmp9_21 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc60) + %tmp11 = arith.subf %cst_7, %tmp9_22 : tensor<512xf32, #blocked> loc(#loc71) + %tmp11_23 = math.exp %tmp11 : tensor<512xf32, #blocked> loc(#loc72) + %tmp11_24 = arith.addf %tmp11_23, %cst_6 : tensor<512xf32, #blocked> loc(#loc73) + %tmp11_25 = arith.divf %cst_6, %tmp11_24 : tensor<512xf32, #blocked> loc(#loc74) + %tmp12 = arith.mulf %tmp9_22, %tmp11_25 : tensor<512xf32, #blocked> loc(#loc62) + %tmp14 = arith.addi %tmp9, %cst : tensor<512xi32, #blocked> loc(#loc63) + %tmp14_26 = arith.addi %tmp14, %tmp9_17 : tensor<512xi32, #blocked> loc(#loc64) + %tmp14_27 = tt.addptr %tmp9_19, %tmp14_26 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc65) + %tmp14_28 = tt.load %tmp14_27, %tmp6, %cst_5 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc66) + %tmp14_29 = arith.extf %tmp14_28 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc67) + %tmp15 = arith.mulf %tmp12, %tmp14_29 : tensor<512xf32, #blocked> loc(#loc68) + %tmp17 = arith.select %tmp6, %tmp15, %cst_7 : tensor<512xi1, #blocked>, tensor<512xf32, #blocked> loc(#loc69) + %tmp18 = arith.select %tmp4_11, %tmp5_16, %tmp17 : tensor<512xi1, #blocked>, tensor<512xf32, #blocked> loc(#loc70) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc35) + %1 = tt.addptr %0, %xindex_10 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc35) + %2 = arith.truncf %tmp18 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> loc(#loc36) + tt.store %1, %2 : tensor<512x!tt.ptr, #blocked> loc(#loc36) + tt.return loc(#loc37) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":23:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":24:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":30:18) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:35) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:41) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:30) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:47) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:97) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":32:19) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:36) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:52) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:42) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:30) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:58) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:108) +#loc21 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":37:23) +#loc23 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":38:20) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:39) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:51) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:31) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:67) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:117) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":41:20) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":43:34) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":44:33) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:25) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:37) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:4) +#loc42 = loc("xoffset"(#loc2)) +#loc43 = loc("xoffset"(#loc3)) +#loc44 = loc("xindex"(#loc4)) +#loc45 = loc("xindex"(#loc5)) +#loc46 = loc("x0"(#loc6)) +#loc47 = loc("x1"(#loc7)) +#loc48 = loc("tmp4"(#loc8)) +#loc49 = loc("tmp5"(#loc9)) +#loc50 = loc("tmp5"(#loc10)) +#loc51 = loc("tmp5"(#loc11)) +#loc52 = loc("tmp5"(#loc12)) +#loc53 = loc("tmp5"(#loc13)) +#loc54 = loc("tmp6"(#loc14)) +#loc55 = loc("tmp9"(#loc15)) +#loc56 = loc("tmp9"(#loc16)) +#loc57 = loc("tmp9"(#loc17)) +#loc58 = loc("tmp9"(#loc18)) +#loc59 = loc("tmp9"(#loc19)) +#loc60 = loc("tmp9"(#loc20)) +#loc61 = loc("tmp11"(#loc22)) +#loc62 = loc("tmp12"(#loc26)) +#loc63 = loc("tmp14"(#loc27)) +#loc64 = loc("tmp14"(#loc28)) +#loc65 = loc("tmp14"(#loc29)) +#loc66 = loc("tmp14"(#loc30)) +#loc67 = loc("tmp14"(#loc31)) +#loc68 = loc("tmp15"(#loc32)) +#loc69 = loc("tmp17"(#loc33)) +#loc70 = loc("tmp18"(#loc34)) +#loc71 = loc(callsite(#loc21 at #loc61)) +#loc72 = loc(callsite(#loc23 at #loc61)) +#loc73 = loc(callsite(#loc24 at #loc61)) +#loc74 = loc(callsite(#loc25 at #loc61)) diff --git a/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.ttir b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..00ba1b95891e8658f6973f6000a7e178586ec370 --- /dev/null +++ b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.ttir @@ -0,0 +1,131 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":18:0) +#loc38 = loc("in_ptr0"(#loc)) +#loc39 = loc("in_ptr1"(#loc)) +#loc40 = loc("out_ptr0"(#loc)) +#loc41 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_cat_mul_silu_split_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %tmp11 = arith.constant dense<1.000000e+00> : tensor<512xf32> loc(#loc71) + %cst = arith.constant dense<0.000000e+00> : tensor<512xbf16> loc(#loc1) + %tmp14 = arith.constant dense<12288> : tensor<512xi32> loc(#loc43) + %cst_0 = arith.constant dense<-4096> : tensor<512xi32> loc(#loc1) + %cst_1 = arith.constant dense<36864> : tensor<512xi32> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc1) + %tmp5 = arith.constant dense<4096> : tensor<512xi32> loc(#loc44) + %cst_3 = arith.constant dense<4096> : tensor<512xi64> loc(#loc1) + %cst_4 = arith.constant dense<16384> : tensor<512xi32> loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc45) + %xoffset_5 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc46) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc47) + %xindex_6 = tt.splat %xoffset_5 : i32 -> tensor<512xi32> loc(#loc48) + %xindex_7 = arith.addi %xindex_6, %xindex : tensor<512xi32> loc(#loc48) + %x0 = arith.remsi %xindex_7, %cst_4 : tensor<512xi32> loc(#loc49) + %x1 = arith.divsi %xindex_7, %cst_4 : tensor<512xi32> loc(#loc50) + %tmp4 = arith.extsi %x0 : tensor<512xi32> to tensor<512xi64> loc(#loc51) + %tmp4_8 = arith.cmpi slt, %tmp4, %cst_3 : tensor<512xi64> loc(#loc51) + %tmp5_9 = arith.muli %x1, %tmp5 : tensor<512xi32> loc(#loc44) + %tmp5_10 = arith.addi %tmp5_9, %x0 : tensor<512xi32> loc(#loc52) + %tmp5_11 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc53) + %tmp5_12 = tt.addptr %tmp5_11, %tmp5_10 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc53) + %tmp5_13 = tt.load %tmp5_12, %tmp4_8, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc54) + %tmp5_14 = arith.extf %tmp5_13 : tensor<512xbf16> to tensor<512xf32> loc(#loc55) + %tmp6 = arith.cmpi sge, %tmp4, %cst_3 : tensor<512xi64> loc(#loc56) + %tmp9 = arith.muli %x1, %cst_1 : tensor<512xi32> loc(#loc57) + %tmp9_15 = arith.addi %x0, %cst_0 : tensor<512xi32> loc(#loc58) + %tmp9_16 = arith.addi %tmp9, %tmp9_15 : tensor<512xi32> loc(#loc59) + %tmp9_17 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc60) + %tmp9_18 = tt.addptr %tmp9_17, %tmp9_16 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc60) + %tmp9_19 = tt.load %tmp9_18, %tmp6, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc61) + %tmp9_20 = arith.extf %tmp9_19 : tensor<512xbf16> to tensor<512xf32> loc(#loc62) + %tmp11_21 = arith.subf %cst_2, %tmp9_20 : tensor<512xf32> loc(#loc72) + %tmp11_22 = math.exp %tmp11_21 : tensor<512xf32> loc(#loc73) + %tmp11_23 = arith.addf %tmp11_22, %tmp11 : tensor<512xf32> loc(#loc74) + %tmp11_24 = arith.divf %tmp11, %tmp11_23 : tensor<512xf32> loc(#loc75) + %tmp12 = arith.mulf %tmp9_20, %tmp11_24 : tensor<512xf32> loc(#loc63) + %tmp14_25 = arith.addi %tmp9, %tmp14 : tensor<512xi32> loc(#loc43) + %tmp14_26 = arith.addi %tmp14_25, %tmp9_15 : tensor<512xi32> loc(#loc64) + %tmp14_27 = tt.addptr %tmp9_17, %tmp14_26 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc65) + %tmp14_28 = tt.load %tmp14_27, %tmp6, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc66) + %tmp14_29 = arith.extf %tmp14_28 : tensor<512xbf16> to tensor<512xf32> loc(#loc67) + %tmp15 = arith.mulf %tmp12, %tmp14_29 : tensor<512xf32> loc(#loc68) + %tmp17 = arith.select %tmp6, %tmp15, %cst_2 : tensor<512xi1>, tensor<512xf32> loc(#loc69) + %tmp18 = arith.select %tmp4_8, %tmp5_14, %tmp17 : tensor<512xi1>, tensor<512xf32> loc(#loc70) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc35) + %1 = tt.addptr %0, %xindex_7 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc35) + %2 = arith.truncf %tmp18 : tensor<512xf32> to tensor<512xbf16> loc(#loc36) + tt.store %1, %2 : tensor<512x!tt.ptr> loc(#loc36) + tt.return loc(#loc37) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":37:23) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:39) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:35) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":20:28) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":20:33) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":21:36) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":21:23) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":23:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":24:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":30:18) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:41) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:30) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:47) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:97) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":32:19) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:36) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:52) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:42) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:30) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:58) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:108) +#loc23 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20) +#loc26 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":38:20) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:51) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:31) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:67) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:117) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":41:20) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":43:34) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":44:33) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:25) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:37) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:4) +#loc42 = loc("tmp11"(#loc2)) +#loc43 = loc("tmp14"(#loc3)) +#loc44 = loc("tmp5"(#loc4)) +#loc45 = loc("xoffset"(#loc5)) +#loc46 = loc("xoffset"(#loc6)) +#loc47 = loc("xindex"(#loc7)) +#loc48 = loc("xindex"(#loc8)) +#loc49 = loc("x0"(#loc9)) +#loc50 = loc("x1"(#loc10)) +#loc51 = loc("tmp4"(#loc11)) +#loc52 = loc("tmp5"(#loc12)) +#loc53 = loc("tmp5"(#loc13)) +#loc54 = loc("tmp5"(#loc14)) +#loc55 = loc("tmp5"(#loc15)) +#loc56 = loc("tmp6"(#loc16)) +#loc57 = loc("tmp9"(#loc17)) +#loc58 = loc("tmp9"(#loc18)) +#loc59 = loc("tmp9"(#loc19)) +#loc60 = loc("tmp9"(#loc20)) +#loc61 = loc("tmp9"(#loc21)) +#loc62 = loc("tmp9"(#loc22)) +#loc63 = loc("tmp12"(#loc27)) +#loc64 = loc("tmp14"(#loc28)) +#loc65 = loc("tmp14"(#loc29)) +#loc66 = loc("tmp14"(#loc30)) +#loc67 = loc("tmp14"(#loc31)) +#loc68 = loc("tmp15"(#loc32)) +#loc69 = loc("tmp17"(#loc33)) +#loc70 = loc("tmp18"(#loc34)) +#loc71 = loc(callsite(#loc1 at #loc42)) +#loc72 = loc(callsite(#loc23 at #loc42)) +#loc73 = loc(callsite(#loc24 at #loc42)) +#loc74 = loc(callsite(#loc25 at #loc42)) +#loc75 = loc(callsite(#loc26 at #loc42)) diff --git a/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/__grp__triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/__grp__triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8eb4e3e423032ffa52dd92e1fbf9861e346160e0 --- /dev/null +++ b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/__grp__triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.source", "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttir", "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttgir", "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.llir", "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ptx", "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.cubin", "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json"}} \ No newline at end of file diff --git a/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.cubin b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.cubin new file mode 100644 index 0000000000000000000000000000000000000000..55293d77a04b6fac639d79ab1f1a84c382320c33 Binary files /dev/null and b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.cubin differ diff --git a/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9061cb26f26e5c0fc6a971a0fc0dc63f2489555a --- /dev/null +++ b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json @@ -0,0 +1 @@ +{"hash": "7c99f681a5ad883c51f7550cebf0d62d696f661722eb7674dd5b2de8d917889d", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3"} \ No newline at end of file diff --git a/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.llir b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.llir new file mode 100644 index 0000000000000000000000000000000000000000..9a16126e00981226fe93ad6fe3acb0ee26d9494b --- /dev/null +++ b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.llir @@ -0,0 +1,493 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, ptr addrspace(1) readnone captures(none) %7, ptr addrspace(1) readnone captures(none) %8) local_unnamed_addr #0 !dbg !4 { + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %11 = shl i32 %10, 10, !dbg !8 + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %13 = shl nuw nsw i32 %12, 3, !dbg !9 + %14 = and i32 %13, 1016, !dbg !9 + %15 = or disjoint i32 %14, %11, !dbg !10 + %16 = or i32 %11, %13, !dbg !9 + %17 = or disjoint i32 %16, 2, !dbg !10 + %18 = or disjoint i32 %16, 4, !dbg !10 + %19 = or disjoint i32 %16, 6, !dbg !10 + %20 = sdiv i32 %15, 128, !dbg !11 + %21 = mul i32 %20, 128, !dbg !12 + %.decomposed = sub i32 %15, %21, !dbg !12 + %22 = srem i32 %17, 128, !dbg !12 + %23 = srem i32 %18, 128, !dbg !12 + %24 = srem i32 %19, 128, !dbg !12 + %25 = sdiv i32 %15, 4096, !dbg !13 + %26 = sext i32 %15 to i64, !dbg !14 + %27 = getelementptr bfloat, ptr addrspace(1) %0, i64 %26, !dbg !14 + %28 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %27) #2, !dbg !15 + %29 = extractvalue { i32, i32, i32, i32 } %28, 0, !dbg !15 + %30 = bitcast i32 %29 to <2 x bfloat>, !dbg !15 + %31 = extractvalue { i32, i32, i32, i32 } %28, 1, !dbg !15 + %32 = bitcast i32 %31 to <2 x bfloat>, !dbg !15 + %33 = extractvalue { i32, i32, i32, i32 } %28, 2, !dbg !15 + %34 = bitcast i32 %33 to <2 x bfloat>, !dbg !15 + %35 = extractvalue { i32, i32, i32, i32 } %28, 3, !dbg !15 + %36 = bitcast i32 %35 to <2 x bfloat>, !dbg !15 + %37 = shl nsw i32 %25, 7, !dbg !16 + %38 = add nsw i32 %37, %.decomposed, !dbg !17 + %39 = add nsw i32 %37, %23, !dbg !17 + %40 = sext i32 %38 to i64, !dbg !18 + %41 = getelementptr float, ptr addrspace(1) %1, i64 %40, !dbg !18 + %42 = sext i32 %39 to i64, !dbg !18 + %43 = getelementptr float, ptr addrspace(1) %1, i64 %42, !dbg !18 + %44 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !19 + %45 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ], $5;", "=r,=r,=r,=r,l,l"(ptr addrspace(1) %41, i64 %44) #2, !dbg !19 + %46 = extractvalue { i32, i32, i32, i32 } %45, 0, !dbg !19 + %47 = extractvalue { i32, i32, i32, i32 } %45, 1, !dbg !19 + %48 = extractvalue { i32, i32, i32, i32 } %45, 2, !dbg !19 + %49 = extractvalue { i32, i32, i32, i32 } %45, 3, !dbg !19 + %50 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !19 + %51 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ], $5;", "=r,=r,=r,=r,l,l"(ptr addrspace(1) %43, i64 %50) #2, !dbg !19 + %52 = extractvalue { i32, i32, i32, i32 } %51, 0, !dbg !19 + %53 = extractvalue { i32, i32, i32, i32 } %51, 1, !dbg !19 + %54 = extractvalue { i32, i32, i32, i32 } %51, 2, !dbg !19 + %55 = extractvalue { i32, i32, i32, i32 } %51, 3, !dbg !19 + %56 = getelementptr float, ptr addrspace(1) %2, i64 %40, !dbg !20 + %57 = getelementptr float, ptr addrspace(1) %2, i64 %42, !dbg !20 + %58 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !21 + %59 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ], $5;", "=r,=r,=r,=r,l,l"(ptr addrspace(1) %56, i64 %58) #2, !dbg !21 + %60 = extractvalue { i32, i32, i32, i32 } %59, 0, !dbg !21 + %61 = extractvalue { i32, i32, i32, i32 } %59, 1, !dbg !21 + %62 = extractvalue { i32, i32, i32, i32 } %59, 2, !dbg !21 + %63 = extractvalue { i32, i32, i32, i32 } %59, 3, !dbg !21 + %64 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !21 + %65 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ], $5;", "=r,=r,=r,=r,l,l"(ptr addrspace(1) %57, i64 %64) #2, !dbg !21 + %66 = extractvalue { i32, i32, i32, i32 } %65, 0, !dbg !21 + %67 = extractvalue { i32, i32, i32, i32 } %65, 1, !dbg !21 + %68 = extractvalue { i32, i32, i32, i32 } %65, 2, !dbg !21 + %69 = extractvalue { i32, i32, i32, i32 } %65, 3, !dbg !21 + %70 = getelementptr bfloat, ptr addrspace(1) %3, i64 %26, !dbg !22 + %71 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %70) #2, !dbg !23 + %72 = extractvalue { i32, i32, i32, i32 } %71, 0, !dbg !23 + %73 = bitcast i32 %72 to <2 x bfloat>, !dbg !23 + %74 = extractvalue { i32, i32, i32, i32 } %71, 1, !dbg !23 + %75 = bitcast i32 %74 to <2 x bfloat>, !dbg !23 + %76 = extractvalue { i32, i32, i32, i32 } %71, 2, !dbg !23 + %77 = bitcast i32 %76 to <2 x bfloat>, !dbg !23 + %78 = extractvalue { i32, i32, i32, i32 } %71, 3, !dbg !23 + %79 = bitcast i32 %78 to <2 x bfloat>, !dbg !23 + %80 = insertelement <4 x i32> poison, i32 %15, i64 0, !dbg !10 + %81 = shufflevector <4 x i32> %80, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !10 + %82 = or disjoint <4 x i32> %81, , !dbg !10 + %83 = extractelement <4 x i32> %82, i64 3, !dbg !12 + %84 = srem i32 %83, 128, !dbg !12 + %85 = extractelement <4 x i32> %82, i64 2, !dbg !12 + %86 = srem i32 %85, 128, !dbg !12 + %87 = extractelement <4 x i32> %82, i64 1, !dbg !12 + %88 = srem i32 %87, 128, !dbg !12 + %89 = extractelement <4 x i32> %82, i64 0, !dbg !12 + %90 = srem i32 %89, 128, !dbg !12 + %91 = srem <4 x i32> %82, splat (i32 2), !dbg !24 + %92 = icmp slt <4 x i32> %91, splat (i32 1), !dbg !25 + %.lhs.trunc = trunc nsw i32 %84 to i8, !dbg !26 + %93 = sdiv i8 %.lhs.trunc, 2, !dbg !26 + %.sext = sext i8 %93 to i32, !dbg !26 + %.lhs.trunc1 = trunc nsw i32 %86 to i8, !dbg !26 + %94 = sdiv i8 %.lhs.trunc1, 2, !dbg !26 + %.sext2 = sext i8 %94 to i32, !dbg !26 + %.lhs.trunc3 = trunc nsw i32 %88 to i8, !dbg !26 + %95 = sdiv i8 %.lhs.trunc3, 2, !dbg !26 + %.sext4 = sext i8 %95 to i32, !dbg !26 + %.lhs.trunc5 = trunc nsw i32 %90 to i8, !dbg !26 + %96 = sdiv i8 %.lhs.trunc5, 2, !dbg !26 + %.sext6 = sext i8 %96 to i32, !dbg !26 + %97 = shl nsw i32 %.sext, 1, !dbg !27 + %98 = shl nsw i32 %.sext2, 1, !dbg !27 + %99 = shl nsw i32 %.sext4, 1, !dbg !27 + %100 = shl nsw i32 %.sext6, 1, !dbg !27 + %101 = or disjoint i32 %.decomposed, 1, !dbg !28 + %102 = or disjoint i32 %22, 1, !dbg !28 + %103 = or disjoint i32 %23, 1, !dbg !28 + %104 = or disjoint i32 %24, 1, !dbg !28 + %105 = shl nsw i32 %20, 7, !dbg !29 + %106 = add i32 %101, %105, !dbg !30 + %107 = or disjoint i32 %105, 1, !dbg !28 + %108 = add i32 %107, %97, !dbg !30 + %109 = add i32 %102, %105, !dbg !30 + %110 = add i32 %107, %98, !dbg !30 + %111 = add i32 %103, %105, !dbg !30 + %112 = add i32 %107, %99, !dbg !30 + %113 = add i32 %104, %105, !dbg !30 + %114 = add i32 %107, %100, !dbg !30 + %115 = sext i32 %106 to i64, !dbg !31 + %116 = getelementptr bfloat, ptr addrspace(1) %0, i64 %115, !dbg !31 + %117 = sext i32 %108 to i64, !dbg !31 + %118 = getelementptr bfloat, ptr addrspace(1) %0, i64 %117, !dbg !31 + %119 = sext i32 %109 to i64, !dbg !31 + %120 = getelementptr bfloat, ptr addrspace(1) %0, i64 %119, !dbg !31 + %121 = sext i32 %110 to i64, !dbg !31 + %122 = getelementptr bfloat, ptr addrspace(1) %0, i64 %121, !dbg !31 + %123 = sext i32 %111 to i64, !dbg !31 + %124 = getelementptr bfloat, ptr addrspace(1) %0, i64 %123, !dbg !31 + %125 = sext i32 %112 to i64, !dbg !31 + %126 = getelementptr bfloat, ptr addrspace(1) %0, i64 %125, !dbg !31 + %127 = sext i32 %113 to i64, !dbg !31 + %128 = getelementptr bfloat, ptr addrspace(1) %0, i64 %127, !dbg !31 + %129 = sext i32 %114 to i64, !dbg !31 + %130 = getelementptr bfloat, ptr addrspace(1) %0, i64 %129, !dbg !31 + %131 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !32 + %132 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %116, i64 %131, i1 true) #2, !dbg !32 + %133 = bitcast i16 %132 to bfloat, !dbg !32 + %134 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !32 + %135 = extractelement <4 x i1> %92, i64 3, !dbg !32 + %136 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %118, i64 %134, i1 %135) #2, !dbg !32 + %137 = bitcast i16 %136 to bfloat, !dbg !32 + %138 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !32 + %139 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %120, i64 %138, i1 true) #2, !dbg !32 + %140 = bitcast i16 %139 to bfloat, !dbg !32 + %141 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !32 + %142 = extractelement <4 x i1> %92, i64 2, !dbg !32 + %143 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %122, i64 %141, i1 %142) #2, !dbg !32 + %144 = bitcast i16 %143 to bfloat, !dbg !32 + %145 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !32 + %146 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %124, i64 %145, i1 true) #2, !dbg !32 + %147 = bitcast i16 %146 to bfloat, !dbg !32 + %148 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !32 + %149 = extractelement <4 x i1> %92, i64 1, !dbg !32 + %150 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %126, i64 %148, i1 %149) #2, !dbg !32 + %151 = bitcast i16 %150 to bfloat, !dbg !32 + %152 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !32 + %153 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %128, i64 %152, i1 true) #2, !dbg !32 + %154 = bitcast i16 %153 to bfloat, !dbg !32 + %155 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !32 + %156 = extractelement <4 x i1> %92, i64 0, !dbg !32 + %157 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %130, i64 %155, i1 %156) #2, !dbg !32 + %158 = bitcast i16 %157 to bfloat, !dbg !32 + %159 = fpext bfloat %133 to float, !dbg !33 + %160 = fpext bfloat %137 to float, !dbg !33 + %161 = fpext bfloat %140 to float, !dbg !33 + %162 = fpext bfloat %144 to float, !dbg !33 + %163 = fpext bfloat %147 to float, !dbg !33 + %164 = fpext bfloat %151 to float, !dbg !33 + %165 = fpext bfloat %154 to float, !dbg !33 + %166 = fpext bfloat %158 to float, !dbg !33 + %167 = fsub float 0.000000e+00, %159, !dbg !34 + %168 = fsub float 0.000000e+00, %160, !dbg !34 + %169 = fsub float 0.000000e+00, %161, !dbg !34 + %170 = fsub float 0.000000e+00, %162, !dbg !34 + %171 = fsub float 0.000000e+00, %163, !dbg !34 + %172 = fsub float 0.000000e+00, %164, !dbg !34 + %173 = fsub float 0.000000e+00, %165, !dbg !34 + %174 = fsub float 0.000000e+00, %166, !dbg !34 + %175 = extractelement <4 x i32> %91, i64 3, !dbg !35 + %176 = icmp sgt i32 %175, 0, !dbg !35 + %177 = extractelement <4 x i32> %91, i64 2, !dbg !35 + %178 = icmp sgt i32 %177, 0, !dbg !35 + %179 = extractelement <4 x i32> %91, i64 1, !dbg !35 + %180 = icmp sgt i32 %179, 0, !dbg !35 + %181 = extractelement <4 x i32> %91, i64 0, !dbg !35 + %182 = icmp sgt i32 %181, 0, !dbg !35 + %183 = add i32 %105, %.decomposed, !dbg !36 + %184 = add i32 %97, %105, !dbg !36 + %185 = add i32 %105, %22, !dbg !36 + %186 = add i32 %98, %105, !dbg !36 + %187 = add i32 %105, %23, !dbg !36 + %188 = add i32 %99, %105, !dbg !36 + %189 = add i32 %105, %24, !dbg !36 + %190 = add i32 %100, %105, !dbg !36 + %191 = sext i32 %183 to i64, !dbg !37 + %192 = getelementptr bfloat, ptr addrspace(1) %0, i64 %191, !dbg !37 + %193 = sext i32 %184 to i64, !dbg !37 + %194 = getelementptr bfloat, ptr addrspace(1) %0, i64 %193, !dbg !37 + %195 = sext i32 %185 to i64, !dbg !37 + %196 = getelementptr bfloat, ptr addrspace(1) %0, i64 %195, !dbg !37 + %197 = sext i32 %186 to i64, !dbg !37 + %198 = getelementptr bfloat, ptr addrspace(1) %0, i64 %197, !dbg !37 + %199 = sext i32 %187 to i64, !dbg !37 + %200 = getelementptr bfloat, ptr addrspace(1) %0, i64 %199, !dbg !37 + %201 = sext i32 %188 to i64, !dbg !37 + %202 = getelementptr bfloat, ptr addrspace(1) %0, i64 %201, !dbg !37 + %203 = sext i32 %189 to i64, !dbg !37 + %204 = getelementptr bfloat, ptr addrspace(1) %0, i64 %203, !dbg !37 + %205 = sext i32 %190 to i64, !dbg !37 + %206 = getelementptr bfloat, ptr addrspace(1) %0, i64 %205, !dbg !37 + %207 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !38 + %208 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %192, i64 %207, i1 false) #2, !dbg !38 + %209 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !38 + %210 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %194, i64 %209, i1 %176) #2, !dbg !38 + %211 = bitcast i16 %210 to bfloat, !dbg !38 + %212 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !38 + %213 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %196, i64 %212, i1 false) #2, !dbg !38 + %214 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !38 + %215 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %198, i64 %214, i1 %178) #2, !dbg !38 + %216 = bitcast i16 %215 to bfloat, !dbg !38 + %217 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !38 + %218 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %200, i64 %217, i1 false) #2, !dbg !38 + %219 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !38 + %220 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %202, i64 %219, i1 %180) #2, !dbg !38 + %221 = bitcast i16 %220 to bfloat, !dbg !38 + %222 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !38 + %223 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %204, i64 %222, i1 false) #2, !dbg !38 + %224 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !38 + %225 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %206, i64 %224, i1 %182) #2, !dbg !38 + %226 = bitcast i16 %225 to bfloat, !dbg !38 + %227 = fpext bfloat %211 to float, !dbg !39 + %228 = fpext bfloat %216 to float, !dbg !39 + %229 = fpext bfloat %221 to float, !dbg !39 + %230 = fpext bfloat %226 to float, !dbg !39 + %231 = select i1 %135, float %168, float %227, !dbg !40 + %232 = select i1 %142, float %170, float %228, !dbg !40 + %233 = select i1 %149, float %172, float %229, !dbg !40 + %234 = select i1 %156, float %174, float %230, !dbg !40 + %235 = getelementptr bfloat, ptr addrspace(1) %3, i64 %115, !dbg !41 + %236 = getelementptr bfloat, ptr addrspace(1) %3, i64 %117, !dbg !41 + %237 = getelementptr bfloat, ptr addrspace(1) %3, i64 %119, !dbg !41 + %238 = getelementptr bfloat, ptr addrspace(1) %3, i64 %121, !dbg !41 + %239 = getelementptr bfloat, ptr addrspace(1) %3, i64 %123, !dbg !41 + %240 = getelementptr bfloat, ptr addrspace(1) %3, i64 %125, !dbg !41 + %241 = getelementptr bfloat, ptr addrspace(1) %3, i64 %127, !dbg !41 + %242 = getelementptr bfloat, ptr addrspace(1) %3, i64 %129, !dbg !41 + %243 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !42 + %244 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %235, i64 %243, i1 true) #2, !dbg !42 + %245 = bitcast i16 %244 to bfloat, !dbg !42 + %246 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !42 + %247 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %236, i64 %246, i1 %135) #2, !dbg !42 + %248 = bitcast i16 %247 to bfloat, !dbg !42 + %249 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !42 + %250 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %237, i64 %249, i1 true) #2, !dbg !42 + %251 = bitcast i16 %250 to bfloat, !dbg !42 + %252 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !42 + %253 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %238, i64 %252, i1 %142) #2, !dbg !42 + %254 = bitcast i16 %253 to bfloat, !dbg !42 + %255 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !42 + %256 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %239, i64 %255, i1 true) #2, !dbg !42 + %257 = bitcast i16 %256 to bfloat, !dbg !42 + %258 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !42 + %259 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %240, i64 %258, i1 %149) #2, !dbg !42 + %260 = bitcast i16 %259 to bfloat, !dbg !42 + %261 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !42 + %262 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %241, i64 %261, i1 true) #2, !dbg !42 + %263 = bitcast i16 %262 to bfloat, !dbg !42 + %264 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !42 + %265 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %242, i64 %264, i1 %156) #2, !dbg !42 + %266 = bitcast i16 %265 to bfloat, !dbg !42 + %267 = fpext bfloat %245 to float, !dbg !43 + %268 = fpext bfloat %248 to float, !dbg !43 + %269 = fpext bfloat %251 to float, !dbg !43 + %270 = fpext bfloat %254 to float, !dbg !43 + %271 = fpext bfloat %257 to float, !dbg !43 + %272 = fpext bfloat %260 to float, !dbg !43 + %273 = fpext bfloat %263 to float, !dbg !43 + %274 = fpext bfloat %266 to float, !dbg !43 + %275 = fsub float 0.000000e+00, %267, !dbg !44 + %276 = fsub float 0.000000e+00, %268, !dbg !44 + %277 = fsub float 0.000000e+00, %269, !dbg !44 + %278 = fsub float 0.000000e+00, %270, !dbg !44 + %279 = fsub float 0.000000e+00, %271, !dbg !44 + %280 = fsub float 0.000000e+00, %272, !dbg !44 + %281 = fsub float 0.000000e+00, %273, !dbg !44 + %282 = fsub float 0.000000e+00, %274, !dbg !44 + %283 = getelementptr bfloat, ptr addrspace(1) %3, i64 %191, !dbg !45 + %284 = getelementptr bfloat, ptr addrspace(1) %3, i64 %193, !dbg !45 + %285 = getelementptr bfloat, ptr addrspace(1) %3, i64 %195, !dbg !45 + %286 = getelementptr bfloat, ptr addrspace(1) %3, i64 %197, !dbg !45 + %287 = getelementptr bfloat, ptr addrspace(1) %3, i64 %199, !dbg !45 + %288 = getelementptr bfloat, ptr addrspace(1) %3, i64 %201, !dbg !45 + %289 = getelementptr bfloat, ptr addrspace(1) %3, i64 %203, !dbg !45 + %290 = getelementptr bfloat, ptr addrspace(1) %3, i64 %205, !dbg !45 + %291 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !46 + %292 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %283, i64 %291, i1 false) #2, !dbg !46 + %293 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !46 + %294 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %284, i64 %293, i1 %176) #2, !dbg !46 + %295 = bitcast i16 %294 to bfloat, !dbg !46 + %296 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !46 + %297 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %285, i64 %296, i1 false) #2, !dbg !46 + %298 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !46 + %299 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %286, i64 %298, i1 %178) #2, !dbg !46 + %300 = bitcast i16 %299 to bfloat, !dbg !46 + %301 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !46 + %302 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %287, i64 %301, i1 false) #2, !dbg !46 + %303 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !46 + %304 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %288, i64 %303, i1 %180) #2, !dbg !46 + %305 = bitcast i16 %304 to bfloat, !dbg !46 + %306 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !46 + %307 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %289, i64 %306, i1 false) #2, !dbg !46 + %308 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !46 + %309 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %290, i64 %308, i1 %182) #2, !dbg !46 + %310 = bitcast i16 %309 to bfloat, !dbg !46 + %311 = fpext bfloat %295 to float, !dbg !47 + %312 = fpext bfloat %300 to float, !dbg !47 + %313 = fpext bfloat %305 to float, !dbg !47 + %314 = fpext bfloat %310 to float, !dbg !47 + %315 = select i1 %135, float %276, float %311, !dbg !40 + %316 = select i1 %142, float %278, float %312, !dbg !40 + %317 = select i1 %149, float %280, float %313, !dbg !40 + %318 = select i1 %156, float %282, float %314, !dbg !40 + %319 = getelementptr bfloat, ptr addrspace(1) %4, i64 %26, !dbg !48 + %320 = fpext <2 x bfloat> %30 to <2 x float>, !dbg !49 + %321 = insertelement <2 x i32> poison, i32 %46, i64 0, !dbg !19 + %322 = insertelement <2 x i32> %321, i32 %47, i64 1, !dbg !19 + %323 = bitcast <2 x i32> %322 to <2 x float>, !dbg !19 + %324 = insertelement <2 x i32> poison, i32 %60, i64 0, !dbg !21 + %325 = insertelement <2 x i32> %324, i32 %61, i64 1, !dbg !21 + %326 = bitcast <2 x i32> %325 to <2 x float>, !dbg !21 + %327 = fmul <2 x float> %320, %323, !dbg !50 + %328 = insertelement <2 x float> poison, float %167, i64 0, !dbg !51 + %329 = insertelement <2 x float> %328, float %231, i64 1, !dbg !51 + %330 = fmul <2 x float> %329, %326, !dbg !51 + %331 = fadd <2 x float> %327, %330, !dbg !52 + %332 = fptrunc <2 x float> %331 to <2 x bfloat>, !dbg !53 + %333 = fpext <2 x bfloat> %32 to <2 x float>, !dbg !49 + %334 = insertelement <2 x i32> poison, i32 %48, i64 0, !dbg !19 + %335 = insertelement <2 x i32> %334, i32 %49, i64 1, !dbg !19 + %336 = bitcast <2 x i32> %335 to <2 x float>, !dbg !19 + %337 = insertelement <2 x i32> poison, i32 %62, i64 0, !dbg !21 + %338 = insertelement <2 x i32> %337, i32 %63, i64 1, !dbg !21 + %339 = bitcast <2 x i32> %338 to <2 x float>, !dbg !21 + %340 = fmul <2 x float> %333, %336, !dbg !50 + %341 = insertelement <2 x float> poison, float %169, i64 0, !dbg !51 + %342 = insertelement <2 x float> %341, float %232, i64 1, !dbg !51 + %343 = fmul <2 x float> %342, %339, !dbg !51 + %344 = fadd <2 x float> %340, %343, !dbg !52 + %345 = fptrunc <2 x float> %344 to <2 x bfloat>, !dbg !53 + %346 = fpext <2 x bfloat> %34 to <2 x float>, !dbg !49 + %347 = insertelement <2 x i32> poison, i32 %52, i64 0, !dbg !19 + %348 = insertelement <2 x i32> %347, i32 %53, i64 1, !dbg !19 + %349 = bitcast <2 x i32> %348 to <2 x float>, !dbg !19 + %350 = insertelement <2 x i32> poison, i32 %66, i64 0, !dbg !21 + %351 = insertelement <2 x i32> %350, i32 %67, i64 1, !dbg !21 + %352 = bitcast <2 x i32> %351 to <2 x float>, !dbg !21 + %353 = fmul <2 x float> %346, %349, !dbg !50 + %354 = insertelement <2 x float> poison, float %171, i64 0, !dbg !51 + %355 = insertelement <2 x float> %354, float %233, i64 1, !dbg !51 + %356 = fmul <2 x float> %355, %352, !dbg !51 + %357 = fadd <2 x float> %353, %356, !dbg !52 + %358 = fptrunc <2 x float> %357 to <2 x bfloat>, !dbg !53 + %359 = fpext <2 x bfloat> %36 to <2 x float>, !dbg !49 + %360 = insertelement <2 x i32> poison, i32 %54, i64 0, !dbg !19 + %361 = insertelement <2 x i32> %360, i32 %55, i64 1, !dbg !19 + %362 = bitcast <2 x i32> %361 to <2 x float>, !dbg !19 + %363 = insertelement <2 x i32> poison, i32 %68, i64 0, !dbg !21 + %364 = insertelement <2 x i32> %363, i32 %69, i64 1, !dbg !21 + %365 = bitcast <2 x i32> %364 to <2 x float>, !dbg !21 + %366 = fmul <2 x float> %359, %362, !dbg !50 + %367 = insertelement <2 x float> poison, float %173, i64 0, !dbg !51 + %368 = insertelement <2 x float> %367, float %234, i64 1, !dbg !51 + %369 = fmul <2 x float> %368, %365, !dbg !51 + %370 = fadd <2 x float> %366, %369, !dbg !52 + %371 = fptrunc <2 x float> %370 to <2 x bfloat>, !dbg !53 + %372 = bitcast <2 x bfloat> %332 to i32, !dbg !53 + %373 = bitcast <2 x bfloat> %345 to i32, !dbg !53 + %374 = bitcast <2 x bfloat> %358 to i32, !dbg !53 + %375 = bitcast <2 x bfloat> %371 to i32, !dbg !53 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %372, i32 %373, i32 %374, i32 %375, ptr addrspace(1) %319) #2, !dbg !53 + %376 = getelementptr bfloat, ptr addrspace(1) %5, i64 %26, !dbg !54 + %377 = fpext <2 x bfloat> %73 to <2 x float>, !dbg !55 + %378 = fmul <2 x float> %323, %377, !dbg !56 + %379 = insertelement <2 x float> poison, float %275, i64 0, !dbg !57 + %380 = insertelement <2 x float> %379, float %315, i64 1, !dbg !57 + %381 = fmul <2 x float> %380, %326, !dbg !57 + %382 = fadd <2 x float> %378, %381, !dbg !58 + %383 = fptrunc <2 x float> %382 to <2 x bfloat>, !dbg !59 + %384 = fpext <2 x bfloat> %75 to <2 x float>, !dbg !55 + %385 = fmul <2 x float> %336, %384, !dbg !56 + %386 = insertelement <2 x float> poison, float %277, i64 0, !dbg !57 + %387 = insertelement <2 x float> %386, float %316, i64 1, !dbg !57 + %388 = fmul <2 x float> %387, %339, !dbg !57 + %389 = fadd <2 x float> %385, %388, !dbg !58 + %390 = fptrunc <2 x float> %389 to <2 x bfloat>, !dbg !59 + %391 = fpext <2 x bfloat> %77 to <2 x float>, !dbg !55 + %392 = fmul <2 x float> %349, %391, !dbg !56 + %393 = insertelement <2 x float> poison, float %279, i64 0, !dbg !57 + %394 = insertelement <2 x float> %393, float %317, i64 1, !dbg !57 + %395 = fmul <2 x float> %394, %352, !dbg !57 + %396 = fadd <2 x float> %392, %395, !dbg !58 + %397 = fptrunc <2 x float> %396 to <2 x bfloat>, !dbg !59 + %398 = fpext <2 x bfloat> %79 to <2 x float>, !dbg !55 + %399 = fmul <2 x float> %362, %398, !dbg !56 + %400 = insertelement <2 x float> poison, float %281, i64 0, !dbg !57 + %401 = insertelement <2 x float> %400, float %318, i64 1, !dbg !57 + %402 = fmul <2 x float> %401, %365, !dbg !57 + %403 = fadd <2 x float> %399, %402, !dbg !58 + %404 = fptrunc <2 x float> %403 to <2 x bfloat>, !dbg !59 + %405 = bitcast <2 x bfloat> %383 to i32, !dbg !59 + %406 = bitcast <2 x bfloat> %390 to i32, !dbg !59 + %407 = bitcast <2 x bfloat> %397 to i32, !dbg !59 + %408 = bitcast <2 x bfloat> %404 to i32, !dbg !59 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %405, i32 %406, i32 %407, i32 %408, ptr addrspace(1) %376) #2, !dbg !59 + ret void, !dbg !60 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3", linkageName: "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 19, scope: !4) +!12 = !DILocation(line: 24, column: 19, scope: !4) +!13 = !DILocation(line: 25, column: 19, scope: !4) +!14 = !DILocation(line: 27, column: 30, scope: !4) +!15 = !DILocation(line: 27, column: 35, scope: !4) +!16 = !DILocation(line: 28, column: 39, scope: !4) +!17 = !DILocation(line: 28, column: 35, scope: !4) +!18 = !DILocation(line: 28, column: 30, scope: !4) +!19 = !DILocation(line: 28, column: 44, scope: !4) +!20 = !DILocation(line: 29, column: 31, scope: !4) +!21 = !DILocation(line: 29, column: 45, scope: !4) +!22 = !DILocation(line: 30, column: 31, scope: !4) +!23 = !DILocation(line: 30, column: 36, scope: !4) +!24 = !DILocation(line: 33, column: 17, scope: !4) +!25 = !DILocation(line: 37, column: 18, scope: !4) +!26 = !DILocation(line: 38, column: 43, scope: !4) +!27 = !DILocation(line: 38, column: 37, scope: !4) +!28 = !DILocation(line: 38, column: 34, scope: !4) +!29 = !DILocation(line: 38, column: 52, scope: !4) +!30 = !DILocation(line: 38, column: 48, scope: !4) +!31 = !DILocation(line: 38, column: 30, scope: !4) +!32 = !DILocation(line: 38, column: 57, scope: !4) +!33 = !DILocation(line: 38, column: 107, scope: !4) +!34 = !DILocation(line: 39, column: 13, scope: !4) +!35 = !DILocation(line: 42, column: 20, scope: !4) +!36 = !DILocation(line: 45, column: 45, scope: !4) +!37 = !DILocation(line: 45, column: 31, scope: !4) +!38 = !DILocation(line: 45, column: 54, scope: !4) +!39 = !DILocation(line: 45, column: 105, scope: !4) +!40 = !DILocation(line: 0, scope: !4) +!41 = !DILocation(line: 53, column: 31, scope: !4) +!42 = !DILocation(line: 53, column: 58, scope: !4) +!43 = !DILocation(line: 53, column: 108, scope: !4) +!44 = !DILocation(line: 54, column: 13, scope: !4) +!45 = !DILocation(line: 57, column: 31, scope: !4) +!46 = !DILocation(line: 57, column: 54, scope: !4) +!47 = !DILocation(line: 57, column: 105, scope: !4) +!48 = !DILocation(line: 63, column: 25, scope: !4) +!49 = !DILocation(line: 27, column: 44, scope: !4) +!50 = !DILocation(line: 32, column: 18, scope: !4) +!51 = !DILocation(line: 48, column: 20, scope: !4) +!52 = !DILocation(line: 49, column: 19, scope: !4) +!53 = !DILocation(line: 63, column: 37, scope: !4) +!54 = !DILocation(line: 64, column: 25, scope: !4) +!55 = !DILocation(line: 30, column: 45, scope: !4) +!56 = !DILocation(line: 52, column: 20, scope: !4) +!57 = !DILocation(line: 60, column: 20, scope: !4) +!58 = !DILocation(line: 61, column: 20, scope: !4) +!59 = !DILocation(line: 64, column: 37, scope: !4) +!60 = !DILocation(line: 64, column: 4, scope: !4) diff --git a/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ptx b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ptx new file mode 100644 index 0000000000000000000000000000000000000000..a8d7f2e65b95579f07f8b739e37d4a36c6983108 --- /dev/null +++ b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ptx @@ -0,0 +1,971 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3 // -- Begin function triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3 + // @triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3 +.visible .entry triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3( + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_4, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_5, + .param .u32 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_6, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_7, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_8 +) +.reqntid 128 +{ + .reg .pred %p<11>; + .reg .b16 %rs<74>; + .reg .b32 %r<208>; + .reg .b64 %rd<99>; + .loc 1 18 0 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:18:0 + +// %bb.0: + ld.param.b64 %rd75, [triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_0]; + ld.param.b64 %rd76, [triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_1]; +$L__tmp0: + .loc 1 20 28 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:20:28 + mov.u32 %r33, %ctaid.x; + .loc 1 20 33 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:20:33 + shl.b32 %r34, %r33, 10; + ld.param.b64 %rd77, [triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_2]; + ld.param.b64 %rd78, [triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_3]; + .loc 1 21 36 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:21:36 + mov.u32 %r35, %tid.x; + shl.b32 %r36, %r35, 3; + ld.param.b64 %rd79, [triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_4]; + and.b32 %r37, %r36, 1016; + ld.param.b64 %rd80, [triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_5]; + .loc 1 21 23 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:21:23 + or.b32 %r38, %r37, %r34; + .loc 1 21 36 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:21:36 + or.b32 %r39, %r34, %r36; + .loc 1 21 23 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:21:23 + or.b32 %r40, %r39, 2; + or.b32 %r41, %r39, 4; + or.b32 %r42, %r39, 6; + .loc 1 26 19 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:26:19 + bfe.s32 %r43, %r33, 21, 1; + .loc 1 24 19 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:24:19 + shr.u32 %r44, %r43, 25; + .loc 1 26 19 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:26:19 + add.s32 %r45, %r38, %r44; + .loc 1 24 19 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:24:19 + and.b32 %r46, %r45, -128; + sub.s32 %r47, %r38, %r46; + add.s32 %r48, %r40, %r44; + and.b32 %r49, %r48, -128; + sub.s32 %r50, %r40, %r49; + add.s32 %r51, %r41, %r44; + and.b32 %r52, %r51, -128; + sub.s32 %r53, %r41, %r52; + add.s32 %r54, %r42, %r44; + and.b32 %r55, %r54, -128; + sub.s32 %r56, %r42, %r55; + .loc 1 25 19 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:25:19 + shr.u32 %r57, %r43, 20; + add.s32 %r58, %r38, %r57; + shr.s32 %r59, %r58, 12; + .loc 1 27 30 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:27:30 + mul.wide.s32 %rd81, %r38, 2; + add.s64 %rd1, %rd75, %rd81; + .loc 1 27 35 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:27:35 + // begin inline asm + mov.u32 %r1, 0x0; + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + ld.global.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ]; + // end inline asm + .loc 1 28 39 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:28:39 + shl.b32 %r60, %r59, 7; + .loc 1 28 35 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:28:35 + add.s32 %r61, %r60, %r47; + add.s32 %r62, %r60, %r53; + .loc 1 28 30 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:28:30 + mul.wide.s32 %rd82, %r61, 4; + add.s64 %rd2, %rd76, %rd82; + mul.wide.s32 %rd83, %r62, 4; + add.s64 %rd4, %rd76, %rd83; + .loc 1 28 44 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:28:44 + // begin inline asm + mov.u64 %rd3, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd3, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r5, 0x0; + mov.u32 %r6, 0x0; + mov.u32 %r7, 0x0; + mov.u32 %r8, 0x0; + ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r5, %r6, %r7, %r8 }, [ %rd2 + 0 ], %rd3; + // end inline asm + // begin inline asm + mov.u64 %rd5, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd5, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r9, 0x0; + mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + mov.u32 %r12, 0x0; + ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r9, %r10, %r11, %r12 }, [ %rd4 + 0 ], %rd5; + // end inline asm + .loc 1 29 31 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:29:31 + add.s64 %rd6, %rd77, %rd82; + add.s64 %rd8, %rd77, %rd83; + .loc 1 29 45 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:29:45 + // begin inline asm + mov.u64 %rd7, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd7, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r13, 0x0; + mov.u32 %r14, 0x0; + mov.u32 %r15, 0x0; + mov.u32 %r16, 0x0; + ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r13, %r14, %r15, %r16 }, [ %rd6 + 0 ], %rd7; + // end inline asm + // begin inline asm + mov.u64 %rd9, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd9, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r17, 0x0; + mov.u32 %r18, 0x0; + mov.u32 %r19, 0x0; + mov.u32 %r20, 0x0; + ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r17, %r18, %r19, %r20 }, [ %rd8 + 0 ], %rd9; + // end inline asm + .loc 1 30 31 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:30:31 + add.s64 %rd10, %rd78, %rd81; + .loc 1 30 36 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:30:36 + // begin inline asm + mov.u32 %r21, 0x0; + mov.u32 %r22, 0x0; + mov.u32 %r23, 0x0; + mov.u32 %r24, 0x0; + ld.global.v4.b32 { %r21, %r22, %r23, %r24 }, [ %rd10 + 0 ]; + // end inline asm + .loc 1 21 23 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:21:23 + or.b32 %r63, %r38, 7; + or.b32 %r64, %r38, 5; + or.b32 %r65, %r38, 3; + or.b32 %r66, %r38, 1; + .loc 1 24 19 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:24:19 + add.s32 %r67, %r66, %r44; + and.b32 %r68, %r67, 65408; + sub.s32 %r69, %r66, %r68; + add.s32 %r70, %r65, %r44; + and.b32 %r71, %r70, 65408; + sub.s32 %r72, %r65, %r71; + add.s32 %r73, %r64, %r44; + and.b32 %r74, %r73, 65408; + sub.s32 %r75, %r64, %r74; + add.s32 %r76, %r63, %r44; + and.b32 %r77, %r76, 65408; + sub.s32 %r78, %r63, %r77; + .loc 1 33 17 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:33:17 + bfe.u32 %r79, %r33, 21, 1; + add.s32 %r80, %r66, %r79; + and.b32 %r81, %r80, -6; + sub.s32 %r82, %r66, %r81; + add.s32 %r83, %r65, %r79; + and.b32 %r84, %r83, -2; + sub.s32 %r85, %r65, %r84; + add.s32 %r86, %r64, %r79; + and.b32 %r87, %r86, -2; + sub.s32 %r88, %r64, %r87; + add.s32 %r89, %r63, %r79; + and.b32 %r90, %r89, -2; + sub.s32 %r91, %r63, %r90; + .loc 1 37 18 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:37:18 + setp.lt.s32 %p10, %r91, 1; + setp.lt.s32 %p9, %r88, 1; + setp.lt.s32 %p8, %r85, 1; + setp.lt.s32 %p7, %r82, 1; + .loc 1 38 43 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:38:43 + cvt.u16.u32 %rs34, %r69; + and.b16 %rs35, %rs34, 128; + shr.u16 %rs36, %rs35, 7; + add.s16 %rs37, %rs34, %rs36; + cvt.s16.s8 %rs38, %rs37; + shr.s16 %rs39, %rs38, 1; + cvt.u16.u32 %rs40, %r72; + and.b16 %rs41, %rs40, 128; + shr.u16 %rs42, %rs41, 7; + add.s16 %rs43, %rs40, %rs42; + cvt.s16.s8 %rs44, %rs43; + shr.s16 %rs45, %rs44, 1; + cvt.u16.u32 %rs46, %r75; + and.b16 %rs47, %rs46, 128; + shr.u16 %rs48, %rs47, 7; + add.s16 %rs49, %rs46, %rs48; + cvt.s16.s8 %rs50, %rs49; + shr.s16 %rs51, %rs50, 1; + cvt.u16.u32 %rs52, %r78; + and.b16 %rs53, %rs52, 128; + shr.u16 %rs54, %rs53, 7; + add.s16 %rs55, %rs52, %rs54; + cvt.s16.s8 %rs56, %rs55; + shr.s16 %rs57, %rs56, 1; + .loc 1 38 37 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:38:37 + mul.wide.s16 %r92, %rs39, 2; + mul.wide.s16 %r93, %rs45, 2; + mul.wide.s16 %r94, %rs51, 2; + mul.wide.s16 %r95, %rs57, 2; + .loc 1 38 34 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:38:34 + or.b32 %r96, %r46, 1; + .loc 1 38 48 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:38:48 + add.s32 %r97, %r96, %r92; + add.s32 %r98, %r46, %r50; + or.b32 %r99, %r98, 1; + add.s32 %r100, %r96, %r93; + add.s32 %r101, %r46, %r53; + or.b32 %r102, %r101, 1; + add.s32 %r103, %r96, %r94; + add.s32 %r104, %r46, %r56; + or.b32 %r105, %r104, 1; + add.s32 %r106, %r96, %r95; + .loc 1 38 30 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:38:30 + mul.wide.s32 %rd84, %r66, 2; + add.s64 %rd11, %rd75, %rd84; + mul.wide.s32 %rd85, %r97, 2; + add.s64 %rd13, %rd75, %rd85; + mul.wide.s32 %rd86, %r99, 2; + add.s64 %rd15, %rd75, %rd86; + mul.wide.s32 %rd87, %r100, 2; + add.s64 %rd17, %rd75, %rd87; + mul.wide.s32 %rd88, %r102, 2; + add.s64 %rd19, %rd75, %rd88; + mul.wide.s32 %rd89, %r103, 2; + add.s64 %rd21, %rd75, %rd89; + mul.wide.s32 %rd90, %r105, 2; + add.s64 %rd23, %rd75, %rd90; + mul.wide.s32 %rd91, %r106, 2; + add.s64 %rd25, %rd75, %rd91; + .loc 1 38 57 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:38:57 + // begin inline asm + mov.u64 %rd12, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd12, 1.0; + // end inline asm + mov.b16 %rs2, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u16 %rs1, %rs2; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs1 }, [ %rd11 + 0 ], %rd12; + // end inline asm + // begin inline asm + mov.u64 %rd14, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd14, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs3, %rs2; + @%p7 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs3 }, [ %rd13 + 0 ], %rd14; + // end inline asm + // begin inline asm + mov.u64 %rd16, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd16, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs4, %rs2; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs4 }, [ %rd15 + 0 ], %rd16; + // end inline asm + // begin inline asm + mov.u64 %rd18, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd18, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs5, %rs2; + @%p8 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs5 }, [ %rd17 + 0 ], %rd18; + // end inline asm + // begin inline asm + mov.u64 %rd20, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd20, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs6, %rs2; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs6 }, [ %rd19 + 0 ], %rd20; + // end inline asm + // begin inline asm + mov.u64 %rd22, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd22, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs7, %rs2; + @%p9 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs7 }, [ %rd21 + 0 ], %rd22; + // end inline asm + // begin inline asm + mov.u64 %rd24, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd24, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs8, %rs2; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs8 }, [ %rd23 + 0 ], %rd24; + // end inline asm + // begin inline asm + mov.u64 %rd26, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd26, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs9, %rs2; + @%p10 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd25 + 0 ], %rd26; + // end inline asm + .loc 1 38 107 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:38:107 + cvt.f32.bf16 %r107, %rs1; + cvt.f32.bf16 %r108, %rs3; + cvt.f32.bf16 %r109, %rs4; + cvt.f32.bf16 %r110, %rs5; + cvt.f32.bf16 %r111, %rs6; + cvt.f32.bf16 %r112, %rs7; + cvt.f32.bf16 %r113, %rs8; + cvt.f32.bf16 %r114, %rs9; + mov.b32 %r115, 0f00000000; + .loc 1 39 13 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:39:13 + sub.f32 %r116, %r115, %r107; + sub.f32 %r117, %r115, %r108; + sub.f32 %r118, %r115, %r109; + sub.f32 %r119, %r115, %r110; + sub.f32 %r120, %r115, %r111; + sub.f32 %r121, %r115, %r112; + sub.f32 %r122, %r115, %r113; + sub.f32 %r123, %r115, %r114; + .loc 1 42 20 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:42:20 + setp.gt.s32 %p3, %r82, 0; + setp.gt.s32 %p4, %r85, 0; + setp.gt.s32 %p5, %r88, 0; + setp.gt.s32 %p6, %r91, 0; + .loc 1 45 45 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:45:45 + add.s32 %r124, %r92, %r46; + add.s32 %r125, %r93, %r46; + add.s32 %r126, %r94, %r46; + add.s32 %r127, %r95, %r46; + .loc 1 45 31 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:45:31 + mul.wide.s32 %rd92, %r124, 2; + add.s64 %rd28, %rd75, %rd92; + mul.wide.s32 %rd93, %r98, 2; + add.s64 %rd30, %rd75, %rd93; + mul.wide.s32 %rd94, %r125, 2; + add.s64 %rd32, %rd75, %rd94; + mul.wide.s32 %rd95, %r101, 2; + add.s64 %rd34, %rd75, %rd95; + mul.wide.s32 %rd96, %r126, 2; + add.s64 %rd36, %rd75, %rd96; + mul.wide.s32 %rd97, %r104, 2; + add.s64 %rd38, %rd75, %rd97; + mul.wide.s32 %rd98, %r127, 2; + add.s64 %rd40, %rd75, %rd98; + .loc 1 45 54 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:45:54 + // begin inline asm + mov.u64 %rd27, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd27, 1.0; + // end inline asm + mov.pred %p2, 0; + // begin inline asm + mov.u16 %rs10, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs10 }, [ %rd1 + 0 ], %rd27; + // end inline asm + // begin inline asm + mov.u64 %rd29, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd29, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs11, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd28 + 0 ], %rd29; + // end inline asm + // begin inline asm + mov.u64 %rd31, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd31, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs12, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd30 + 0 ], %rd31; + // end inline asm + // begin inline asm + mov.u64 %rd33, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd33, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs13, %rs2; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd32 + 0 ], %rd33; + // end inline asm + // begin inline asm + mov.u64 %rd35, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd35, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs14, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs14 }, [ %rd34 + 0 ], %rd35; + // end inline asm + // begin inline asm + mov.u64 %rd37, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd37, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs15, %rs2; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd36 + 0 ], %rd37; + // end inline asm + // begin inline asm + mov.u64 %rd39, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd39, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs16, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs16 }, [ %rd38 + 0 ], %rd39; + // end inline asm + // begin inline asm + mov.u64 %rd41, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd41, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs17, %rs2; + @%p6 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd40 + 0 ], %rd41; + // end inline asm + .loc 1 45 105 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:45:105 + cvt.f32.bf16 %r128, %rs11; + cvt.f32.bf16 %r129, %rs13; + cvt.f32.bf16 %r130, %rs15; + cvt.f32.bf16 %r131, %rs17; + .loc 1 0 0 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:0 + selp.f32 %r132, %r117, %r128, %p7; + selp.f32 %r133, %r119, %r129, %p8; + selp.f32 %r134, %r121, %r130, %p9; + selp.f32 %r135, %r123, %r131, %p10; + .loc 1 53 31 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:53:31 + add.s64 %rd42, %rd78, %rd84; + add.s64 %rd44, %rd78, %rd85; + add.s64 %rd46, %rd78, %rd86; + add.s64 %rd48, %rd78, %rd87; + add.s64 %rd50, %rd78, %rd88; + add.s64 %rd52, %rd78, %rd89; + add.s64 %rd54, %rd78, %rd90; + add.s64 %rd56, %rd78, %rd91; + .loc 1 53 58 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:53:58 + // begin inline asm + mov.u64 %rd43, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd43, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs18, %rs2; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs18 }, [ %rd42 + 0 ], %rd43; + // end inline asm + // begin inline asm + mov.u64 %rd45, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd45, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs19, %rs2; + @%p7 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd44 + 0 ], %rd45; + // end inline asm + // begin inline asm + mov.u64 %rd47, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd47, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs20, %rs2; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs20 }, [ %rd46 + 0 ], %rd47; + // end inline asm + // begin inline asm + mov.u64 %rd49, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd49, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs21, %rs2; + @%p8 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs21 }, [ %rd48 + 0 ], %rd49; + // end inline asm + // begin inline asm + mov.u64 %rd51, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd51, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs22, %rs2; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs22 }, [ %rd50 + 0 ], %rd51; + // end inline asm + // begin inline asm + mov.u64 %rd53, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd53, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs23, %rs2; + @%p9 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs23 }, [ %rd52 + 0 ], %rd53; + // end inline asm + // begin inline asm + mov.u64 %rd55, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd55, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs24, %rs2; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs24 }, [ %rd54 + 0 ], %rd55; + // end inline asm + // begin inline asm + mov.u64 %rd57, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd57, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs25, %rs2; + @%p10 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs25 }, [ %rd56 + 0 ], %rd57; + // end inline asm + .loc 1 53 108 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:53:108 + cvt.f32.bf16 %r136, %rs18; + cvt.f32.bf16 %r137, %rs19; + cvt.f32.bf16 %r138, %rs20; + cvt.f32.bf16 %r139, %rs21; + cvt.f32.bf16 %r140, %rs22; + cvt.f32.bf16 %r141, %rs23; + cvt.f32.bf16 %r142, %rs24; + cvt.f32.bf16 %r143, %rs25; + .loc 1 54 13 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:54:13 + sub.f32 %r144, %r115, %r136; + sub.f32 %r145, %r115, %r137; + sub.f32 %r146, %r115, %r138; + sub.f32 %r147, %r115, %r139; + sub.f32 %r148, %r115, %r140; + sub.f32 %r149, %r115, %r141; + sub.f32 %r150, %r115, %r142; + sub.f32 %r151, %r115, %r143; + .loc 1 57 31 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:57:31 + add.s64 %rd59, %rd78, %rd92; + add.s64 %rd61, %rd78, %rd93; + add.s64 %rd63, %rd78, %rd94; + add.s64 %rd65, %rd78, %rd95; + add.s64 %rd67, %rd78, %rd96; + add.s64 %rd69, %rd78, %rd97; + add.s64 %rd71, %rd78, %rd98; + .loc 1 57 54 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:57:54 + // begin inline asm + mov.u64 %rd58, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd58, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs26, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs26 }, [ %rd10 + 0 ], %rd58; + // end inline asm + // begin inline asm + mov.u64 %rd60, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd60, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs27, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs27 }, [ %rd59 + 0 ], %rd60; + // end inline asm + // begin inline asm + mov.u64 %rd62, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd62, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs28, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs28 }, [ %rd61 + 0 ], %rd62; + // end inline asm + // begin inline asm + mov.u64 %rd64, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd64, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs29, %rs2; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs29 }, [ %rd63 + 0 ], %rd64; + // end inline asm + // begin inline asm + mov.u64 %rd66, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd66, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs30, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs30 }, [ %rd65 + 0 ], %rd66; + // end inline asm + // begin inline asm + mov.u64 %rd68, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd68, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs31, %rs2; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs31 }, [ %rd67 + 0 ], %rd68; + // end inline asm + // begin inline asm + mov.u64 %rd70, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd70, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs32, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs32 }, [ %rd69 + 0 ], %rd70; + // end inline asm + // begin inline asm + mov.u64 %rd72, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd72, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs33, %rs2; + @%p6 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs33 }, [ %rd71 + 0 ], %rd72; + // end inline asm + .loc 1 57 105 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:57:105 + cvt.f32.bf16 %r152, %rs27; + cvt.f32.bf16 %r153, %rs29; + cvt.f32.bf16 %r154, %rs31; + cvt.f32.bf16 %r155, %rs33; + .loc 1 0 0 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:0 + selp.f32 %r156, %r145, %r152, %p7; + selp.f32 %r157, %r147, %r153, %p8; + selp.f32 %r158, %r149, %r154, %p9; + selp.f32 %r159, %r151, %r155, %p10; + .loc 1 63 25 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:63:25 + add.s64 %rd73, %rd79, %rd81; + .loc 1 27 44 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:27:44 + mov.b32 {%rs58, %rs59}, %r1; + cvt.f32.bf16 %r160, %rs58; + cvt.f32.bf16 %r161, %rs59; + .loc 1 48 20 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:48:20 + mul.f32 %r162, %r132, %r14; + mul.f32 %r163, %r116, %r13; + .loc 1 49 19 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:49:19 + fma.rn.f32 %r164, %r161, %r6, %r162; + fma.rn.f32 %r165, %r160, %r5, %r163; + .loc 1 63 37 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:63:37 + cvt.rn.bf16x2.f32 %r25, %r164, %r165; + .loc 1 27 44 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:27:44 + mov.b32 {%rs60, %rs61}, %r2; + cvt.f32.bf16 %r166, %rs60; + cvt.f32.bf16 %r167, %rs61; + .loc 1 48 20 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:48:20 + mul.f32 %r168, %r133, %r16; + mul.f32 %r169, %r118, %r15; + .loc 1 49 19 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:49:19 + fma.rn.f32 %r170, %r167, %r8, %r168; + fma.rn.f32 %r171, %r166, %r7, %r169; + .loc 1 63 37 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:63:37 + cvt.rn.bf16x2.f32 %r26, %r170, %r171; + .loc 1 27 44 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:27:44 + mov.b32 {%rs62, %rs63}, %r3; + cvt.f32.bf16 %r172, %rs62; + cvt.f32.bf16 %r173, %rs63; + .loc 1 48 20 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:48:20 + mul.f32 %r174, %r134, %r18; + mul.f32 %r175, %r120, %r17; + .loc 1 49 19 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:49:19 + fma.rn.f32 %r176, %r173, %r10, %r174; + fma.rn.f32 %r177, %r172, %r9, %r175; + .loc 1 63 37 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:63:37 + cvt.rn.bf16x2.f32 %r27, %r176, %r177; + .loc 1 27 44 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:27:44 + mov.b32 {%rs64, %rs65}, %r4; + cvt.f32.bf16 %r178, %rs64; + cvt.f32.bf16 %r179, %rs65; + .loc 1 48 20 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:48:20 + mul.f32 %r180, %r135, %r20; + mul.f32 %r181, %r122, %r19; + .loc 1 49 19 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:49:19 + fma.rn.f32 %r182, %r179, %r12, %r180; + fma.rn.f32 %r183, %r178, %r11, %r181; + .loc 1 63 37 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:63:37 + cvt.rn.bf16x2.f32 %r28, %r182, %r183; + // begin inline asm + st.global.v4.b32 [ %rd73 + 0 ], { %r25, %r26, %r27, %r28 }; + // end inline asm + .loc 1 64 25 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:64:25 + add.s64 %rd74, %rd80, %rd81; + .loc 1 30 45 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:30:45 + mov.b32 {%rs66, %rs67}, %r21; + cvt.f32.bf16 %r184, %rs66; + cvt.f32.bf16 %r185, %rs67; + .loc 1 60 20 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:60:20 + mul.f32 %r186, %r156, %r14; + mul.f32 %r187, %r144, %r13; + .loc 1 61 20 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:61:20 + fma.rn.f32 %r188, %r6, %r185, %r186; + fma.rn.f32 %r189, %r5, %r184, %r187; + .loc 1 64 37 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:64:37 + cvt.rn.bf16x2.f32 %r29, %r188, %r189; + .loc 1 30 45 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:30:45 + mov.b32 {%rs68, %rs69}, %r22; + cvt.f32.bf16 %r190, %rs68; + cvt.f32.bf16 %r191, %rs69; + .loc 1 60 20 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:60:20 + mul.f32 %r192, %r157, %r16; + mul.f32 %r193, %r146, %r15; + .loc 1 61 20 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:61:20 + fma.rn.f32 %r194, %r8, %r191, %r192; + fma.rn.f32 %r195, %r7, %r190, %r193; + .loc 1 64 37 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:64:37 + cvt.rn.bf16x2.f32 %r30, %r194, %r195; + .loc 1 30 45 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:30:45 + mov.b32 {%rs70, %rs71}, %r23; + cvt.f32.bf16 %r196, %rs70; + cvt.f32.bf16 %r197, %rs71; + .loc 1 60 20 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:60:20 + mul.f32 %r198, %r158, %r18; + mul.f32 %r199, %r148, %r17; + .loc 1 61 20 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:61:20 + fma.rn.f32 %r200, %r10, %r197, %r198; + fma.rn.f32 %r201, %r9, %r196, %r199; + .loc 1 64 37 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:64:37 + cvt.rn.bf16x2.f32 %r31, %r200, %r201; + .loc 1 30 45 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:30:45 + mov.b32 {%rs72, %rs73}, %r24; + cvt.f32.bf16 %r202, %rs72; + cvt.f32.bf16 %r203, %rs73; + .loc 1 60 20 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:60:20 + mul.f32 %r204, %r159, %r20; + mul.f32 %r205, %r150, %r19; + .loc 1 61 20 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:61:20 + fma.rn.f32 %r206, %r12, %r203, %r204; + fma.rn.f32 %r207, %r11, %r202, %r205; + .loc 1 64 37 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:64:37 + cvt.rn.bf16x2.f32 %r32, %r206, %r207; + // begin inline asm + st.global.v4.b32 [ %rd74 + 0 ], { %r29, %r30, %r31, %r32 }; + // end inline asm + .loc 1 64 4 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:64:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 106 +.b8 54 +.b8 54 +.b8 116 +.b8 103 +.b8 98 +.b8 102 +.b8 113 +.b8 120 +.b8 55 +.b8 114 +.b8 104 +.b8 121 +.b8 116 +.b8 99 +.b8 121 +.b8 119 +.b8 109 +.b8 106 +.b8 100 +.b8 99 +.b8 105 +.b8 109 +.b8 110 +.b8 119 +.b8 119 +.b8 116 +.b8 113 +.b8 54 +.b8 120 +.b8 106 +.b8 103 +.b8 98 +.b8 50 +.b8 113 +.b8 98 +.b8 113 +.b8 98 +.b8 120 +.b8 120 +.b8 111 +.b8 110 +.b8 97 +.b8 108 +.b8 100 +.b8 111 +.b8 116 +.b8 120 +.b8 54 +.b8 51 +.b8 118 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 106 +.b8 54 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.source b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.source new file mode 100644 index 0000000000000000000000000000000000000000..8cc7749d0e4718068c7d84bb8b1835729b30272c --- /dev/null +++ b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.source @@ -0,0 +1,352 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":18:0) +#loc81 = loc("in_ptr0"(#loc)) +#loc82 = loc("in_ptr1"(#loc)) +#loc83 = loc("in_ptr2"(#loc)) +#loc84 = loc("in_ptr3"(#loc)) +#loc85 = loc("out_ptr0"(#loc)) +#loc86 = loc("out_ptr1"(#loc)) +#loc87 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 9437184 : i32 loc(#loc88) + %xoffset = tt.get_program_id x : i32 loc(#loc89) + %xoffset_1 = arith.constant 1024 : i32 loc(#loc90) + %xoffset_2 = arith.constant 1024 : i32 loc(#loc90) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc90) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc91) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc92) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc92) + %xmask = arith.constant true loc(#loc93) + %xmask_6 = arith.constant dense : tensor<1024xi1> loc(#loc93) + %x0 = arith.constant 128 : i32 loc(#loc94) + %x0_7 = arith.constant 128 : i32 loc(#loc94) + %x0_8 = arith.constant dense<128> : tensor<1024xi32> loc(#loc94) + %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<1024xi32> loc(#loc94) + %x2 = arith.constant 4096 : i32 loc(#loc95) + %x2_10 = arith.constant 4096 : i32 loc(#loc95) + %x2_11 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc95) + %x2_12 = arith.divsi %xindex_5, %x2_11 : tensor<1024xi32> loc(#loc95) + %x4 = arith.constant 128 : i32 loc(#loc96) + %x4_13 = arith.constant 128 : i32 loc(#loc96) + %x4_14 = arith.constant dense<128> : tensor<1024xi32> loc(#loc96) + %x4_15 = arith.divsi %xindex_5, %x4_14 : tensor<1024xi32> loc(#loc96) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc97) + %tmp0_16 = tt.addptr %tmp0, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc97) + %tmp0_17 = tt.load %tmp0_16 : tensor<1024x!tt.ptr> loc(#loc98) + %tmp0_18 = arith.extf %tmp0_17 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc99) + %tmp2 = arith.constant 128 : i32 loc(#loc100) + %tmp2_19 = arith.constant 128 : i32 loc(#loc100) + %tmp2_20 = arith.constant dense<128> : tensor<1024xi32> loc(#loc100) + %tmp2_21 = arith.muli %tmp2_20, %x2_12 : tensor<1024xi32> loc(#loc100) + %tmp2_22 = arith.addi %x0_9, %tmp2_21 : tensor<1024xi32> loc(#loc101) + %tmp2_23 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc102) + %tmp2_24 = tt.addptr %tmp2_23, %tmp2_22 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc102) + %tmp2_25 = tt.load %tmp2_24 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc103) + %tmp19 = arith.constant 128 : i32 loc(#loc104) + %tmp19_26 = arith.constant 128 : i32 loc(#loc104) + %tmp19_27 = arith.constant dense<128> : tensor<1024xi32> loc(#loc104) + %tmp19_28 = arith.muli %tmp19_27, %x2_12 : tensor<1024xi32> loc(#loc104) + %tmp19_29 = arith.addi %x0_9, %tmp19_28 : tensor<1024xi32> loc(#loc105) + %tmp19_30 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc106) + %tmp19_31 = tt.addptr %tmp19_30, %tmp19_29 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc106) + %tmp19_32 = tt.load %tmp19_31 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc107) + %tmp23 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc108) + %tmp23_33 = tt.addptr %tmp23, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc108) + %tmp23_34 = tt.load %tmp23_33 : tensor<1024x!tt.ptr> loc(#loc109) + %tmp23_35 = arith.extf %tmp23_34 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc110) + %tmp3 = arith.mulf %tmp0_18, %tmp2_25 : tensor<1024xf32> loc(#loc111) + %tmp4 = arith.constant 2 : i32 loc(#loc112) + %tmp4_36 = arith.constant 2 : i32 loc(#loc112) + %tmp4_37 = arith.constant dense<2> : tensor<1024xi32> loc(#loc112) + %tmp4_38 = arith.remsi %xindex_5, %tmp4_37 : tensor<1024xi32> loc(#loc112) + %tmp5 = arith.constant 0 : i64 loc(#loc113) + %tmp5_39 = arith.constant dense<0> : tensor<1xi64> loc(#loc113) + %tmp6 = arith.extsi %tmp4_38 : tensor<1024xi32> to tensor<1024xi64> loc(#loc114) + %tmp6_40 = arith.constant dense<0> : tensor<1024xi64> loc(#loc114) + %tmp6_41 = arith.cmpi sge, %tmp6, %tmp6_40 : tensor<1024xi64> loc(#loc114) + %tmp7 = arith.constant 1 : i64 loc(#loc115) + %tmp7_42 = arith.constant dense<1> : tensor<1xi64> loc(#loc115) + %tmp8 = arith.extsi %tmp4_38 : tensor<1024xi32> to tensor<1024xi64> loc(#loc116) + %tmp8_43 = arith.constant dense<1> : tensor<1024xi64> loc(#loc116) + %tmp8_44 = arith.cmpi slt, %tmp8, %tmp8_43 : tensor<1024xi64> loc(#loc116) + %tmp9 = arith.constant 2 : i32 loc(#loc117) + %tmp9_45 = arith.constant 2 : i32 loc(#loc117) + %tmp9_46 = arith.constant dense<2> : tensor<1024xi32> loc(#loc117) + %tmp9_47 = arith.divsi %x0_9, %tmp9_46 : tensor<1024xi32> loc(#loc117) + %tmp9_48 = arith.constant 2 : i32 loc(#loc118) + %tmp9_49 = arith.constant 2 : i32 loc(#loc118) + %tmp9_50 = arith.constant dense<2> : tensor<1024xi32> loc(#loc118) + %tmp9_51 = arith.muli %tmp9_50, %tmp9_47 : tensor<1024xi32> loc(#loc118) + %tmp9_52 = arith.constant 1 : i32 loc(#loc119) + %tmp9_53 = arith.constant 1 : i32 loc(#loc119) + %tmp9_54 = arith.constant dense<1> : tensor<1024xi32> loc(#loc119) + %tmp9_55 = arith.addi %tmp9_54, %tmp9_51 : tensor<1024xi32> loc(#loc119) + %tmp9_56 = arith.constant 128 : i32 loc(#loc120) + %tmp9_57 = arith.constant 128 : i32 loc(#loc120) + %tmp9_58 = arith.constant dense<128> : tensor<1024xi32> loc(#loc120) + %tmp9_59 = arith.muli %tmp9_58, %x4_15 : tensor<1024xi32> loc(#loc120) + %tmp9_60 = arith.addi %tmp9_55, %tmp9_59 : tensor<1024xi32> loc(#loc121) + %tmp9_61 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc122) + %tmp9_62 = tt.addptr %tmp9_61, %tmp9_60 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc122) + %tmp9_63 = arith.constant 0.000000e+00 : f32 loc(#loc123) + %tmp9_64 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc123) + %tmp9_65 = arith.truncf %tmp9_64 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc123) + %tmp9_66 = tt.load %tmp9_62, %tmp8_44, %tmp9_65 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc123) + %tmp9_67 = arith.extf %tmp9_66 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc124) + %tmp10 = arith.constant 0.000000e+00 : f32 loc(#loc125) + %tmp10_68 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc125) + %tmp10_69 = arith.subf %tmp10_68, %tmp9_67 : tensor<1024xf32> loc(#loc125) + %tmp11 = arith.constant 0.000000e+00 : f32 loc(#loc126) + %tmp11_70 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc126) + %tmp12 = arith.select %tmp8_44, %tmp10_69, %tmp11_70 : tensor<1024xi1>, tensor<1024xf32> loc(#loc127) + %tmp13 = arith.extsi %tmp4_38 : tensor<1024xi32> to tensor<1024xi64> loc(#loc128) + %tmp13_71 = arith.constant dense<1> : tensor<1024xi64> loc(#loc128) + %tmp13_72 = arith.cmpi sge, %tmp13, %tmp13_71 : tensor<1024xi64> loc(#loc128) + %tmp14 = arith.constant 2 : i64 loc(#loc129) + %tmp14_73 = arith.constant dense<2> : tensor<1xi64> loc(#loc129) + %tmp15 = arith.extsi %tmp4_38 : tensor<1024xi32> to tensor<1024xi64> loc(#loc130) + %tmp15_74 = arith.constant dense<2> : tensor<1024xi64> loc(#loc130) + %tmp15_75 = arith.cmpi slt, %tmp15, %tmp15_74 : tensor<1024xi64> loc(#loc130) + %tmp16 = arith.constant 2 : i32 loc(#loc131) + %tmp16_76 = arith.constant 2 : i32 loc(#loc131) + %tmp16_77 = arith.constant dense<2> : tensor<1024xi32> loc(#loc131) + %tmp16_78 = arith.divsi %x0_9, %tmp16_77 : tensor<1024xi32> loc(#loc131) + %tmp16_79 = arith.constant 2 : i32 loc(#loc132) + %tmp16_80 = arith.constant 2 : i32 loc(#loc132) + %tmp16_81 = arith.constant dense<2> : tensor<1024xi32> loc(#loc132) + %tmp16_82 = arith.muli %tmp16_81, %tmp16_78 : tensor<1024xi32> loc(#loc132) + %tmp16_83 = arith.constant 128 : i32 loc(#loc133) + %tmp16_84 = arith.constant 128 : i32 loc(#loc133) + %tmp16_85 = arith.constant dense<128> : tensor<1024xi32> loc(#loc133) + %tmp16_86 = arith.muli %tmp16_85, %x4_15 : tensor<1024xi32> loc(#loc133) + %tmp16_87 = arith.addi %tmp16_82, %tmp16_86 : tensor<1024xi32> loc(#loc134) + %tmp16_88 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc135) + %tmp16_89 = tt.addptr %tmp16_88, %tmp16_87 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc135) + %tmp16_90 = arith.constant 0.000000e+00 : f32 loc(#loc136) + %tmp16_91 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc136) + %tmp16_92 = arith.truncf %tmp16_91 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc136) + %tmp16_93 = tt.load %tmp16_89, %tmp13_72, %tmp16_92 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc136) + %tmp16_94 = arith.extf %tmp16_93 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc137) + %tmp17 = arith.select %tmp8_44, %tmp12, %tmp16_94 : tensor<1024xi1>, tensor<1024xf32> loc(#loc138) + %tmp20 = arith.mulf %tmp17, %tmp19_32 : tensor<1024xf32> loc(#loc139) + %tmp21 = arith.addf %tmp3, %tmp20 : tensor<1024xf32> loc(#loc140) + %tmp25 = arith.mulf %tmp23_35, %tmp2_25 : tensor<1024xf32> loc(#loc141) + %tmp26 = arith.constant 2 : i32 loc(#loc142) + %tmp26_95 = arith.constant 2 : i32 loc(#loc142) + %tmp26_96 = arith.constant dense<2> : tensor<1024xi32> loc(#loc142) + %tmp26_97 = arith.divsi %x0_9, %tmp26_96 : tensor<1024xi32> loc(#loc142) + %tmp26_98 = arith.constant 2 : i32 loc(#loc143) + %tmp26_99 = arith.constant 2 : i32 loc(#loc143) + %tmp26_100 = arith.constant dense<2> : tensor<1024xi32> loc(#loc143) + %tmp26_101 = arith.muli %tmp26_100, %tmp26_97 : tensor<1024xi32> loc(#loc143) + %tmp26_102 = arith.constant 1 : i32 loc(#loc144) + %tmp26_103 = arith.constant 1 : i32 loc(#loc144) + %tmp26_104 = arith.constant dense<1> : tensor<1024xi32> loc(#loc144) + %tmp26_105 = arith.addi %tmp26_104, %tmp26_101 : tensor<1024xi32> loc(#loc144) + %tmp26_106 = arith.constant 128 : i32 loc(#loc145) + %tmp26_107 = arith.constant 128 : i32 loc(#loc145) + %tmp26_108 = arith.constant dense<128> : tensor<1024xi32> loc(#loc145) + %tmp26_109 = arith.muli %tmp26_108, %x4_15 : tensor<1024xi32> loc(#loc145) + %tmp26_110 = arith.addi %tmp26_105, %tmp26_109 : tensor<1024xi32> loc(#loc146) + %tmp26_111 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc147) + %tmp26_112 = tt.addptr %tmp26_111, %tmp26_110 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc147) + %tmp26_113 = arith.constant 0.000000e+00 : f32 loc(#loc148) + %tmp26_114 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc148) + %tmp26_115 = arith.truncf %tmp26_114 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc148) + %tmp26_116 = tt.load %tmp26_112, %tmp8_44, %tmp26_115 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc148) + %tmp26_117 = arith.extf %tmp26_116 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc149) + %tmp27 = arith.constant 0.000000e+00 : f32 loc(#loc150) + %tmp27_118 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc150) + %tmp27_119 = arith.subf %tmp27_118, %tmp26_117 : tensor<1024xf32> loc(#loc150) + %tmp28 = arith.constant 0.000000e+00 : f32 loc(#loc151) + %tmp28_120 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc151) + %tmp29 = arith.select %tmp8_44, %tmp27_119, %tmp28_120 : tensor<1024xi1>, tensor<1024xf32> loc(#loc152) + %tmp30 = arith.constant 2 : i32 loc(#loc153) + %tmp30_121 = arith.constant 2 : i32 loc(#loc153) + %tmp30_122 = arith.constant dense<2> : tensor<1024xi32> loc(#loc153) + %tmp30_123 = arith.divsi %x0_9, %tmp30_122 : tensor<1024xi32> loc(#loc153) + %tmp30_124 = arith.constant 2 : i32 loc(#loc154) + %tmp30_125 = arith.constant 2 : i32 loc(#loc154) + %tmp30_126 = arith.constant dense<2> : tensor<1024xi32> loc(#loc154) + %tmp30_127 = arith.muli %tmp30_126, %tmp30_123 : tensor<1024xi32> loc(#loc154) + %tmp30_128 = arith.constant 128 : i32 loc(#loc155) + %tmp30_129 = arith.constant 128 : i32 loc(#loc155) + %tmp30_130 = arith.constant dense<128> : tensor<1024xi32> loc(#loc155) + %tmp30_131 = arith.muli %tmp30_130, %x4_15 : tensor<1024xi32> loc(#loc155) + %tmp30_132 = arith.addi %tmp30_127, %tmp30_131 : tensor<1024xi32> loc(#loc156) + %tmp30_133 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc157) + %tmp30_134 = tt.addptr %tmp30_133, %tmp30_132 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc157) + %tmp30_135 = arith.constant 0.000000e+00 : f32 loc(#loc158) + %tmp30_136 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc158) + %tmp30_137 = arith.truncf %tmp30_136 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc158) + %tmp30_138 = tt.load %tmp30_134, %tmp13_72, %tmp30_137 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc158) + %tmp30_139 = arith.extf %tmp30_138 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc159) + %tmp31 = arith.select %tmp8_44, %tmp29, %tmp30_139 : tensor<1024xi1>, tensor<1024xf32> loc(#loc160) + %tmp33 = arith.mulf %tmp31, %tmp19_32 : tensor<1024xf32> loc(#loc161) + %tmp34 = arith.addf %tmp25, %tmp33 : tensor<1024xf32> loc(#loc162) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc76) + %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc76) + %2 = arith.truncf %tmp21 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc77) + tt.store %1, %2 : tensor<1024x!tt.ptr> loc(#loc77) + %3 = tt.splat %out_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc78) + %4 = tt.addptr %3, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc78) + %5 = arith.truncf %tmp34 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc79) + tt.store %4, %5 : tensor<1024x!tt.ptr> loc(#loc79) + tt.return loc(#loc80) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":22:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":24:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":25:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":26:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:30) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:44) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:39) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:35) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:30) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:44) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:40) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:36) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:31) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:45) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:31) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:36) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:45) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":32:18) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":33:17) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":34:27) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":35:19) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":36:27) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":37:18) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:43) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:37) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:34) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:52) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:48) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:30) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:57) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:107) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":39:13) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":40:38) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":41:34) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":42:20) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":43:28) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":44:19) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:40) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:34) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:49) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:45) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:31) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:54) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:105) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":46:34) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":48:20) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":49:19) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":52:20) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:44) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:38) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:35) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:53) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:49) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:31) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:58) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:108) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":54:13) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":55:38) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":56:34) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:40) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:34) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:49) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:45) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:31) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:54) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:105) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":58:34) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":60:20) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":61:20) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":63:25) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":63:37) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:25) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:37) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:4) +#loc88 = loc("xnumel"(#loc1)) +#loc89 = loc("xoffset"(#loc2)) +#loc90 = loc("xoffset"(#loc3)) +#loc91 = loc("xindex"(#loc4)) +#loc92 = loc("xindex"(#loc5)) +#loc93 = loc("xmask"(#loc6)) +#loc94 = loc("x0"(#loc7)) +#loc95 = loc("x2"(#loc8)) +#loc96 = loc("x4"(#loc9)) +#loc97 = loc("tmp0"(#loc10)) +#loc98 = loc("tmp0"(#loc11)) +#loc99 = loc("tmp0"(#loc12)) +#loc100 = loc("tmp2"(#loc13)) +#loc101 = loc("tmp2"(#loc14)) +#loc102 = loc("tmp2"(#loc15)) +#loc103 = loc("tmp2"(#loc16)) +#loc104 = loc("tmp19"(#loc17)) +#loc105 = loc("tmp19"(#loc18)) +#loc106 = loc("tmp19"(#loc19)) +#loc107 = loc("tmp19"(#loc20)) +#loc108 = loc("tmp23"(#loc21)) +#loc109 = loc("tmp23"(#loc22)) +#loc110 = loc("tmp23"(#loc23)) +#loc111 = loc("tmp3"(#loc24)) +#loc112 = loc("tmp4"(#loc25)) +#loc113 = loc("tmp5"(#loc26)) +#loc114 = loc("tmp6"(#loc27)) +#loc115 = loc("tmp7"(#loc28)) +#loc116 = loc("tmp8"(#loc29)) +#loc117 = loc("tmp9"(#loc30)) +#loc118 = loc("tmp9"(#loc31)) +#loc119 = loc("tmp9"(#loc32)) +#loc120 = loc("tmp9"(#loc33)) +#loc121 = loc("tmp9"(#loc34)) +#loc122 = loc("tmp9"(#loc35)) +#loc123 = loc("tmp9"(#loc36)) +#loc124 = loc("tmp9"(#loc37)) +#loc125 = loc("tmp10"(#loc38)) +#loc126 = loc("tmp11"(#loc39)) +#loc127 = loc("tmp12"(#loc40)) +#loc128 = loc("tmp13"(#loc41)) +#loc129 = loc("tmp14"(#loc42)) +#loc130 = loc("tmp15"(#loc43)) +#loc131 = loc("tmp16"(#loc44)) +#loc132 = loc("tmp16"(#loc45)) +#loc133 = loc("tmp16"(#loc46)) +#loc134 = loc("tmp16"(#loc47)) +#loc135 = loc("tmp16"(#loc48)) +#loc136 = loc("tmp16"(#loc49)) +#loc137 = loc("tmp16"(#loc50)) +#loc138 = loc("tmp17"(#loc51)) +#loc139 = loc("tmp20"(#loc52)) +#loc140 = loc("tmp21"(#loc53)) +#loc141 = loc("tmp25"(#loc54)) +#loc142 = loc("tmp26"(#loc55)) +#loc143 = loc("tmp26"(#loc56)) +#loc144 = loc("tmp26"(#loc57)) +#loc145 = loc("tmp26"(#loc58)) +#loc146 = loc("tmp26"(#loc59)) +#loc147 = loc("tmp26"(#loc60)) +#loc148 = loc("tmp26"(#loc61)) +#loc149 = loc("tmp26"(#loc62)) +#loc150 = loc("tmp27"(#loc63)) +#loc151 = loc("tmp28"(#loc64)) +#loc152 = loc("tmp29"(#loc65)) +#loc153 = loc("tmp30"(#loc66)) +#loc154 = loc("tmp30"(#loc67)) +#loc155 = loc("tmp30"(#loc68)) +#loc156 = loc("tmp30"(#loc69)) +#loc157 = loc("tmp30"(#loc70)) +#loc158 = loc("tmp30"(#loc71)) +#loc159 = loc("tmp30"(#loc72)) +#loc160 = loc("tmp31"(#loc73)) +#loc161 = loc("tmp33"(#loc74)) +#loc162 = loc("tmp34"(#loc75)) diff --git a/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttgir b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..bbe4a8c447e36bdf1be8ed09d8a9bf9b0f38dc86 --- /dev/null +++ b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttgir @@ -0,0 +1,198 @@ +#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":18:0) +#loc59 = loc("in_ptr0"(#loc)) +#loc60 = loc("in_ptr1"(#loc)) +#loc61 = loc("in_ptr2"(#loc)) +#loc62 = loc("in_ptr3"(#loc)) +#loc63 = loc("out_ptr0"(#loc)) +#loc64 = loc("out_ptr1"(#loc)) +#loc65 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<1> : tensor<1024xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<1> : tensor<1024xi64, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<2> : tensor<1024xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<4096> : tensor<1024xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<128> : tensor<1024xi32, #blocked> loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<1024xbf16, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc66) + %xoffset_6 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc67) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc68) + %xindex_7 = tt.splat %xoffset_6 : i32 -> tensor<1024xi32, #blocked> loc(#loc69) + %xindex_8 = arith.addi %xindex_7, %xindex : tensor<1024xi32, #blocked> loc(#loc69) + %x0 = arith.remsi %xindex_8, %cst_3 : tensor<1024xi32, #blocked> loc(#loc70) + %x2 = arith.divsi %xindex_8, %cst_2 : tensor<1024xi32, #blocked> loc(#loc71) + %x4 = arith.divsi %xindex_8, %cst_3 : tensor<1024xi32, #blocked> loc(#loc72) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc73) + %tmp0_9 = tt.addptr %tmp0, %xindex_8 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc73) + %tmp0_10 = tt.load %tmp0_9 : tensor<1024x!tt.ptr, #blocked> loc(#loc74) + %tmp0_11 = arith.extf %tmp0_10 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc75) + %tmp2 = arith.muli %x2, %cst_3 : tensor<1024xi32, #blocked> loc(#loc76) + %tmp2_12 = arith.addi %x0, %tmp2 : tensor<1024xi32, #blocked> loc(#loc77) + %tmp2_13 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc78) + %tmp2_14 = tt.addptr %tmp2_13, %tmp2_12 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc78) + %tmp2_15 = tt.load %tmp2_14 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc79) + %tmp19 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc80) + %tmp19_16 = tt.addptr %tmp19, %tmp2_12 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc80) + %tmp19_17 = tt.load %tmp19_16 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc81) + %tmp23 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc82) + %tmp23_18 = tt.addptr %tmp23, %xindex_8 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc82) + %tmp23_19 = tt.load %tmp23_18 : tensor<1024x!tt.ptr, #blocked> loc(#loc83) + %tmp23_20 = arith.extf %tmp23_19 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc84) + %tmp3 = arith.mulf %tmp0_11, %tmp2_15 : tensor<1024xf32, #blocked> loc(#loc85) + %tmp4 = arith.remsi %xindex_8, %cst_1 : tensor<1024xi32, #blocked> loc(#loc86) + %tmp8 = arith.extsi %tmp4 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked> loc(#loc87) + %tmp8_21 = arith.cmpi slt, %tmp8, %cst_0 : tensor<1024xi64, #blocked> loc(#loc87) + %tmp9 = arith.divsi %x0, %cst_1 : tensor<1024xi32, #blocked> loc(#loc88) + %tmp9_22 = arith.muli %tmp9, %cst_1 : tensor<1024xi32, #blocked> loc(#loc89) + %tmp9_23 = arith.addi %tmp9_22, %cst : tensor<1024xi32, #blocked> loc(#loc90) + %tmp9_24 = arith.muli %x4, %cst_3 : tensor<1024xi32, #blocked> loc(#loc91) + %tmp9_25 = arith.addi %tmp9_23, %tmp9_24 : tensor<1024xi32, #blocked> loc(#loc92) + %tmp9_26 = tt.addptr %tmp0, %tmp9_25 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc93) + %tmp9_27 = tt.load %tmp9_26, %tmp8_21, %cst_4 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc94) + %tmp9_28 = arith.extf %tmp9_27 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc95) + %tmp10 = arith.subf %cst_5, %tmp9_28 : tensor<1024xf32, #blocked> loc(#loc96) + %tmp13 = arith.cmpi sge, %tmp8, %cst_0 : tensor<1024xi64, #blocked> loc(#loc97) + %tmp16 = arith.addi %tmp9_22, %tmp9_24 : tensor<1024xi32, #blocked> loc(#loc98) + %tmp16_29 = tt.addptr %tmp0, %tmp16 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc99) + %tmp16_30 = tt.load %tmp16_29, %tmp13, %cst_4 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc100) + %tmp16_31 = arith.extf %tmp16_30 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc101) + %tmp17 = arith.select %tmp8_21, %tmp10, %tmp16_31 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked> loc(#loc118) + %tmp20 = arith.mulf %tmp17, %tmp19_17 : tensor<1024xf32, #blocked> loc(#loc104) + %tmp21 = arith.addf %tmp3, %tmp20 : tensor<1024xf32, #blocked> loc(#loc105) + %tmp25 = arith.mulf %tmp23_20, %tmp2_15 : tensor<1024xf32, #blocked> loc(#loc106) + %tmp26 = tt.addptr %tmp23, %tmp9_25 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc107) + %tmp26_32 = tt.load %tmp26, %tmp8_21, %cst_4 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc108) + %tmp26_33 = arith.extf %tmp26_32 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc109) + %tmp27 = arith.subf %cst_5, %tmp26_33 : tensor<1024xf32, #blocked> loc(#loc110) + %tmp30 = tt.addptr %tmp23, %tmp16 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc111) + %tmp30_34 = tt.load %tmp30, %tmp13, %cst_4 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc112) + %tmp30_35 = arith.extf %tmp30_34 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc113) + %tmp31 = arith.select %tmp8_21, %tmp27, %tmp30_35 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked> loc(#loc119) + %tmp33 = arith.mulf %tmp31, %tmp19_17 : tensor<1024xf32, #blocked> loc(#loc116) + %tmp34 = arith.addf %tmp25, %tmp33 : tensor<1024xf32, #blocked> loc(#loc117) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc54) + %1 = tt.addptr %0, %xindex_8 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc54) + %2 = arith.truncf %tmp21 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> loc(#loc55) + tt.store %1, %2 : tensor<1024x!tt.ptr, #blocked> loc(#loc55) + %3 = tt.splat %out_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc56) + %4 = tt.addptr %3, %xindex_8 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc56) + %5 = arith.truncf %tmp34 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> loc(#loc57) + tt.store %4, %5 : tensor<1024x!tt.ptr, #blocked> loc(#loc57) + tt.return loc(#loc58) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":24:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":25:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":26:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:30) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:35) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:44) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:39) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:35) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:44) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:31) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:45) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:31) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:36) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:45) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":32:18) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":33:17) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":37:18) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:43) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:37) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:34) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:52) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:48) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:30) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:57) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:107) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":39:13) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":42:20) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:45) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:31) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:54) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:105) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":46:34) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":41:34) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":48:20) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":49:19) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":52:20) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:31) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:58) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:108) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":54:13) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:31) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:54) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:105) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":58:34) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":56:34) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":60:20) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":61:20) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":63:25) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":63:37) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:25) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:37) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:4) +#loc66 = loc("xoffset"(#loc2)) +#loc67 = loc("xoffset"(#loc3)) +#loc68 = loc("xindex"(#loc4)) +#loc69 = loc("xindex"(#loc5)) +#loc70 = loc("x0"(#loc6)) +#loc71 = loc("x2"(#loc7)) +#loc72 = loc("x4"(#loc8)) +#loc73 = loc("tmp0"(#loc9)) +#loc74 = loc("tmp0"(#loc10)) +#loc75 = loc("tmp0"(#loc11)) +#loc76 = loc("tmp2"(#loc12)) +#loc77 = loc("tmp2"(#loc13)) +#loc78 = loc("tmp2"(#loc14)) +#loc79 = loc("tmp2"(#loc15)) +#loc80 = loc("tmp19"(#loc16)) +#loc81 = loc("tmp19"(#loc17)) +#loc82 = loc("tmp23"(#loc18)) +#loc83 = loc("tmp23"(#loc19)) +#loc84 = loc("tmp23"(#loc20)) +#loc85 = loc("tmp3"(#loc21)) +#loc86 = loc("tmp4"(#loc22)) +#loc87 = loc("tmp8"(#loc23)) +#loc88 = loc("tmp9"(#loc24)) +#loc89 = loc("tmp9"(#loc25)) +#loc90 = loc("tmp9"(#loc26)) +#loc91 = loc("tmp9"(#loc27)) +#loc92 = loc("tmp9"(#loc28)) +#loc93 = loc("tmp9"(#loc29)) +#loc94 = loc("tmp9"(#loc30)) +#loc95 = loc("tmp9"(#loc31)) +#loc96 = loc("tmp10"(#loc32)) +#loc97 = loc("tmp13"(#loc33)) +#loc98 = loc("tmp16"(#loc34)) +#loc99 = loc("tmp16"(#loc35)) +#loc100 = loc("tmp16"(#loc36)) +#loc101 = loc("tmp16"(#loc37)) +#loc102 = loc("tmp17"(#loc38)) +#loc103 = loc("tmp12"(#loc39)) +#loc104 = loc("tmp20"(#loc40)) +#loc105 = loc("tmp21"(#loc41)) +#loc106 = loc("tmp25"(#loc42)) +#loc107 = loc("tmp26"(#loc43)) +#loc108 = loc("tmp26"(#loc44)) +#loc109 = loc("tmp26"(#loc45)) +#loc110 = loc("tmp27"(#loc46)) +#loc111 = loc("tmp30"(#loc47)) +#loc112 = loc("tmp30"(#loc48)) +#loc113 = loc("tmp30"(#loc49)) +#loc114 = loc("tmp31"(#loc50)) +#loc115 = loc("tmp29"(#loc51)) +#loc116 = loc("tmp33"(#loc52)) +#loc117 = loc("tmp34"(#loc53)) +#loc118 = loc(fused[#loc102, #loc103]) +#loc119 = loc(fused[#loc114, #loc115]) diff --git a/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttir b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttir new file mode 100644 index 0000000000000000000000000000000000000000..cd4a7ecd31b77172537a9e78b74d70737000e896 --- /dev/null +++ b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttir @@ -0,0 +1,197 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":18:0) +#loc59 = loc("in_ptr0"(#loc)) +#loc60 = loc("in_ptr1"(#loc)) +#loc61 = loc("in_ptr2"(#loc)) +#loc62 = loc("in_ptr3"(#loc)) +#loc63 = loc("out_ptr0"(#loc)) +#loc64 = loc("out_ptr1"(#loc)) +#loc65 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<1024xbf16> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1024xi32> loc(#loc1) + %cst_2 = arith.constant dense<1> : tensor<1024xi64> loc(#loc1) + %cst_3 = arith.constant dense<2> : tensor<1024xi32> loc(#loc1) + %x2 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc66) + %cst_4 = arith.constant dense<128> : tensor<1024xi32> loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc67) + %xoffset_5 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc68) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc69) + %xindex_6 = tt.splat %xoffset_5 : i32 -> tensor<1024xi32> loc(#loc70) + %xindex_7 = arith.addi %xindex_6, %xindex : tensor<1024xi32> loc(#loc70) + %x0 = arith.remsi %xindex_7, %cst_4 : tensor<1024xi32> loc(#loc71) + %x2_8 = arith.divsi %xindex_7, %x2 : tensor<1024xi32> loc(#loc66) + %x4 = arith.divsi %xindex_7, %cst_4 : tensor<1024xi32> loc(#loc72) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc73) + %tmp0_9 = tt.addptr %tmp0, %xindex_7 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc73) + %tmp0_10 = tt.load %tmp0_9 : tensor<1024x!tt.ptr> loc(#loc74) + %tmp0_11 = arith.extf %tmp0_10 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc75) + %tmp2 = arith.muli %x2_8, %cst_4 : tensor<1024xi32> loc(#loc76) + %tmp2_12 = arith.addi %x0, %tmp2 : tensor<1024xi32> loc(#loc77) + %tmp2_13 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc78) + %tmp2_14 = tt.addptr %tmp2_13, %tmp2_12 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc78) + %tmp2_15 = tt.load %tmp2_14 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc79) + %tmp19 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc80) + %tmp19_16 = tt.addptr %tmp19, %tmp2_12 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc80) + %tmp19_17 = tt.load %tmp19_16 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc81) + %tmp23 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc82) + %tmp23_18 = tt.addptr %tmp23, %xindex_7 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc82) + %tmp23_19 = tt.load %tmp23_18 : tensor<1024x!tt.ptr> loc(#loc83) + %tmp23_20 = arith.extf %tmp23_19 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc84) + %tmp3 = arith.mulf %tmp0_11, %tmp2_15 : tensor<1024xf32> loc(#loc85) + %tmp4 = arith.remsi %xindex_7, %cst_3 : tensor<1024xi32> loc(#loc86) + %tmp8 = arith.extsi %tmp4 : tensor<1024xi32> to tensor<1024xi64> loc(#loc87) + %tmp8_21 = arith.cmpi slt, %tmp8, %cst_2 : tensor<1024xi64> loc(#loc87) + %tmp9 = arith.divsi %x0, %cst_3 : tensor<1024xi32> loc(#loc88) + %tmp9_22 = arith.muli %tmp9, %cst_3 : tensor<1024xi32> loc(#loc89) + %tmp9_23 = arith.addi %tmp9_22, %cst_1 : tensor<1024xi32> loc(#loc90) + %tmp9_24 = arith.muli %x4, %cst_4 : tensor<1024xi32> loc(#loc91) + %tmp9_25 = arith.addi %tmp9_23, %tmp9_24 : tensor<1024xi32> loc(#loc92) + %tmp9_26 = tt.addptr %tmp0, %tmp9_25 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc93) + %tmp9_27 = tt.load %tmp9_26, %tmp8_21, %cst evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc94) + %tmp9_28 = arith.extf %tmp9_27 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc95) + %tmp10 = arith.subf %cst_0, %tmp9_28 : tensor<1024xf32> loc(#loc96) + %tmp13 = arith.cmpi sge, %tmp8, %cst_2 : tensor<1024xi64> loc(#loc97) + %tmp16 = arith.addi %tmp9_22, %tmp9_24 : tensor<1024xi32> loc(#loc98) + %tmp16_29 = tt.addptr %tmp0, %tmp16 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc99) + %tmp16_30 = tt.load %tmp16_29, %tmp13, %cst evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc100) + %tmp16_31 = arith.extf %tmp16_30 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc101) + %tmp17 = arith.select %tmp8_21, %tmp10, %tmp16_31 : tensor<1024xi1>, tensor<1024xf32> loc(#loc118) + %tmp20 = arith.mulf %tmp17, %tmp19_17 : tensor<1024xf32> loc(#loc104) + %tmp21 = arith.addf %tmp3, %tmp20 : tensor<1024xf32> loc(#loc105) + %tmp25 = arith.mulf %tmp23_20, %tmp2_15 : tensor<1024xf32> loc(#loc106) + %tmp26 = tt.addptr %tmp23, %tmp9_25 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc107) + %tmp26_32 = tt.load %tmp26, %tmp8_21, %cst evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc108) + %tmp26_33 = arith.extf %tmp26_32 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc109) + %tmp27 = arith.subf %cst_0, %tmp26_33 : tensor<1024xf32> loc(#loc110) + %tmp30 = tt.addptr %tmp23, %tmp16 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc111) + %tmp30_34 = tt.load %tmp30, %tmp13, %cst evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc112) + %tmp30_35 = arith.extf %tmp30_34 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc113) + %tmp31 = arith.select %tmp8_21, %tmp27, %tmp30_35 : tensor<1024xi1>, tensor<1024xf32> loc(#loc119) + %tmp33 = arith.mulf %tmp31, %tmp19_17 : tensor<1024xf32> loc(#loc116) + %tmp34 = arith.addf %tmp25, %tmp33 : tensor<1024xf32> loc(#loc117) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc54) + %1 = tt.addptr %0, %xindex_7 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc54) + %2 = arith.truncf %tmp21 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc55) + tt.store %1, %2 : tensor<1024x!tt.ptr> loc(#loc55) + %3 = tt.splat %out_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc56) + %4 = tt.addptr %3, %xindex_7 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc56) + %5 = arith.truncf %tmp34 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc57) + tt.store %4, %5 : tensor<1024x!tt.ptr> loc(#loc57) + tt.return loc(#loc58) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":25:19) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":20:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":20:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":21:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":21:23) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":24:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":26:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:30) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:35) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:44) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:39) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:35) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:44) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:31) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:45) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:31) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:36) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:45) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":32:18) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":33:17) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":37:18) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:43) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:37) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:34) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:52) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:48) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:30) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:57) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:107) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":39:13) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":42:20) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:45) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:31) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:54) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:105) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":46:34) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":41:34) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":48:20) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":49:19) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":52:20) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:31) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:58) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:108) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":54:13) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:31) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:54) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:105) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":58:34) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":56:34) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":60:20) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":61:20) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":63:25) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":63:37) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:25) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:37) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:4) +#loc66 = loc("x2"(#loc2)) +#loc67 = loc("xoffset"(#loc3)) +#loc68 = loc("xoffset"(#loc4)) +#loc69 = loc("xindex"(#loc5)) +#loc70 = loc("xindex"(#loc6)) +#loc71 = loc("x0"(#loc7)) +#loc72 = loc("x4"(#loc8)) +#loc73 = loc("tmp0"(#loc9)) +#loc74 = loc("tmp0"(#loc10)) +#loc75 = loc("tmp0"(#loc11)) +#loc76 = loc("tmp2"(#loc12)) +#loc77 = loc("tmp2"(#loc13)) +#loc78 = loc("tmp2"(#loc14)) +#loc79 = loc("tmp2"(#loc15)) +#loc80 = loc("tmp19"(#loc16)) +#loc81 = loc("tmp19"(#loc17)) +#loc82 = loc("tmp23"(#loc18)) +#loc83 = loc("tmp23"(#loc19)) +#loc84 = loc("tmp23"(#loc20)) +#loc85 = loc("tmp3"(#loc21)) +#loc86 = loc("tmp4"(#loc22)) +#loc87 = loc("tmp8"(#loc23)) +#loc88 = loc("tmp9"(#loc24)) +#loc89 = loc("tmp9"(#loc25)) +#loc90 = loc("tmp9"(#loc26)) +#loc91 = loc("tmp9"(#loc27)) +#loc92 = loc("tmp9"(#loc28)) +#loc93 = loc("tmp9"(#loc29)) +#loc94 = loc("tmp9"(#loc30)) +#loc95 = loc("tmp9"(#loc31)) +#loc96 = loc("tmp10"(#loc32)) +#loc97 = loc("tmp13"(#loc33)) +#loc98 = loc("tmp16"(#loc34)) +#loc99 = loc("tmp16"(#loc35)) +#loc100 = loc("tmp16"(#loc36)) +#loc101 = loc("tmp16"(#loc37)) +#loc102 = loc("tmp17"(#loc38)) +#loc103 = loc("tmp12"(#loc39)) +#loc104 = loc("tmp20"(#loc40)) +#loc105 = loc("tmp21"(#loc41)) +#loc106 = loc("tmp25"(#loc42)) +#loc107 = loc("tmp26"(#loc43)) +#loc108 = loc("tmp26"(#loc44)) +#loc109 = loc("tmp26"(#loc45)) +#loc110 = loc("tmp27"(#loc46)) +#loc111 = loc("tmp30"(#loc47)) +#loc112 = loc("tmp30"(#loc48)) +#loc113 = loc("tmp30"(#loc49)) +#loc114 = loc("tmp31"(#loc50)) +#loc115 = loc("tmp29"(#loc51)) +#loc116 = loc("tmp33"(#loc52)) +#loc117 = loc("tmp34"(#loc53)) +#loc118 = loc(fused[#loc102, #loc103]) +#loc119 = loc(fused[#loc114, #loc115]) diff --git a/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/__grp__triton_poi_fused_clone_permute_2.json b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/__grp__triton_poi_fused_clone_permute_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cc46aaaa092b53462e08a66301f0317fdac25dee --- /dev/null +++ b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/__grp__triton_poi_fused_clone_permute_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_clone_permute_2.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.source", "triton_poi_fused_clone_permute_2.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.ttir", "triton_poi_fused_clone_permute_2.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.ttgir", "triton_poi_fused_clone_permute_2.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.llir", "triton_poi_fused_clone_permute_2.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.ptx", "triton_poi_fused_clone_permute_2.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.cubin", "triton_poi_fused_clone_permute_2.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.json"}} \ No newline at end of file diff --git a/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.cubin b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..acf9275f00eb73587ac18e978fe7d2b8ba03b621 Binary files /dev/null and b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.cubin differ diff --git a/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.json b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5266b1dfb39ff9c47ac5a70fca3cc7b5ade5cabc --- /dev/null +++ b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.json @@ -0,0 +1 @@ +{"hash": "7d52522a872e3d81bffc5c7b96ffbdbbe1a6b92e897033bf00204142efef4a53", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_clone_permute_2"} \ No newline at end of file diff --git a/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.llir b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..7a54ef9cf8343e9c92f1b0469c29b2a2bf75c9fb --- /dev/null +++ b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.llir @@ -0,0 +1,67 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_clone_permute_2(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %7 = shl i32 %6, 9, !dbg !8 + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %9 = shl nuw nsw i32 %8, 1, !dbg !9 + %10 = and i32 %9, 510, !dbg !9 + %11 = or disjoint i32 %10, %7, !dbg !10 + %12 = sdiv i32 %11, 128, !dbg !11 + %13 = mul i32 %12, 128, !dbg !12 + %.decomposed = sub i32 %11, %13, !dbg !12 + %14 = srem i32 %12, 32, !dbg !13 + %15 = sdiv i32 %11, 4096, !dbg !14 + %16 = shl nsw i32 %15, 7, !dbg !15 + %17 = add nsw i32 %16, %.decomposed, !dbg !16 + %18 = mul nsw i32 %14, 294912, !dbg !17 + %19 = add nsw i32 %17, %18, !dbg !18 + %20 = sext i32 %19 to i64, !dbg !19 + %21 = getelementptr bfloat, ptr addrspace(1) %0, i64 %20, !dbg !19 + %22 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %21) #2, !dbg !20 + %23 = sext i32 %11 to i64, !dbg !21 + %24 = getelementptr bfloat, ptr addrspace(1) %1, i64 %23, !dbg !21 + tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %22, ptr addrspace(1) %24) #2, !dbg !22 + ret void, !dbg !23 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_clone_permute_2", linkageName: "triton_poi_fused_clone_permute_2", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 21, scope: !4) +!12 = !DILocation(line: 23, column: 19, scope: !4) +!13 = !DILocation(line: 24, column: 28, scope: !4) +!14 = !DILocation(line: 25, column: 19, scope: !4) +!15 = !DILocation(line: 27, column: 39, scope: !4) +!16 = !DILocation(line: 27, column: 35, scope: !4) +!17 = !DILocation(line: 27, column: 51, scope: !4) +!18 = !DILocation(line: 27, column: 44, scope: !4) +!19 = !DILocation(line: 27, column: 30, scope: !4) +!20 = !DILocation(line: 27, column: 56, scope: !4) +!21 = !DILocation(line: 28, column: 25, scope: !4) +!22 = !DILocation(line: 28, column: 36, scope: !4) +!23 = !DILocation(line: 28, column: 4, scope: !4) diff --git a/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.ptx b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..10a6493b491e7870ac033b725875e9e9bad66953 --- /dev/null +++ b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.ptx @@ -0,0 +1,324 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused_clone_permute_2 // -- Begin function triton_poi_fused_clone_permute_2 + // @triton_poi_fused_clone_permute_2 +.visible .entry triton_poi_fused_clone_permute_2( + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_2_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_2_param_1, + .param .u32 triton_poi_fused_clone_permute_2_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_2_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_2_param_4 +) +.reqntid 256 +{ + .reg .b32 %r<24>; + .reg .b64 %rd<5>; + .loc 1 18 0 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:18:0 + +// %bb.0: + ld.param.b64 %rd3, [triton_poi_fused_clone_permute_2_param_0]; + ld.param.b64 %rd4, [triton_poi_fused_clone_permute_2_param_1]; +$L__tmp0: + .loc 1 20 28 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:20:28 + mov.u32 %r2, %ctaid.x; + .loc 1 20 33 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:20:33 + shl.b32 %r3, %r2, 9; + .loc 1 21 36 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:21:36 + mov.u32 %r4, %tid.x; + shl.b32 %r5, %r4, 1; + and.b32 %r6, %r5, 510; + .loc 1 21 23 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:21:23 + or.b32 %r7, %r6, %r3; + .loc 1 24 21 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:24:21 + bfe.s32 %r8, %r2, 22, 1; + shr.u32 %r9, %r8, 25; + add.s32 %r10, %r7, %r9; + shr.s32 %r11, %r10, 7; + .loc 1 23 19 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:23:19 + and.b32 %r12, %r10, -128; + sub.s32 %r13, %r7, %r12; + .loc 1 24 28 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:24:28 + shr.u32 %r14, %r11, 27; + add.s32 %r15, %r11, %r14; + and.b32 %r16, %r15, 131040; + sub.s32 %r17, %r11, %r16; + .loc 1 25 19 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:25:19 + shr.u32 %r18, %r8, 20; + add.s32 %r19, %r7, %r18; + shr.s32 %r20, %r19, 12; + .loc 1 27 39 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:27:39 + shl.b32 %r21, %r20, 7; + .loc 1 27 35 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:27:35 + add.s32 %r22, %r21, %r13; + .loc 1 27 44 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:27:44 + mad.lo.s32 %r23, %r17, 294912, %r22; + .loc 1 27 30 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:27:30 + mad.wide.s32 %rd1, %r23, 2, %rd3; + .loc 1 27 56 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:27:56 + // begin inline asm + mov.u32 %r1, 0x0; + ld.global.b32 { %r1 }, [ %rd1 + 0 ]; + // end inline asm + .loc 1 28 25 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:28:25 + mad.wide.s32 %rd2, %r7, 2, %rd4; + .loc 1 28 36 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:28:36 + // begin inline asm + st.global.b32 [ %rd2 + 0 ], { %r1 }; + // end inline asm + .loc 1 28 4 // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:28:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 106 +.b8 52 +.b8 55 +.b8 118 +.b8 122 +.b8 50 +.b8 117 +.b8 55 +.b8 105 +.b8 51 +.b8 116 +.b8 104 +.b8 53 +.b8 51 +.b8 99 +.b8 102 +.b8 50 +.b8 101 +.b8 108 +.b8 99 +.b8 53 +.b8 102 +.b8 105 +.b8 121 +.b8 108 +.b8 118 +.b8 121 +.b8 107 +.b8 55 +.b8 111 +.b8 51 +.b8 110 +.b8 105 +.b8 50 +.b8 112 +.b8 110 +.b8 52 +.b8 99 +.b8 50 +.b8 98 +.b8 100 +.b8 100 +.b8 114 +.b8 122 +.b8 113 +.b8 53 +.b8 106 +.b8 110 +.b8 117 +.b8 110 +.b8 113 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 106 +.b8 52 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.source b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.source new file mode 100644 index 0000000000000000000000000000000000000000..fb39d548a310881d8db46da35c1ed42ee766abb4 --- /dev/null +++ b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.source @@ -0,0 +1,90 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":18:0) +#loc21 = loc("in_ptr0"(#loc)) +#loc22 = loc("out_ptr0"(#loc)) +#loc23 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_clone_permute_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 9437184 : i32 loc(#loc24) + %xoffset = tt.get_program_id x : i32 loc(#loc25) + %xoffset_1 = arith.constant 512 : i32 loc(#loc26) + %xoffset_2 = arith.constant 512 : i32 loc(#loc26) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc26) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc27) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc28) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc28) + %xmask = arith.constant true loc(#loc29) + %xmask_6 = arith.constant dense : tensor<512xi1> loc(#loc29) + %x0 = arith.constant 128 : i32 loc(#loc30) + %x0_7 = arith.constant 128 : i32 loc(#loc30) + %x0_8 = arith.constant dense<128> : tensor<512xi32> loc(#loc30) + %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<512xi32> loc(#loc30) + %x1 = arith.constant 128 : i32 loc(#loc31) + %x1_10 = arith.constant 128 : i32 loc(#loc31) + %x1_11 = arith.constant dense<128> : tensor<512xi32> loc(#loc31) + %x1_12 = arith.divsi %xindex_5, %x1_11 : tensor<512xi32> loc(#loc31) + %x1_13 = arith.constant 32 : i32 loc(#loc32) + %x1_14 = arith.constant 32 : i32 loc(#loc32) + %x1_15 = arith.constant dense<32> : tensor<512xi32> loc(#loc32) + %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<512xi32> loc(#loc32) + %x2 = arith.constant 4096 : i32 loc(#loc33) + %x2_17 = arith.constant 4096 : i32 loc(#loc33) + %x2_18 = arith.constant dense<4096> : tensor<512xi32> loc(#loc33) + %x2_19 = arith.divsi %xindex_5, %x2_18 : tensor<512xi32> loc(#loc33) + %tmp0 = arith.constant 128 : i32 loc(#loc34) + %tmp0_20 = arith.constant 128 : i32 loc(#loc34) + %tmp0_21 = arith.constant dense<128> : tensor<512xi32> loc(#loc34) + %tmp0_22 = arith.muli %tmp0_21, %x2_19 : tensor<512xi32> loc(#loc34) + %tmp0_23 = arith.addi %x0_9, %tmp0_22 : tensor<512xi32> loc(#loc35) + %tmp0_24 = arith.constant 294912 : i32 loc(#loc36) + %tmp0_25 = arith.constant 294912 : i32 loc(#loc36) + %tmp0_26 = arith.constant dense<294912> : tensor<512xi32> loc(#loc36) + %tmp0_27 = arith.muli %tmp0_26, %x1_16 : tensor<512xi32> loc(#loc36) + %tmp0_28 = arith.addi %tmp0_23, %tmp0_27 : tensor<512xi32> loc(#loc37) + %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc38) + %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc38) + %tmp0_31 = tt.load %tmp0_30 : tensor<512x!tt.ptr> loc(#loc39) + %tmp0_32 = arith.extf %tmp0_31 : tensor<512xbf16> to tensor<512xf32> loc(#loc40) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc18) + %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc18) + %2 = arith.truncf %tmp0_32 : tensor<512xf32> to tensor<512xbf16> loc(#loc19) + tt.store %1, %2 : tensor<512x!tt.ptr> loc(#loc19) + tt.return loc(#loc20) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":22:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":23:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":24:21) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":24:28) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":25:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:39) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:35) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:51) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:44) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:30) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:56) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:65) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:25) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:36) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:4) +#loc24 = loc("xnumel"(#loc1)) +#loc25 = loc("xoffset"(#loc2)) +#loc26 = loc("xoffset"(#loc3)) +#loc27 = loc("xindex"(#loc4)) +#loc28 = loc("xindex"(#loc5)) +#loc29 = loc("xmask"(#loc6)) +#loc30 = loc("x0"(#loc7)) +#loc31 = loc("x1"(#loc8)) +#loc32 = loc("x1"(#loc9)) +#loc33 = loc("x2"(#loc10)) +#loc34 = loc("tmp0"(#loc11)) +#loc35 = loc("tmp0"(#loc12)) +#loc36 = loc("tmp0"(#loc13)) +#loc37 = loc("tmp0"(#loc14)) +#loc38 = loc("tmp0"(#loc15)) +#loc39 = loc("tmp0"(#loc16)) +#loc40 = loc("tmp0"(#loc17)) diff --git a/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.ttgir b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..cea7be2293112cec854e1647a9bf428d763ceef5 --- /dev/null +++ b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.ttgir @@ -0,0 +1,66 @@ +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":18:0) +#loc19 = loc("in_ptr0"(#loc)) +#loc20 = loc("out_ptr0"(#loc)) +#loc21 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_clone_permute_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<512xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<32> : tensor<512xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<512xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<294912> : tensor<512xi32, #blocked> loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc22) + %xoffset_3 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc23) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc24) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32, #blocked> loc(#loc25) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32, #blocked> loc(#loc25) + %x0 = arith.remsi %xindex_5, %cst : tensor<512xi32, #blocked> loc(#loc26) + %x1 = arith.divsi %xindex_5, %cst : tensor<512xi32, #blocked> loc(#loc27) + %x1_6 = arith.remsi %x1, %cst_0 : tensor<512xi32, #blocked> loc(#loc28) + %x2 = arith.divsi %xindex_5, %cst_1 : tensor<512xi32, #blocked> loc(#loc29) + %tmp0 = arith.muli %x2, %cst : tensor<512xi32, #blocked> loc(#loc30) + %tmp0_7 = arith.addi %x0, %tmp0 : tensor<512xi32, #blocked> loc(#loc31) + %tmp0_8 = arith.muli %x1_6, %cst_2 : tensor<512xi32, #blocked> loc(#loc32) + %tmp0_9 = arith.addi %tmp0_7, %tmp0_8 : tensor<512xi32, #blocked> loc(#loc33) + %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc34) + %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc34) + %tmp0_12 = tt.load %tmp0_11 : tensor<512x!tt.ptr, #blocked> loc(#loc35) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc16) + %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc16) + tt.store %1, %tmp0_12 : tensor<512x!tt.ptr, #blocked> loc(#loc17) + tt.return loc(#loc18) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":23:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":24:21) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":24:28) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":25:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:39) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:51) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:44) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:56) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:25) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:36) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:4) +#loc22 = loc("xoffset"(#loc2)) +#loc23 = loc("xoffset"(#loc3)) +#loc24 = loc("xindex"(#loc4)) +#loc25 = loc("xindex"(#loc5)) +#loc26 = loc("x0"(#loc6)) +#loc27 = loc("x1"(#loc7)) +#loc28 = loc("x1"(#loc8)) +#loc29 = loc("x2"(#loc9)) +#loc30 = loc("tmp0"(#loc10)) +#loc31 = loc("tmp0"(#loc11)) +#loc32 = loc("tmp0"(#loc12)) +#loc33 = loc("tmp0"(#loc13)) +#loc34 = loc("tmp0"(#loc14)) +#loc35 = loc("tmp0"(#loc15)) diff --git a/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.ttir b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..736e658ece89e46da1fc5785e06cb31d4b9ca20a --- /dev/null +++ b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.ttir @@ -0,0 +1,65 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":18:0) +#loc19 = loc("in_ptr0"(#loc)) +#loc20 = loc("out_ptr0"(#loc)) +#loc21 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_clone_permute_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %tmp0 = arith.constant dense<294912> : tensor<512xi32> loc(#loc22) + %x2 = arith.constant dense<4096> : tensor<512xi32> loc(#loc23) + %x1 = arith.constant dense<32> : tensor<512xi32> loc(#loc24) + %cst = arith.constant dense<128> : tensor<512xi32> loc(#loc4) + %c512_i32 = arith.constant 512 : i32 loc(#loc4) + %xoffset = tt.get_program_id x : i32 loc(#loc25) + %xoffset_0 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc26) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc27) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<512xi32> loc(#loc28) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<512xi32> loc(#loc28) + %x0 = arith.remsi %xindex_2, %cst : tensor<512xi32> loc(#loc29) + %x1_3 = arith.divsi %xindex_2, %cst : tensor<512xi32> loc(#loc30) + %x1_4 = arith.remsi %x1_3, %x1 : tensor<512xi32> loc(#loc24) + %x2_5 = arith.divsi %xindex_2, %x2 : tensor<512xi32> loc(#loc23) + %tmp0_6 = arith.muli %x2_5, %cst : tensor<512xi32> loc(#loc31) + %tmp0_7 = arith.addi %x0, %tmp0_6 : tensor<512xi32> loc(#loc32) + %tmp0_8 = arith.muli %x1_4, %tmp0 : tensor<512xi32> loc(#loc22) + %tmp0_9 = arith.addi %tmp0_7, %tmp0_8 : tensor<512xi32> loc(#loc33) + %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc34) + %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc34) + %tmp0_12 = tt.load %tmp0_11 : tensor<512x!tt.ptr> loc(#loc35) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc16) + %1 = tt.addptr %0, %xindex_2 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc16) + tt.store %1, %tmp0_12 : tensor<512x!tt.ptr> loc(#loc17) + tt.return loc(#loc18) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:51) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":25:19) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":24:28) +#loc4 = loc(unknown) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":20:28) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":20:33) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":21:36) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":21:23) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":23:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":24:21) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:39) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:35) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:44) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:56) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:25) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:36) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:4) +#loc22 = loc("tmp0"(#loc1)) +#loc23 = loc("x2"(#loc2)) +#loc24 = loc("x1"(#loc3)) +#loc25 = loc("xoffset"(#loc5)) +#loc26 = loc("xoffset"(#loc6)) +#loc27 = loc("xindex"(#loc7)) +#loc28 = loc("xindex"(#loc8)) +#loc29 = loc("x0"(#loc9)) +#loc30 = loc("x1"(#loc10)) +#loc31 = loc("tmp0"(#loc11)) +#loc32 = loc("tmp0"(#loc12)) +#loc33 = loc("tmp0"(#loc13)) +#loc34 = loc("tmp0"(#loc14)) +#loc35 = loc("tmp0"(#loc15)) diff --git a/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/__grp__triton_poi_fused_add_mul_0.json b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/__grp__triton_poi_fused_add_mul_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8413b80802ab7e798a5db50533dc5fd4a187d9c7 --- /dev/null +++ b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/__grp__triton_poi_fused_add_mul_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_add_mul_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.source", "triton_poi_fused_add_mul_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.ttir", "triton_poi_fused_add_mul_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.ttgir", "triton_poi_fused_add_mul_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.llir", "triton_poi_fused_add_mul_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.ptx", "triton_poi_fused_add_mul_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.cubin", "triton_poi_fused_add_mul_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.json"}} \ No newline at end of file diff --git a/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.cubin b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..0863a2a23203195e02d18b93847ac8b8854102e6 Binary files /dev/null and b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.cubin differ diff --git a/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.json b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.json new file mode 100644 index 0000000000000000000000000000000000000000..99f87dc7c7d91be60abc7f071d39cdb9a512fa18 --- /dev/null +++ b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.json @@ -0,0 +1 @@ +{"hash": "87608511e91c4ff29f3e10c2b60fb8e1a66a7b1f402cd2ad284c5a6707f12d79", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_add_mul_0"} \ No newline at end of file diff --git a/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.llir b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..e21e9f05be0ab5246db2ad7bf2f4d35695de041e --- /dev/null +++ b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.llir @@ -0,0 +1,118 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_add_mul_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = shl i32 %8, 10, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = shl nuw nsw i32 %10, 3, !dbg !9 + %12 = and i32 %11, 1016, !dbg !9 + %13 = or disjoint i32 %12, %9, !dbg !10 + %14 = srem i32 %13, 4096, !dbg !11 + %15 = sext i32 %13 to i64, !dbg !12 + %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !12 + %17 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %16) #2, !dbg !13 + %18 = extractvalue { i32, i32, i32, i32 } %17, 0, !dbg !13 + %19 = bitcast i32 %18 to <2 x bfloat>, !dbg !13 + %20 = extractvalue { i32, i32, i32, i32 } %17, 1, !dbg !13 + %21 = bitcast i32 %20 to <2 x bfloat>, !dbg !13 + %22 = extractvalue { i32, i32, i32, i32 } %17, 2, !dbg !13 + %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !13 + %24 = extractvalue { i32, i32, i32, i32 } %17, 3, !dbg !13 + %25 = bitcast i32 %24 to <2 x bfloat>, !dbg !13 + %26 = sext i32 %14 to i64, !dbg !14 + %27 = getelementptr bfloat, ptr addrspace(1) %1, i64 %26, !dbg !14 + %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !15 + %29 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ], $5;", "=r,=r,=r,=r,l,l"(ptr addrspace(1) %27, i64 %28) #2, !dbg !15 + %30 = extractvalue { i32, i32, i32, i32 } %29, 0, !dbg !15 + %31 = bitcast i32 %30 to <2 x bfloat>, !dbg !15 + %32 = extractvalue { i32, i32, i32, i32 } %29, 1, !dbg !15 + %33 = bitcast i32 %32 to <2 x bfloat>, !dbg !15 + %34 = extractvalue { i32, i32, i32, i32 } %29, 2, !dbg !15 + %35 = bitcast i32 %34 to <2 x bfloat>, !dbg !15 + %36 = extractvalue { i32, i32, i32, i32 } %29, 3, !dbg !15 + %37 = bitcast i32 %36 to <2 x bfloat>, !dbg !15 + %38 = getelementptr bfloat, ptr addrspace(1) %2, i64 %15, !dbg !16 + %39 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %38) #2, !dbg !17 + %40 = extractvalue { i32, i32, i32, i32 } %39, 0, !dbg !17 + %41 = bitcast i32 %40 to <2 x bfloat>, !dbg !17 + %42 = extractvalue { i32, i32, i32, i32 } %39, 1, !dbg !17 + %43 = bitcast i32 %42 to <2 x bfloat>, !dbg !17 + %44 = extractvalue { i32, i32, i32, i32 } %39, 2, !dbg !17 + %45 = bitcast i32 %44 to <2 x bfloat>, !dbg !17 + %46 = extractvalue { i32, i32, i32, i32 } %39, 3, !dbg !17 + %47 = bitcast i32 %46 to <2 x bfloat>, !dbg !17 + %48 = getelementptr bfloat, ptr addrspace(1) %3, i64 %15, !dbg !18 + %49 = fpext <2 x bfloat> %19 to <2 x float>, !dbg !19 + %50 = fpext <2 x bfloat> %31 to <2 x float>, !dbg !20 + %51 = fpext <2 x bfloat> %41 to <2 x float>, !dbg !21 + %52 = fmul <2 x float> %50, %51, !dbg !22 + %53 = fadd <2 x float> %52, %49, !dbg !23 + %54 = fptrunc <2 x float> %53 to <2 x bfloat>, !dbg !24 + %55 = fpext <2 x bfloat> %21 to <2 x float>, !dbg !19 + %56 = fpext <2 x bfloat> %33 to <2 x float>, !dbg !20 + %57 = fpext <2 x bfloat> %43 to <2 x float>, !dbg !21 + %58 = fmul <2 x float> %56, %57, !dbg !22 + %59 = fadd <2 x float> %58, %55, !dbg !23 + %60 = fptrunc <2 x float> %59 to <2 x bfloat>, !dbg !24 + %61 = fpext <2 x bfloat> %23 to <2 x float>, !dbg !19 + %62 = fpext <2 x bfloat> %35 to <2 x float>, !dbg !20 + %63 = fpext <2 x bfloat> %45 to <2 x float>, !dbg !21 + %64 = fmul <2 x float> %62, %63, !dbg !22 + %65 = fadd <2 x float> %64, %61, !dbg !23 + %66 = fptrunc <2 x float> %65 to <2 x bfloat>, !dbg !24 + %67 = fpext <2 x bfloat> %25 to <2 x float>, !dbg !19 + %68 = fpext <2 x bfloat> %37 to <2 x float>, !dbg !20 + %69 = fpext <2 x bfloat> %47 to <2 x float>, !dbg !21 + %70 = fmul <2 x float> %68, %69, !dbg !22 + %71 = fadd <2 x float> %70, %67, !dbg !23 + %72 = fptrunc <2 x float> %71 to <2 x bfloat>, !dbg !24 + %73 = bitcast <2 x bfloat> %54 to i32, !dbg !24 + %74 = bitcast <2 x bfloat> %60 to i32, !dbg !24 + %75 = bitcast <2 x bfloat> %66 to i32, !dbg !24 + %76 = bitcast <2 x bfloat> %72 to i32, !dbg !24 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %73, i32 %74, i32 %75, i32 %76, ptr addrspace(1) %48) #2, !dbg !24 + ret void, !dbg !25 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_add_mul_0", linkageName: "triton_poi_fused_add_mul_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 19, scope: !4) +!12 = !DILocation(line: 25, column: 30, scope: !4) +!13 = !DILocation(line: 25, column: 35, scope: !4) +!14 = !DILocation(line: 26, column: 30, scope: !4) +!15 = !DILocation(line: 26, column: 35, scope: !4) +!16 = !DILocation(line: 27, column: 30, scope: !4) +!17 = !DILocation(line: 27, column: 35, scope: !4) +!18 = !DILocation(line: 30, column: 25, scope: !4) +!19 = !DILocation(line: 25, column: 44, scope: !4) +!20 = !DILocation(line: 26, column: 74, scope: !4) +!21 = !DILocation(line: 27, column: 44, scope: !4) +!22 = !DILocation(line: 28, column: 18, scope: !4) +!23 = !DILocation(line: 29, column: 18, scope: !4) +!24 = !DILocation(line: 30, column: 36, scope: !4) +!25 = !DILocation(line: 30, column: 4, scope: !4) diff --git a/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.ptx b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..9fd68f9f34101b13ddf924239071fbcbf373d304 --- /dev/null +++ b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.ptx @@ -0,0 +1,407 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused_add_mul_0 // -- Begin function triton_poi_fused_add_mul_0 + // @triton_poi_fused_add_mul_0 +.visible .entry triton_poi_fused_add_mul_0( + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_3, + .param .u32 triton_poi_fused_add_mul_0_param_4, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_5, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_6 +) +.reqntid 128 +{ + .reg .b16 %rs<25>; + .reg .b32 %r<60>; + .reg .b64 %rd<11>; + .loc 1 18 0 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:18:0 + +// %bb.0: + ld.param.b64 %rd6, [triton_poi_fused_add_mul_0_param_0]; + ld.param.b64 %rd7, [triton_poi_fused_add_mul_0_param_1]; +$L__tmp0: + .loc 1 20 28 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:20:28 + mov.u32 %r17, %ctaid.x; + .loc 1 20 33 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:20:33 + shl.b32 %r18, %r17, 10; + ld.param.b64 %rd8, [triton_poi_fused_add_mul_0_param_2]; + ld.param.b64 %rd9, [triton_poi_fused_add_mul_0_param_3]; + .loc 1 21 36 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:21:36 + mov.u32 %r19, %tid.x; + shl.b32 %r20, %r19, 3; + and.b32 %r21, %r20, 1016; + .loc 1 21 23 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:21:23 + or.b32 %r22, %r21, %r18; + .loc 1 24 19 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:24:19 + bfe.s32 %r23, %r17, 21, 1; + shr.u32 %r24, %r23, 20; + add.s32 %r25, %r22, %r24; + and.b32 %r26, %r25, -4096; + sub.s32 %r27, %r22, %r26; + .loc 1 25 30 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:25:30 + mul.wide.s32 %rd10, %r22, 2; + add.s64 %rd1, %rd6, %rd10; + .loc 1 25 35 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:25:35 + // begin inline asm + mov.u32 %r1, 0x0; + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + ld.global.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ]; + // end inline asm + .loc 1 26 30 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:26:30 + mad.wide.s32 %rd2, %r27, 2, %rd7; + .loc 1 26 35 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:26:35 + // begin inline asm + mov.u64 %rd3, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd3, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r5, 0x0; + mov.u32 %r6, 0x0; + mov.u32 %r7, 0x0; + mov.u32 %r8, 0x0; + ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r5, %r6, %r7, %r8 }, [ %rd2 + 0 ], %rd3; + // end inline asm + .loc 1 27 30 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:27:30 + add.s64 %rd4, %rd8, %rd10; + .loc 1 27 35 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:27:35 + // begin inline asm + mov.u32 %r9, 0x0; + mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + mov.u32 %r12, 0x0; + ld.global.v4.b32 { %r9, %r10, %r11, %r12 }, [ %rd4 + 0 ]; + // end inline asm + .loc 1 30 25 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:30:25 + add.s64 %rd5, %rd9, %rd10; + .loc 1 25 44 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:25:44 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r28, %rs2; + cvt.f32.bf16 %r29, %rs1; + .loc 1 26 74 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:26:74 + mov.b32 {%rs3, %rs4}, %r5; + cvt.f32.bf16 %r30, %rs4; + cvt.f32.bf16 %r31, %rs3; + .loc 1 27 44 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:27:44 + mov.b32 {%rs5, %rs6}, %r9; + cvt.f32.bf16 %r32, %rs6; + cvt.f32.bf16 %r33, %rs5; + .loc 1 29 18 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:29:18 + fma.rn.f32 %r34, %r31, %r33, %r29; + fma.rn.f32 %r35, %r30, %r32, %r28; + .loc 1 30 36 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:30:36 + cvt.rn.bf16x2.f32 %r13, %r35, %r34; + .loc 1 25 44 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:25:44 + mov.b32 {%rs7, %rs8}, %r2; + cvt.f32.bf16 %r36, %rs8; + cvt.f32.bf16 %r37, %rs7; + .loc 1 26 74 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:26:74 + mov.b32 {%rs9, %rs10}, %r6; + cvt.f32.bf16 %r38, %rs10; + cvt.f32.bf16 %r39, %rs9; + .loc 1 27 44 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:27:44 + mov.b32 {%rs11, %rs12}, %r10; + cvt.f32.bf16 %r40, %rs12; + cvt.f32.bf16 %r41, %rs11; + .loc 1 29 18 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:29:18 + fma.rn.f32 %r42, %r39, %r41, %r37; + fma.rn.f32 %r43, %r38, %r40, %r36; + .loc 1 30 36 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:30:36 + cvt.rn.bf16x2.f32 %r14, %r43, %r42; + .loc 1 25 44 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:25:44 + mov.b32 {%rs13, %rs14}, %r3; + cvt.f32.bf16 %r44, %rs14; + cvt.f32.bf16 %r45, %rs13; + .loc 1 26 74 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:26:74 + mov.b32 {%rs15, %rs16}, %r7; + cvt.f32.bf16 %r46, %rs16; + cvt.f32.bf16 %r47, %rs15; + .loc 1 27 44 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:27:44 + mov.b32 {%rs17, %rs18}, %r11; + cvt.f32.bf16 %r48, %rs18; + cvt.f32.bf16 %r49, %rs17; + .loc 1 29 18 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:29:18 + fma.rn.f32 %r50, %r47, %r49, %r45; + fma.rn.f32 %r51, %r46, %r48, %r44; + .loc 1 30 36 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:30:36 + cvt.rn.bf16x2.f32 %r15, %r51, %r50; + .loc 1 25 44 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:25:44 + mov.b32 {%rs19, %rs20}, %r4; + cvt.f32.bf16 %r52, %rs20; + cvt.f32.bf16 %r53, %rs19; + .loc 1 26 74 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:26:74 + mov.b32 {%rs21, %rs22}, %r8; + cvt.f32.bf16 %r54, %rs22; + cvt.f32.bf16 %r55, %rs21; + .loc 1 27 44 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:27:44 + mov.b32 {%rs23, %rs24}, %r12; + cvt.f32.bf16 %r56, %rs24; + cvt.f32.bf16 %r57, %rs23; + .loc 1 29 18 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:29:18 + fma.rn.f32 %r58, %r55, %r57, %r53; + fma.rn.f32 %r59, %r54, %r56, %r52; + .loc 1 30 36 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:30:36 + cvt.rn.bf16x2.f32 %r16, %r59, %r58; + // begin inline asm + st.global.v4.b32 [ %rd5 + 0 ], { %r13, %r14, %r15, %r16 }; + // end inline asm + .loc 1 30 4 // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:30:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 54 +.b8 107 +.b8 97 +.b8 116 +.b8 53 +.b8 103 +.b8 55 +.b8 110 +.b8 51 +.b8 117 +.b8 117 +.b8 107 +.b8 107 +.b8 102 +.b8 119 +.b8 103 +.b8 100 +.b8 120 +.b8 102 +.b8 119 +.b8 116 +.b8 109 +.b8 120 +.b8 98 +.b8 108 +.b8 99 +.b8 109 +.b8 113 +.b8 122 +.b8 104 +.b8 98 +.b8 105 +.b8 102 +.b8 111 +.b8 53 +.b8 103 +.b8 51 +.b8 114 +.b8 98 +.b8 97 +.b8 122 +.b8 51 +.b8 100 +.b8 106 +.b8 120 +.b8 105 +.b8 105 +.b8 51 +.b8 53 +.b8 103 +.b8 105 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 54 +.b8 107 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.source b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.source new file mode 100644 index 0000000000000000000000000000000000000000..3e7bbcc0cc0df0c77b070bdcbdfe4f4907a1a357 --- /dev/null +++ b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.source @@ -0,0 +1,82 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":18:0) +#loc22 = loc("in_ptr0"(#loc)) +#loc23 = loc("in_ptr1"(#loc)) +#loc24 = loc("in_ptr2"(#loc)) +#loc25 = loc("out_ptr0"(#loc)) +#loc26 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_mul_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 1048576 : i32 loc(#loc27) + %xoffset = tt.get_program_id x : i32 loc(#loc28) + %xoffset_1 = arith.constant 1024 : i32 loc(#loc29) + %xoffset_2 = arith.constant 1024 : i32 loc(#loc29) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc29) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc30) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc31) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc31) + %xmask = arith.constant true loc(#loc32) + %xmask_6 = arith.constant dense : tensor<1024xi1> loc(#loc32) + %x0 = arith.constant 4096 : i32 loc(#loc33) + %x0_7 = arith.constant 4096 : i32 loc(#loc33) + %x0_8 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc33) + %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<1024xi32> loc(#loc33) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc34) + %tmp0_10 = tt.addptr %tmp0, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc34) + %tmp0_11 = tt.load %tmp0_10 : tensor<1024x!tt.ptr> loc(#loc35) + %tmp0_12 = arith.extf %tmp0_11 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc36) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc37) + %tmp1_13 = tt.addptr %tmp1, %x0_9 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc37) + %tmp1_14 = tt.load %tmp1_13 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc38) + %tmp1_15 = arith.extf %tmp1_14 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc39) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc40) + %tmp2_16 = tt.addptr %tmp2, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc40) + %tmp2_17 = tt.load %tmp2_16 : tensor<1024x!tt.ptr> loc(#loc41) + %tmp2_18 = arith.extf %tmp2_17 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc42) + %tmp3 = arith.mulf %tmp1_15, %tmp2_18 : tensor<1024xf32> loc(#loc43) + %tmp4 = arith.addf %tmp0_12, %tmp3 : tensor<1024xf32> loc(#loc44) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc19) + %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc19) + %2 = arith.truncf %tmp4 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc20) + tt.store %1, %2 : tensor<1024x!tt.ptr> loc(#loc20) + tt.return loc(#loc21) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":22:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":24:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:30) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:35) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:44) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:30) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:35) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:74) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:35) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:44) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":28:18) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":29:18) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:25) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:36) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:4) +#loc27 = loc("xnumel"(#loc1)) +#loc28 = loc("xoffset"(#loc2)) +#loc29 = loc("xoffset"(#loc3)) +#loc30 = loc("xindex"(#loc4)) +#loc31 = loc("xindex"(#loc5)) +#loc32 = loc("xmask"(#loc6)) +#loc33 = loc("x0"(#loc7)) +#loc34 = loc("tmp0"(#loc8)) +#loc35 = loc("tmp0"(#loc9)) +#loc36 = loc("tmp0"(#loc10)) +#loc37 = loc("tmp1"(#loc11)) +#loc38 = loc("tmp1"(#loc12)) +#loc39 = loc("tmp1"(#loc13)) +#loc40 = loc("tmp2"(#loc14)) +#loc41 = loc("tmp2"(#loc15)) +#loc42 = loc("tmp2"(#loc16)) +#loc43 = loc("tmp3"(#loc17)) +#loc44 = loc("tmp4"(#loc18)) diff --git a/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.ttgir b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..c149c43803247cd01a3d873dc50a8fa2c6dc9d16 --- /dev/null +++ b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.ttgir @@ -0,0 +1,74 @@ +#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":18:0) +#loc21 = loc("in_ptr0"(#loc)) +#loc22 = loc("in_ptr1"(#loc)) +#loc23 = loc("in_ptr2"(#loc)) +#loc24 = loc("out_ptr0"(#loc)) +#loc25 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_add_mul_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4096> : tensor<1024xi32, #blocked> loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc26) + %xoffset_0 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc27) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc28) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<1024xi32, #blocked> loc(#loc29) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<1024xi32, #blocked> loc(#loc29) + %x0 = arith.remsi %xindex_2, %cst : tensor<1024xi32, #blocked> loc(#loc30) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc31) + %tmp0_3 = tt.addptr %tmp0, %xindex_2 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc31) + %tmp0_4 = tt.load %tmp0_3 : tensor<1024x!tt.ptr, #blocked> loc(#loc32) + %tmp0_5 = arith.extf %tmp0_4 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc33) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc34) + %tmp1_6 = tt.addptr %tmp1, %x0 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc34) + %tmp1_7 = tt.load %tmp1_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc35) + %tmp1_8 = arith.extf %tmp1_7 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc36) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc37) + %tmp2_9 = tt.addptr %tmp2, %xindex_2 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc37) + %tmp2_10 = tt.load %tmp2_9 : tensor<1024x!tt.ptr, #blocked> loc(#loc38) + %tmp2_11 = arith.extf %tmp2_10 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc39) + %tmp3 = arith.mulf %tmp1_8, %tmp2_11 : tensor<1024xf32, #blocked> loc(#loc40) + %tmp4 = arith.addf %tmp0_5, %tmp3 : tensor<1024xf32, #blocked> loc(#loc41) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc18) + %1 = tt.addptr %0, %xindex_2 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc18) + %2 = arith.truncf %tmp4 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> loc(#loc19) + tt.store %1, %2 : tensor<1024x!tt.ptr, #blocked> loc(#loc19) + tt.return loc(#loc20) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":24:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:30) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:35) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:44) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:30) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:74) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:30) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:35) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:44) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":28:18) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":29:18) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:25) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:36) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:4) +#loc26 = loc("xoffset"(#loc2)) +#loc27 = loc("xoffset"(#loc3)) +#loc28 = loc("xindex"(#loc4)) +#loc29 = loc("xindex"(#loc5)) +#loc30 = loc("x0"(#loc6)) +#loc31 = loc("tmp0"(#loc7)) +#loc32 = loc("tmp0"(#loc8)) +#loc33 = loc("tmp0"(#loc9)) +#loc34 = loc("tmp1"(#loc10)) +#loc35 = loc("tmp1"(#loc11)) +#loc36 = loc("tmp1"(#loc12)) +#loc37 = loc("tmp2"(#loc13)) +#loc38 = loc("tmp2"(#loc14)) +#loc39 = loc("tmp2"(#loc15)) +#loc40 = loc("tmp3"(#loc16)) +#loc41 = loc("tmp4"(#loc17)) diff --git a/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.ttir b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..87847c2d4162a9a076fe808d5eba56a06e2e5cef --- /dev/null +++ b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.ttir @@ -0,0 +1,73 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":18:0) +#loc21 = loc("in_ptr0"(#loc)) +#loc22 = loc("in_ptr1"(#loc)) +#loc23 = loc("in_ptr2"(#loc)) +#loc24 = loc("out_ptr0"(#loc)) +#loc25 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_mul_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %x0 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc26) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc2) + %xoffset = tt.get_program_id x : i32 loc(#loc27) + %xoffset_0 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc28) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc29) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<1024xi32> loc(#loc30) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<1024xi32> loc(#loc30) + %x0_3 = arith.remsi %xindex_2, %x0 : tensor<1024xi32> loc(#loc26) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc31) + %tmp0_4 = tt.addptr %tmp0, %xindex_2 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc31) + %tmp0_5 = tt.load %tmp0_4 : tensor<1024x!tt.ptr> loc(#loc32) + %tmp0_6 = arith.extf %tmp0_5 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc33) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc34) + %tmp1_7 = tt.addptr %tmp1, %x0_3 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc34) + %tmp1_8 = tt.load %tmp1_7 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc35) + %tmp1_9 = arith.extf %tmp1_8 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc36) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc37) + %tmp2_10 = tt.addptr %tmp2, %xindex_2 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc37) + %tmp2_11 = tt.load %tmp2_10 : tensor<1024x!tt.ptr> loc(#loc38) + %tmp2_12 = arith.extf %tmp2_11 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc39) + %tmp3 = arith.mulf %tmp1_9, %tmp2_12 : tensor<1024xf32> loc(#loc40) + %tmp4 = arith.addf %tmp0_6, %tmp3 : tensor<1024xf32> loc(#loc41) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc18) + %1 = tt.addptr %0, %xindex_2 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc18) + %2 = arith.truncf %tmp4 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc19) + tt.store %1, %2 : tensor<1024x!tt.ptr> loc(#loc19) + tt.return loc(#loc20) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":24:19) +#loc2 = loc(unknown) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":20:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":20:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":21:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":21:23) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:30) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:35) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:44) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:30) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:74) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:30) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:35) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:44) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":28:18) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":29:18) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:25) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:36) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:4) +#loc26 = loc("x0"(#loc1)) +#loc27 = loc("xoffset"(#loc3)) +#loc28 = loc("xoffset"(#loc4)) +#loc29 = loc("xindex"(#loc5)) +#loc30 = loc("xindex"(#loc6)) +#loc31 = loc("tmp0"(#loc7)) +#loc32 = loc("tmp0"(#loc8)) +#loc33 = loc("tmp0"(#loc9)) +#loc34 = loc("tmp1"(#loc10)) +#loc35 = loc("tmp1"(#loc11)) +#loc36 = loc("tmp1"(#loc12)) +#loc37 = loc("tmp2"(#loc13)) +#loc38 = loc("tmp2"(#loc14)) +#loc39 = loc("tmp2"(#loc15)) +#loc40 = loc("tmp3"(#loc16)) +#loc41 = loc("tmp4"(#loc17)) diff --git a/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/__grp__triton_red_fused__fused_rms_norm_view_1.json b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/__grp__triton_red_fused__fused_rms_norm_view_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9716ac7f9aad2578a333078f8934c90f7a3dd29b --- /dev/null +++ b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/__grp__triton_red_fused__fused_rms_norm_view_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__fused_rms_norm_view_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.source", "triton_red_fused__fused_rms_norm_view_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.ttir", "triton_red_fused__fused_rms_norm_view_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.ttgir", "triton_red_fused__fused_rms_norm_view_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.llir", "triton_red_fused__fused_rms_norm_view_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.ptx", "triton_red_fused__fused_rms_norm_view_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.cubin", "triton_red_fused__fused_rms_norm_view_1.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.json"}} \ No newline at end of file diff --git a/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.cubin b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..dede282231445e1180b670527f652c18516baedf Binary files /dev/null and b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.cubin differ diff --git a/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.json b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.json new file mode 100644 index 0000000000000000000000000000000000000000..af94cda9a80836f03ab87953ac57ba68d1683503 --- /dev/null +++ b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.json @@ -0,0 +1 @@ +{"hash": "8eb0e13f8d9abdd03dfdb6d2c29622061c8e0466fe03e2d95171b0862bb0ab68", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm_view_1"} \ No newline at end of file diff --git a/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.llir b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..d946385b57a31921e8e0863d5e3fef64e2034104 --- /dev/null +++ b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.llir @@ -0,0 +1,120 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__fused_rms_norm_view_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = shl i32 %7, 6, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = and i32 %9, 252, !dbg !9 + %11 = lshr exact i32 %10, 2, !dbg !9 + %12 = or disjoint i32 %11, %8, !dbg !10 + %13 = and i32 %9, 3, !dbg !11 + %14 = sdiv i32 %12, 32, !dbg !12 + %15 = mul i32 %14, 32, !dbg !13 + %.decomposed = sub i32 %12, %15, !dbg !13 + %16 = shl nsw i32 %.decomposed, 7, !dbg !14 + %17 = mul i32 %14, 12288, !dbg !15 + %18 = or disjoint i32 %16, %13 + %19 = add i32 %18, %17 + br label %20, !dbg !16 + +20: ; preds = %6, %20 + %indvars.iv = phi i64 [ 0, %6 ], [ %indvars.iv.next, %20 ] + %21 = phi float [ 0.000000e+00, %6 ], [ %31, %20 ] + %22 = trunc nuw nsw i64 %indvars.iv to i32, !dbg !17 + %23 = add i32 %19, %22, !dbg !17 + %24 = sext i32 %23 to i64, !dbg !18 + %25 = getelementptr bfloat, ptr addrspace(1) %0, i64 %24, !dbg !18 + %26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %27 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %25, i64 %26, i1 true) #4, !dbg !19 + %28 = bitcast i16 %27 to bfloat, !dbg !19 + %29 = fpext bfloat %28 to float, !dbg !20 + %30 = fmul float %29, %29, !dbg !21 + %31 = fadd float %21, %30, !dbg !22 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4, !dbg !16 + %32 = icmp samesign ult i64 %indvars.iv, 124, !dbg !16 + br i1 %32, label %20, label %33, !dbg !16 + +33: ; preds = %20 + %34 = and i32 %9, 63, !dbg !9 + %35 = or disjoint i32 %8, %34, !dbg !10 + %36 = bitcast float %31 to i32, !dbg !23 + %37 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %36, i32 2, i32 31), !dbg !23 + %38 = bitcast i32 %37 to float, !dbg !23 + %39 = fadd float %31, %38, !dbg !28 + %40 = bitcast float %39 to i32, !dbg !23 + %41 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %40, i32 1, i32 31), !dbg !23 + %42 = bitcast i32 %41 to float, !dbg !23 + %43 = fadd float %39, %42, !dbg !28 + %44 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %10, !dbg !29 + store float %43, ptr addrspace(3) %44, align 4, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !29 + %45 = shl nuw nsw i32 %34, 2, !dbg !29 + %46 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %45, !dbg !29 + %47 = load i32, ptr addrspace(3) %46, align 4, !dbg !29 + %48 = sext i32 %35 to i64, !dbg !30 + %49 = getelementptr float, ptr addrspace(1) %1, i64 %48, !dbg !30 + %50 = and i32 %9, 192, !dbg !31 + %51 = icmp eq i32 %50, 0, !dbg !31 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %47, ptr addrspace(1) %49, i1 %51) #4, !dbg !31 + ret void, !dbg !32 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm_view_1", linkageName: "triton_red_fused__fused_rms_norm_view_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 19, scope: !4) +!13 = !DILocation(line: 28, column: 19, scope: !4) +!14 = !DILocation(line: 38, column: 45, scope: !4) +!15 = !DILocation(line: 38, column: 56, scope: !4) +!16 = !DILocation(line: 32, column: 43, scope: !4) +!17 = !DILocation(line: 38, column: 50, scope: !4) +!18 = !DILocation(line: 38, column: 34, scope: !4) +!19 = !DILocation(line: 38, column: 61, scope: !4) +!20 = !DILocation(line: 38, column: 115, scope: !4) +!21 = !DILocation(line: 40, column: 22, scope: !4) +!22 = !DILocation(line: 42, column: 23, scope: !4) +!23 = !DILocation(line: 293, column: 36, scope: !24, inlinedAt: !26) +!24 = distinct !DILexicalBlockFile(scope: !4, file: !25, discriminator: 0) +!25 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!26 = !DILocation(line: 44, column: 25, scope: !27) +!27 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0) +!28 = !DILocation(line: 263, column: 15, scope: !24, inlinedAt: !23) +!29 = !DILocation(line: 44, column: 28, scope: !4) +!30 = !DILocation(line: 45, column: 25, scope: !4) +!31 = !DILocation(line: 45, column: 36, scope: !4) +!32 = !DILocation(line: 45, column: 4, scope: !4) diff --git a/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.ptx b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..79cacab3f6a802e4ef9e061c648ccead776423a7 --- /dev/null +++ b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.ptx @@ -0,0 +1,486 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused__fused_rms_norm_view_1 // -- Begin function triton_red_fused__fused_rms_norm_view_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__fused_rms_norm_view_1 +.visible .entry triton_red_fused__fused_rms_norm_view_1( + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_1, + .param .u32 triton_red_fused__fused_rms_norm_view_1_param_2, + .param .u32 triton_red_fused__fused_rms_norm_view_1_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_5 +) +.reqntid 256 +{ + .reg .pred %p<4>; + .reg .b16 %rs<3>; + .reg .b32 %r<33>; + .reg .b64 %rd<9>; + .loc 1 18 0 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:18:0 + +// %bb.0: + ld.param.b64 %rd3, [triton_red_fused__fused_rms_norm_view_1_param_1]; + ld.param.b64 %rd2, [triton_red_fused__fused_rms_norm_view_1_param_0]; +$L__tmp0: + .loc 1 23 28 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:23:28 + mov.u32 %r4, %ctaid.x; + .loc 1 23 33 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:23:33 + shl.b32 %r1, %r4, 6; + .loc 1 24 44 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:44 + mov.u32 %r2, %tid.x; + and.b32 %r3, %r2, 252; + bfe.u32 %r5, %r2, 2, 6; + .loc 1 24 23 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:23 + or.b32 %r6, %r5, %r1; + .loc 1 26 37 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:26:37 + and.b32 %r7, %r2, 3; + .loc 1 29 19 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:29:19 + bfe.s32 %r8, %r4, 25, 1; + shr.u32 %r9, %r8, 27; + add.s32 %r10, %r6, %r9; + shr.u32 %r11, %r10, 5; + .loc 1 32 43 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:32:43 + add.s32 %r12, %r4, %r11; + shl.b32 %r13, %r12, 13; + shl.b32 %r14, %r5, 7; + or.b32 %r15, %r13, %r14; + or.b32 %r16, %r15, %r7; + cvt.u64.u32 %rd1, %r16; + mov.b32 %r32, 0f00000000; + mov.b64 %rd8, -4; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 38 34 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:34 + add.s64 %rd6, %rd1, %rd8; + cvt.u32.u64 %r17, %rd6; + add.s32 %r18, %r17, 4; + mad.wide.s32 %rd5, %r18, 2, %rd2; + .loc 1 38 61 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:61 + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd4, 1.0; + // end inline asm + mov.b16 %rs2, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u16 %rs1, %rs2; + @%p1 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs1 }, [ %rd5 + 0 ], %rd4; + // end inline asm + .loc 1 38 115 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:115 + cvt.f32.bf16 %r19, %rs1; + .loc 1 42 23 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:42:23 + fma.rn.f32 %r32, %r19, %r19, %r32; + .loc 1 32 43 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:32:43 + add.s64 %rd8, %rd8, 4; + setp.lt.u64 %p2, %rd8, 124; + @%p2 bra $L__BB0_1; +// %bb.2: + .loc 1 24 44 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:44 + and.b32 %r21, %r2, 63; + .loc 1 24 23 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:23 + or.b32 %r22, %r1, %r21; +$L__tmp1: + .loc 2 293 36 // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] + shfl.sync.bfly.b32 %r23, %r32, 2, 31, -1; +$L__tmp2: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + add.f32 %r24, %r32, %r23; +$L__tmp3: + .loc 2 293 36 // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] + shfl.sync.bfly.b32 %r25, %r24, 1, 31, -1; +$L__tmp4: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + add.f32 %r26, %r24, %r25; +$L__tmp5: + .loc 1 44 28 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:28 + mov.b32 %r27, global_smem; + add.s32 %r28, %r27, %r3; + st.shared.b32 [%r28], %r26; + bar.sync 0; + shl.b32 %r29, %r21, 2; + add.s32 %r30, %r27, %r29; + ld.shared.b32 %r20, [%r30]; + .loc 1 45 25 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:25 + mad.wide.s32 %rd7, %r22, 4, %rd3; + .loc 1 45 36 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:36 + and.b32 %r31, %r2, 192; + setp.eq.b32 %p3, %r31, 0; + // begin inline asm + @%p3 st.global.b32 [ %rd7 + 0 ], { %r20 }; + // end inline asm + .loc 1 45 4 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:4 + ret; +$L__tmp6: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 339 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x14c DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 113 +.b8 105 +.b8 116 +.b8 120 +.b8 53 +.b8 104 +.b8 119 +.b8 117 +.b8 112 +.b8 107 +.b8 98 +.b8 106 +.b8 109 +.b8 99 +.b8 115 +.b8 111 +.b8 121 +.b8 107 +.b8 113 +.b8 101 +.b8 112 +.b8 122 +.b8 113 +.b8 99 +.b8 55 +.b8 122 +.b8 99 +.b8 120 +.b8 106 +.b8 99 +.b8 98 +.b8 53 +.b8 97 +.b8 99 +.b8 113 +.b8 107 +.b8 105 +.b8 55 +.b8 122 +.b8 99 +.b8 115 +.b8 106 +.b8 105 +.b8 102 +.b8 114 +.b8 110 +.b8 114 +.b8 122 +.b8 99 +.b8 114 +.b8 114 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 113 +.b8 105 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x2a DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 114 +.b8 109 +.b8 115 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x10e:0x48 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x123:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp5 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 44 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x13b:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp2 // DW_AT_low_pc +.b64 $L__tmp5 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.source b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.source new file mode 100644 index 0000000000000000000000000000000000000000..668c29aaa5214a312af10fa56f2b302a3dedf3b5 --- /dev/null +++ b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.source @@ -0,0 +1,167 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0) +#loc35 = loc(unknown) +#loc38 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0) +#loc42 = loc("in_ptr0"(#loc)) +#loc43 = loc("out_ptr0"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc74 = loc("input"(#loc33)) +#loc75 = loc("a"(#loc38)) +#loc76 = loc("b"(#loc38)) +module { + tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 65536 : i32 loc(#loc46) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc47) + %xoffset = tt.get_program_id x : i32 loc(#loc48) + %xoffset_2 = arith.constant 64 : i32 loc(#loc49) + %xoffset_3 = arith.constant 64 : i32 loc(#loc49) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc49) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc50) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc51) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc52) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc52) + %xmask = arith.constant true loc(#loc53) + %xmask_8 = arith.constant dense : tensor<64x4xi1> loc(#loc53) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc54) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc55) + %x0 = arith.constant 32 : i32 loc(#loc56) + %x0_10 = arith.constant 32 : i32 loc(#loc56) + %x0_11 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc56) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc56) + %x1 = arith.constant 32 : i32 loc(#loc57) + %x1_13 = arith.constant 32 : i32 loc(#loc57) + %x1_14 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc57) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc57) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc58) + %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc58) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c4_i32 = arith.constant 4 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c4_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_19 = %_tmp4_16) -> (tensor<64x4xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc60) + %r0_index_20 = arith.addi %r0_index, %r0_base_9 : tensor<1x4xi32> loc(#loc60) + %r0_mask = arith.constant dense<128> : tensor<1x4xi32> loc(#loc61) + %r0_mask_21 = arith.cmpi slt, %r0_index_20, %r0_mask : tensor<1x4xi32> loc(#loc61) + %tmp0 = arith.constant 128 : i32 loc(#loc62) + %tmp0_22 = arith.constant 128 : i32 loc(#loc62) + %tmp0_23 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc62) + %tmp0_24 = arith.muli %tmp0_23, %x0_12 : tensor<64x1xi32> loc(#loc62) + %tmp0_25 = tt.broadcast %r0_index_20 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc63) + %tmp0_26 = tt.broadcast %tmp0_24 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc63) + %tmp0_27 = arith.addi %tmp0_25, %tmp0_26 : tensor<64x4xi32> loc(#loc63) + %tmp0_28 = arith.constant 12288 : i32 loc(#loc64) + %tmp0_29 = arith.constant 12288 : i32 loc(#loc64) + %tmp0_30 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc64) + %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<64x1xi32> loc(#loc64) + %tmp0_32 = tt.broadcast %tmp0_31 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc65) + %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<64x4xi32> loc(#loc65) + %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc66) + %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc66) + %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc67) + %tmp0_37 = tt.broadcast %r0_mask_21 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc67) + %tmp0_38 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc67) + %tmp0_39 = arith.truncf %tmp0_38 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc67) + %tmp0_40 = tt.load %tmp0_35, %tmp0_37, %tmp0_39 evictionPolicy = evict_first : tensor<64x4x!tt.ptr> loc(#loc67) + %tmp0_41 = arith.extf %tmp0_40 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc68) + %tmp2 = arith.mulf %tmp0_41, %tmp0_41 : tensor<64x4xf32> loc(#loc69) + %tmp5 = arith.addf %_tmp4_19, %tmp2 : tensor<64x4xf32> loc(#loc70) + %_tmp4_42 = tt.broadcast %r0_mask_21 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc71) + %_tmp4_43 = arith.select %_tmp4_42, %tmp5, %_tmp4_19 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc71) + scf.yield %_tmp4_43 : tensor<64x4xf32> loc(#loc27) + } loc(#loc59) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc72) + %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc73) + %4 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc30) + %5 = tt.addptr %4, %xindex_7 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc30) + tt.store %5, %tmp4_18 : tensor<64x1x!tt.ptr> loc(#loc31) + tt.return loc(#loc32) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S64_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x4xf32> loc("input"(#loc33))) -> tensor<64xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc34) + tt.reduce.return %2 : f32 loc(#loc34) + }) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc34) + tt.return %0 : tensor<64xf32> loc(#loc36) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64xf32> loc(#loc37) + tt.return %1 : tensor<64xf32> loc(#loc37) + } loc(#loc33) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc38)), %b: f32 loc("b"(#loc38))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc39) + tt.return %0 : f32 loc(#loc40) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc41) + tt.return %1 : f32 loc(#loc41) + } loc(#loc38) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":25:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":30:43) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:8) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11) +#loc37 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4) +#loc39 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc40 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11) +#loc41 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4) +#loc46 = loc("xnumel"(#loc1)) +#loc47 = loc("r0_numel"(#loc2)) +#loc48 = loc("xoffset"(#loc3)) +#loc49 = loc("xoffset"(#loc4)) +#loc50 = loc("xindex"(#loc5)) +#loc51 = loc("xindex"(#loc6)) +#loc52 = loc("xindex"(#loc7)) +#loc53 = loc("xmask"(#loc8)) +#loc54 = loc("r0_base"(#loc9)) +#loc55 = loc("r0_base"(#loc10)) +#loc56 = loc("x0"(#loc11)) +#loc57 = loc("x1"(#loc12)) +#loc58 = loc("_tmp4"(#loc13)) +#loc59 = loc("_tmp4"(#loc14)) +#loc60 = loc("r0_index"(#loc15)) +#loc61 = loc("r0_mask"(#loc16)) +#loc62 = loc("tmp0"(#loc17)) +#loc63 = loc("tmp0"(#loc18)) +#loc64 = loc("tmp0"(#loc19)) +#loc65 = loc("tmp0"(#loc20)) +#loc66 = loc("tmp0"(#loc21)) +#loc67 = loc("tmp0"(#loc22)) +#loc68 = loc("tmp0"(#loc23)) +#loc69 = loc("tmp2"(#loc24)) +#loc70 = loc("tmp5"(#loc25)) +#loc71 = loc("_tmp4"(#loc26)) +#loc72 = loc("tmp4"(#loc28)) +#loc73 = loc("tmp4"(#loc29)) diff --git a/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.ttgir b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..f03f4df9b0f1c9928db9a993cc2d4e1f3aef89eb --- /dev/null +++ b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.ttgir @@ -0,0 +1,121 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 4], order = [0, 1]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0) +#loc1 = loc(unknown) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25) +#loc30 = loc("in_ptr0"(#loc)) +#loc31 = loc("out_ptr0"(#loc)) +#loc32 = loc("xnumel"(#loc)) +#loc33 = loc("r0_numel"(#loc)) +#loc54 = loc("tmp4"(#loc24)) +#loc57 = loc(callsite(#loc1 at #loc54)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<12288> : tensor<64x1xi32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x4xbf16, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<128> : tensor<1x4xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc34) + %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc35) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc36) + %xindex_6 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc36) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc36) + %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc36) + %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked> loc(#loc37) + %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc37) + %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<64x1xi32, #blocked> loc(#loc37) + %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<64x1xi32, #blocked1> loc(#loc37) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc38) + %r0_base_13 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4xi32, #blocked> loc(#loc38) + %x0 = arith.remsi %xindex_11, %cst : tensor<64x1xi32, #blocked> loc(#loc39) + %x1 = arith.divsi %xindex_11, %cst : tensor<64x1xi32, #blocked> loc(#loc40) + %tmp0 = arith.muli %x0, %cst_0 : tensor<64x1xi32, #blocked> loc(#loc41) + %tmp0_14 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc42) + %tmp0_15 = arith.muli %x1, %cst_1 : tensor<64x1xi32, #blocked> loc(#loc43) + %tmp0_16 = tt.broadcast %tmp0_15 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc44) + %tmp0_17 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr, #blocked> loc(#loc45) + %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32 iter_args(%_tmp4_20 = %cst_4) -> (tensor<64x4xf32, #blocked>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32, #blocked> loc(#loc47) + %r0_index_21 = arith.addi %r0_index, %r0_base_13 : tensor<1x4xi32, #blocked> loc(#loc47) + %r0_mask = arith.cmpi slt, %r0_index_21, %cst_3 : tensor<1x4xi32, #blocked> loc(#loc48) + %tmp0_22 = tt.broadcast %r0_index_21 : tensor<1x4xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc42) + %tmp0_23 = arith.addi %tmp0_22, %tmp0_14 : tensor<64x4xi32, #blocked> loc(#loc42) + %tmp0_24 = arith.addi %tmp0_23, %tmp0_16 : tensor<64x4xi32, #blocked> loc(#loc44) + %tmp0_25 = tt.addptr %tmp0_17, %tmp0_24 : tensor<64x4x!tt.ptr, #blocked>, tensor<64x4xi32, #blocked> loc(#loc45) + %tmp0_26 = tt.broadcast %r0_mask : tensor<1x4xi1, #blocked> -> tensor<64x4xi1, #blocked> loc(#loc49) + %tmp0_27 = tt.load %tmp0_25, %tmp0_26, %cst_2 evictionPolicy = evict_first : tensor<64x4x!tt.ptr, #blocked> loc(#loc49) + %tmp0_28 = arith.extf %tmp0_27 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc50) + %tmp2 = arith.mulf %tmp0_28, %tmp0_28 : tensor<64x4xf32, #blocked> loc(#loc51) + %tmp5 = arith.addf %_tmp4_20, %tmp2 : tensor<64x4xf32, #blocked> loc(#loc52) + %_tmp4_29 = arith.select %tmp0_26, %tmp5, %_tmp4_20 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> loc(#loc53) + scf.yield %_tmp4_29 : tensor<64x4xf32, #blocked> loc(#loc22) + } loc(#loc46) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_20: f32 loc(callsite(#loc1 at #loc54)), %tmp4_21: f32 loc(callsite(#loc1 at #loc54))): + %tmp4_22 = arith.addf %tmp4_20, %tmp4_21 : f32 loc(#loc58) + tt.reduce.return %tmp4_22 : f32 loc(#loc56) + }) : (tensor<64x4xf32, #blocked>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc56) + %tmp4_18 = ttg.convert_layout %tmp4 : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc55) + %tmp4_19 = tt.expand_dims %tmp4_18 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc55) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked1> loc(#loc27) + %1 = tt.addptr %0, %xindex_12 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc27) + tt.store %1, %tmp4_19 : tensor<64x1x!tt.ptr, #blocked1> loc(#loc28) + tt.return loc(#loc29) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:8) +#loc23 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4) +#loc34 = loc("xoffset"(#loc2)) +#loc35 = loc("xoffset"(#loc3)) +#loc36 = loc("xindex"(#loc4)) +#loc37 = loc("xindex"(#loc5)) +#loc38 = loc("r0_base"(#loc6)) +#loc39 = loc("x0"(#loc7)) +#loc40 = loc("x1"(#loc8)) +#loc41 = loc("tmp0"(#loc9)) +#loc42 = loc("tmp0"(#loc10)) +#loc43 = loc("tmp0"(#loc11)) +#loc44 = loc("tmp0"(#loc12)) +#loc45 = loc("tmp0"(#loc13)) +#loc46 = loc("_tmp4"(#loc14)) +#loc47 = loc("r0_index"(#loc15)) +#loc48 = loc("r0_mask"(#loc16)) +#loc49 = loc("tmp0"(#loc17)) +#loc50 = loc("tmp0"(#loc18)) +#loc51 = loc("tmp2"(#loc19)) +#loc52 = loc("tmp5"(#loc20)) +#loc53 = loc("_tmp4"(#loc21)) +#loc55 = loc("tmp4"(#loc26)) +#loc56 = loc(callsite(#loc23 at #loc54)) +#loc58 = loc(callsite(#loc25 at #loc56)) diff --git a/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.ttir b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..a17336b6cd6c0e1646710a7e673b9676c2869e0b --- /dev/null +++ b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.ttir @@ -0,0 +1,118 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0) +#loc1 = loc(unknown) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25) +#loc32 = loc("in_ptr0"(#loc)) +#loc33 = loc("out_ptr0"(#loc)) +#loc34 = loc("xnumel"(#loc)) +#loc35 = loc("r0_numel"(#loc)) +#loc58 = loc("tmp4"(#loc26)) +#loc61 = loc(callsite(#loc1 at #loc58)) +module { + tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<64x4xbf16> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc2) + %c128_i32 = arith.constant 128 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %cst_0 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc1) + %cst_1 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<1x4xi32> loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc1) + %cst_4 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc36) + %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc37) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc38) + %xindex_6 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc39) + %xindex_7 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32> loc(#loc40) + %xindex_8 = arith.addi %xindex_7, %xindex_6 : tensor<64x1xi32> loc(#loc40) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc41) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc42) + %x0 = arith.remsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc43) + %x1 = arith.divsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc44) + %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32 iter_args(%_tmp4_11 = %cst_3) -> (tensor<64x4xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc46) + %r0_index_12 = arith.addi %r0_index, %r0_base_9 : tensor<1x4xi32> loc(#loc46) + %r0_mask = arith.cmpi slt, %r0_index_12, %cst_2 : tensor<1x4xi32> loc(#loc47) + %tmp0 = arith.muli %x0, %cst_1 : tensor<64x1xi32> loc(#loc48) + %tmp0_13 = tt.broadcast %r0_index_12 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc49) + %tmp0_14 = tt.broadcast %tmp0 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc49) + %tmp0_15 = arith.addi %tmp0_13, %tmp0_14 : tensor<64x4xi32> loc(#loc49) + %tmp0_16 = arith.muli %x1, %cst_0 : tensor<64x1xi32> loc(#loc50) + %tmp0_17 = tt.broadcast %tmp0_16 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc51) + %tmp0_18 = arith.addi %tmp0_15, %tmp0_17 : tensor<64x4xi32> loc(#loc51) + %tmp0_19 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc52) + %tmp0_20 = tt.addptr %tmp0_19, %tmp0_18 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc52) + %tmp0_21 = tt.broadcast %r0_mask : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc53) + %tmp0_22 = tt.load %tmp0_20, %tmp0_21, %cst evictionPolicy = evict_first : tensor<64x4x!tt.ptr> loc(#loc53) + %tmp0_23 = arith.extf %tmp0_22 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc54) + %tmp2 = arith.mulf %tmp0_23, %tmp0_23 : tensor<64x4xf32> loc(#loc55) + %tmp5 = arith.addf %_tmp4_11, %tmp2 : tensor<64x4xf32> loc(#loc56) + %_tmp4_24 = arith.select %tmp0_21, %tmp5, %_tmp4_11 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc57) + scf.yield %_tmp4_24 : tensor<64x4xf32> loc(#loc24) + } loc(#loc45) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_11: f32 loc(callsite(#loc1 at #loc58)), %tmp4_12: f32 loc(callsite(#loc1 at #loc58))): + %tmp4_13 = arith.addf %tmp4_11, %tmp4_12 : f32 loc(#loc62) + tt.reduce.return %tmp4_13 : f32 loc(#loc60) + }) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc60) + %tmp4_10 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc59) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc29) + %1 = tt.addptr %0, %xindex_8 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc29) + tt.store %1, %tmp4_10 : tensor<64x1x!tt.ptr> loc(#loc30) + tt.return loc(#loc31) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":32:43) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:27) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":33:31) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:8) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc27 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4) +#loc36 = loc("xoffset"(#loc3)) +#loc37 = loc("xoffset"(#loc4)) +#loc38 = loc("xindex"(#loc5)) +#loc39 = loc("xindex"(#loc6)) +#loc40 = loc("xindex"(#loc7)) +#loc41 = loc("r0_base"(#loc8)) +#loc42 = loc("r0_base"(#loc9)) +#loc43 = loc("x0"(#loc10)) +#loc44 = loc("x1"(#loc11)) +#loc45 = loc("_tmp4"(#loc2)) +#loc46 = loc("r0_index"(#loc12)) +#loc47 = loc("r0_mask"(#loc13)) +#loc48 = loc("tmp0"(#loc14)) +#loc49 = loc("tmp0"(#loc15)) +#loc50 = loc("tmp0"(#loc16)) +#loc51 = loc("tmp0"(#loc17)) +#loc52 = loc("tmp0"(#loc18)) +#loc53 = loc("tmp0"(#loc19)) +#loc54 = loc("tmp0"(#loc20)) +#loc55 = loc("tmp2"(#loc21)) +#loc56 = loc("tmp5"(#loc22)) +#loc57 = loc("_tmp4"(#loc23)) +#loc59 = loc("tmp4"(#loc28)) +#loc60 = loc(callsite(#loc25 at #loc58)) +#loc62 = loc(callsite(#loc27 at #loc60)) diff --git a/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..437a9fa9e3e5b6ec89ecc05c10df6e70ac93c49c --- /dev/null +++ b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json"}} \ No newline at end of file diff --git a/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..fc08911e17a8343c307df373a327389a91fde45d Binary files /dev/null and b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin differ diff --git a/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a6b79ea1c064c00215e5996dbbb2d9fa83ce132e --- /dev/null +++ b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json @@ -0,0 +1 @@ +{"hash": "8a233f1e86193a5516e1f31dfb239201f7e06d42a2e572f891a8dd47d6ac6ee0", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 2048, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0"} \ No newline at end of file diff --git a/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..be548701fb837c24d0c458c7bfa86b08fd3b470c --- /dev/null +++ b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir @@ -0,0 +1,891 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %13 = shl i32 %12, 2, !dbg !9 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %15 = and i32 %14, 127, !dbg !10 + %16 = and i32 %14, 96, !dbg !10 + %17 = lshr exact i32 %16, 5, !dbg !10 + %18 = and i32 %14, 3, !dbg !10 + %19 = or disjoint i32 %17, %13, !dbg !11 + %20 = or disjoint i32 %13, %18, !dbg !11 + %21 = shl nuw nsw i32 %14, 2, !dbg !12 + %22 = and i32 %21, 124, !dbg !12 + %23 = and i32 %14, 124, !dbg !12 + %24 = lshr i32 %14, 2, !dbg !12 + %25 = sdiv i32 %19, 32, !dbg !13 + %26 = mul i32 %25, 32, !dbg !14 + %.decomposed = sub i32 %19, %26, !dbg !14 + %27 = sdiv i32 %20, 32, !dbg !13 + %28 = or disjoint i32 %22, 4096, !dbg !15 + %29 = shl nsw i32 %.decomposed, 7, !dbg !16 + %30 = add nsw i32 %28, %29, !dbg !17 + %31 = mul i32 %25, 36864, !dbg !18 + %32 = add i32 %30, %31, !dbg !19 + %33 = sext i32 %32 to i64, !dbg !20 + %34 = getelementptr bfloat, ptr addrspace(1) %2, i64 %33, !dbg !20 + %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !21 + %36 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %34, i64 %35, i1 true) #6, !dbg !21 + %37 = extractvalue { i32, i32 } %36, 0, !dbg !21 + %38 = bitcast i32 %37 to <2 x bfloat>, !dbg !21 + %39 = extractvalue { i32, i32 } %36, 1, !dbg !21 + %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !21 + %41 = extractelement <2 x bfloat> %38, i64 0, !dbg !21 + %42 = extractelement <2 x bfloat> %38, i64 1, !dbg !21 + %43 = extractelement <2 x bfloat> %40, i64 0, !dbg !21 + %44 = extractelement <2 x bfloat> %40, i64 1, !dbg !21 + %45 = fpext bfloat %41 to float, !dbg !22 + %46 = fpext bfloat %42 to float, !dbg !22 + %47 = fpext bfloat %43 to float, !dbg !22 + %48 = fpext bfloat %44 to float, !dbg !22 + %49 = or disjoint i32 %29, %22, !dbg !23 + %50 = add i32 %49, %31, !dbg !24 + %51 = sext i32 %50 to i64, !dbg !25 + %52 = getelementptr bfloat, ptr addrspace(1) %2, i64 %51, !dbg !25 + %53 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !26 + %54 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %52, i64 %53, i1 true) #6, !dbg !26 + %55 = extractvalue { i32, i32 } %54, 0, !dbg !26 + %56 = bitcast i32 %55 to <2 x bfloat>, !dbg !26 + %57 = extractvalue { i32, i32 } %54, 1, !dbg !26 + %58 = bitcast i32 %57 to <2 x bfloat>, !dbg !26 + %59 = extractelement <2 x bfloat> %56, i64 0, !dbg !26 + %60 = extractelement <2 x bfloat> %56, i64 1, !dbg !26 + %61 = extractelement <2 x bfloat> %58, i64 0, !dbg !26 + %62 = extractelement <2 x bfloat> %58, i64 1, !dbg !26 + %63 = fpext bfloat %59 to float, !dbg !27 + %64 = fpext bfloat %60 to float, !dbg !27 + %65 = fpext bfloat %61 to float, !dbg !27 + %66 = fpext bfloat %62 to float, !dbg !27 + %67 = fmul float %45, %45, !dbg !28 + %68 = fmul float %46, %46, !dbg !28 + %69 = fmul float %47, %47, !dbg !28 + %70 = fmul float %48, %48, !dbg !28 + %71 = fmul float %63, %63, !dbg !29 + %72 = fmul float %64, %64, !dbg !29 + %73 = fmul float %65, %65, !dbg !29 + %74 = fmul float %66, %66, !dbg !29 + %75 = fadd float %67, %68, !dbg !30 + %76 = fadd float %69, %75, !dbg !30 + %77 = fadd float %70, %76, !dbg !30 + %78 = bitcast float %77 to i32, !dbg !33 + %79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 16, i32 31), !dbg !33 + %80 = bitcast i32 %79 to float, !dbg !33 + %81 = fadd float %77, %80, !dbg !30 + %82 = bitcast float %81 to i32, !dbg !33 + %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 8, i32 31), !dbg !33 + %84 = bitcast i32 %83 to float, !dbg !33 + %85 = fadd float %81, %84, !dbg !30 + %86 = bitcast float %85 to i32, !dbg !33 + %87 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 4, i32 31), !dbg !33 + %88 = bitcast i32 %87 to float, !dbg !33 + %89 = fadd float %85, %88, !dbg !30 + %90 = bitcast float %89 to i32, !dbg !33 + %91 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %90, i32 2, i32 31), !dbg !33 + %92 = bitcast i32 %91 to float, !dbg !33 + %93 = fadd float %89, %92, !dbg !30 + %94 = bitcast float %93 to i32, !dbg !33 + %95 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %94, i32 1, i32 31), !dbg !33 + %96 = bitcast i32 %95 to float, !dbg !33 + %97 = fadd float %93, %96, !dbg !30 + %98 = fadd float %71, %72, !dbg !36 + %99 = fadd float %73, %98, !dbg !36 + %100 = fadd float %74, %99, !dbg !36 + %101 = bitcast float %100 to i32, !dbg !37 + %102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 16, i32 31), !dbg !37 + %103 = bitcast i32 %102 to float, !dbg !37 + %104 = fadd float %100, %103, !dbg !36 + %105 = bitcast float %104 to i32, !dbg !37 + %106 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %105, i32 8, i32 31), !dbg !37 + %107 = bitcast i32 %106 to float, !dbg !37 + %108 = fadd float %104, %107, !dbg !36 + %109 = bitcast float %108 to i32, !dbg !37 + %110 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %109, i32 4, i32 31), !dbg !37 + %111 = bitcast i32 %110 to float, !dbg !37 + %112 = fadd float %108, %111, !dbg !36 + %113 = bitcast float %112 to i32, !dbg !37 + %114 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %113, i32 2, i32 31), !dbg !37 + %115 = bitcast i32 %114 to float, !dbg !37 + %116 = fadd float %112, %115, !dbg !36 + %117 = bitcast float %116 to i32, !dbg !37 + %118 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %117, i32 1, i32 31), !dbg !37 + %119 = bitcast i32 %118 to float, !dbg !37 + %120 = fadd float %116, %119, !dbg !36 + %121 = and i32 %24, 1, !dbg !39 + %122 = zext nneg i32 %15 to i64, !dbg !40 + %123 = getelementptr bfloat, ptr addrspace(1) %3, i64 %122, !dbg !40 + %124 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !41 + %125 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %123, i64 %124, i1 true) #6, !dbg !41 + %126 = bitcast i16 %125 to bfloat, !dbg !41 + %127 = fpext bfloat %126 to float, !dbg !42 + %128 = shl i32 %25, 7, !dbg !43 + %129 = or disjoint i32 %128, %22, !dbg !44 + %130 = sext i32 %129 to i64, !dbg !45 + %131 = getelementptr float, ptr addrspace(1) %4, i64 %130, !dbg !45 + %132 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !46 + %133 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %131, i64 %132, i1 true) #6, !dbg !46 + %134 = extractvalue { i32, i32, i32, i32 } %133, 0, !dbg !46 + %135 = extractvalue { i32, i32, i32, i32 } %133, 1, !dbg !46 + %136 = extractvalue { i32, i32, i32, i32 } %133, 2, !dbg !46 + %137 = extractvalue { i32, i32, i32, i32 } %133, 3, !dbg !46 + %138 = bitcast i32 %134 to float, !dbg !46 + %139 = bitcast i32 %135 to float, !dbg !46 + %140 = bitcast i32 %136 to float, !dbg !46 + %141 = bitcast i32 %137 to float, !dbg !46 + %142 = and i32 %14, 7, !dbg !46 + %143 = shl nuw nsw i32 %142, 4, !dbg !46 + %144 = shl nuw nsw i32 %16, 2, !dbg !46 + %145 = lshr i32 %14, 1, !dbg !46 + %146 = and i32 %145, 12, !dbg !46 + %147 = or disjoint i32 %143, %144, !dbg !46 + %148 = or disjoint i32 %146, %16, !dbg !46 + %149 = xor i32 %147, %148, !dbg !46 + %150 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %149, !dbg !46 + %151 = insertelement <1 x i32> poison, i32 %134, i64 0, !dbg !46 + store <1 x i32> %151, ptr addrspace(3) %150, align 4, !dbg !46 + %152 = xor i32 %149, 516, !dbg !46 + %153 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %152, !dbg !46 + %154 = insertelement <1 x i32> poison, i32 %135, i64 0, !dbg !46 + store <1 x i32> %154, ptr addrspace(3) %153, align 4, !dbg !46 + %155 = xor i32 %149, 1032, !dbg !46 + %156 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %155, !dbg !46 + %157 = insertelement <1 x i32> poison, i32 %136, i64 0, !dbg !46 + store <1 x i32> %157, ptr addrspace(3) %156, align 4, !dbg !46 + %158 = xor i32 %149, 1548, !dbg !46 + %159 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %158, !dbg !46 + %160 = insertelement <1 x i32> poison, i32 %137, i64 0, !dbg !46 + store <1 x i32> %160, ptr addrspace(3) %159, align 4, !dbg !46 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %161 = shl nuw nsw i32 %14, 7, !dbg !46 + %162 = and i32 %161, 1920, !dbg !46 + %163 = shl nuw nsw i32 %18, 5, !dbg !46 + %164 = xor i32 %163, %23, !dbg !46 + %165 = or disjoint i32 %164, %162, !dbg !46 + %166 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %165, !dbg !46 + %167 = load float, ptr addrspace(3) %166, align 4, !dbg !46 + %168 = xor i32 %165, 4, !dbg !46 + %169 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %168, !dbg !46 + %170 = load float, ptr addrspace(3) %169, align 4, !dbg !46 + %171 = xor i32 %165, 8, !dbg !46 + %172 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %171, !dbg !46 + %173 = load float, ptr addrspace(3) %172, align 4, !dbg !46 + %174 = xor i32 %165, 12, !dbg !46 + %175 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %174, !dbg !46 + %176 = load float, ptr addrspace(3) %175, align 4, !dbg !46 + %177 = getelementptr float, ptr addrspace(1) %5, i64 %130, !dbg !47 + %178 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !48 + %179 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %177, i64 %178, i1 true) #6, !dbg !48 + %180 = extractvalue { i32, i32, i32, i32 } %179, 0, !dbg !48 + %181 = extractvalue { i32, i32, i32, i32 } %179, 1, !dbg !48 + %182 = extractvalue { i32, i32, i32, i32 } %179, 2, !dbg !48 + %183 = extractvalue { i32, i32, i32, i32 } %179, 3, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %184 = insertelement <1 x i32> poison, i32 %180, i64 0, !dbg !48 + store <1 x i32> %184, ptr addrspace(3) %150, align 4, !dbg !48 + %185 = insertelement <1 x i32> poison, i32 %181, i64 0, !dbg !48 + store <1 x i32> %185, ptr addrspace(3) %153, align 4, !dbg !48 + %186 = insertelement <1 x i32> poison, i32 %182, i64 0, !dbg !48 + store <1 x i32> %186, ptr addrspace(3) %156, align 4, !dbg !48 + %187 = insertelement <1 x i32> poison, i32 %183, i64 0, !dbg !48 + store <1 x i32> %187, ptr addrspace(3) %159, align 4, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %188 = load float, ptr addrspace(3) %166, align 4, !dbg !48 + %189 = load float, ptr addrspace(3) %169, align 4, !dbg !48 + %190 = load float, ptr addrspace(3) %172, align 4, !dbg !48 + %191 = load float, ptr addrspace(3) %175, align 4, !dbg !48 + %192 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !49 + %193 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %34, i64 %192, i1 true) #6, !dbg !49 + %194 = getelementptr bfloat, ptr addrspace(1) %6, i64 %122, !dbg !50 + %195 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !51 + %196 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %194, i64 %195, i1 true) #6, !dbg !51 + %197 = icmp eq i32 %121, 0, !dbg !52 + %198 = and i32 %24, 30, !dbg !53 + %199 = or disjoint i32 %198, 32, !dbg !53 + %200 = or disjoint i32 %198, 64, !dbg !53 + %201 = or disjoint i32 %198, 96, !dbg !53 + %202 = or disjoint i32 %198, 1, !dbg !54 + %203 = or disjoint i32 %198, 33, !dbg !54 + %204 = or disjoint i32 %198, 65, !dbg !54 + %205 = or disjoint i32 %198, 97, !dbg !54 + %206 = shl i32 %20, 7, !dbg !55 + %207 = shl i32 %27, 15, !dbg !55 + %208 = add i32 %207, %206, !dbg !55 + %209 = or disjoint i32 %208, %202, !dbg !56 + %210 = or disjoint i32 %208, %203, !dbg !56 + %211 = or disjoint i32 %208, %204, !dbg !56 + %212 = or disjoint i32 %208, %205, !dbg !56 + %213 = sext i32 %209 to i64, !dbg !57 + %214 = getelementptr bfloat, ptr addrspace(1) %2, i64 %213, !dbg !57 + %215 = sext i32 %210 to i64, !dbg !57 + %216 = getelementptr bfloat, ptr addrspace(1) %2, i64 %215, !dbg !57 + %217 = sext i32 %211 to i64, !dbg !57 + %218 = getelementptr bfloat, ptr addrspace(1) %2, i64 %217, !dbg !57 + %219 = sext i32 %212 to i64, !dbg !57 + %220 = getelementptr bfloat, ptr addrspace(1) %2, i64 %219, !dbg !57 + %221 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58 + %222 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %214, i64 %221, i1 %197) #6, !dbg !58 + %223 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58 + %224 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %216, i64 %223, i1 %197) #6, !dbg !58 + %225 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58 + %226 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %218, i64 %225, i1 %197) #6, !dbg !58 + %227 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58 + %228 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %220, i64 %227, i1 %197) #6, !dbg !58 + %229 = tail call float @llvm.nvvm.div.full(float %120, float 1.280000e+02), !dbg !59 + %230 = fadd float %229, 0x3EB0C6F7A0000000, !dbg !60 + %231 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not.i = icmp eq i32 %231, 0, !dbg !61 + br i1 %.not.i, label %234, label %232, !dbg !61 + +232: ; preds = %11 + %233 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %230), !dbg !61 + br label %__nv_rsqrtf.exit, !dbg !61 + +234: ; preds = %11 + %235 = tail call float @llvm.nvvm.rsqrt.approx.f(float %230), !dbg !61 + br label %__nv_rsqrtf.exit, !dbg !61 + +__nv_rsqrtf.exit: ; preds = %232, %234 + %.0.i = phi float [ %233, %232 ], [ %235, %234 ], !dbg !61 + %236 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %237 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %238 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not.i7 = icmp eq i32 %238, 0, !dbg !61 + br i1 %.not.i7, label %241, label %239, !dbg !61 + +239: ; preds = %__nv_rsqrtf.exit + %240 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %230), !dbg !61 + br label %__nv_rsqrtf.exit9, !dbg !61 + +241: ; preds = %__nv_rsqrtf.exit + %242 = tail call float @llvm.nvvm.rsqrt.approx.f(float %230), !dbg !61 + br label %__nv_rsqrtf.exit9, !dbg !61 + +__nv_rsqrtf.exit9: ; preds = %239, %241 + %.0.i8 = phi float [ %240, %239 ], [ %242, %241 ], !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !62 + %243 = lshr exact i32 %16, 3, !dbg !62 + %244 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %243, !dbg !62 + store float %.0.i, ptr addrspace(3) %244, align 4, !dbg !62 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !62 + %245 = shl nuw nsw i32 %18, 2, !dbg !62 + %246 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %245, !dbg !62 + %247 = load float, ptr addrspace(3) %246, align 4, !dbg !62 + %248 = zext nneg i32 %202 to i64, !dbg !63 + %249 = getelementptr bfloat, ptr addrspace(1) %3, i64 %248, !dbg !63 + %250 = zext nneg i32 %203 to i64, !dbg !63 + %251 = getelementptr bfloat, ptr addrspace(1) %3, i64 %250, !dbg !63 + %252 = zext nneg i32 %204 to i64, !dbg !63 + %253 = getelementptr bfloat, ptr addrspace(1) %3, i64 %252, !dbg !63 + %254 = zext nneg i32 %205 to i64, !dbg !63 + %255 = getelementptr bfloat, ptr addrspace(1) %3, i64 %254, !dbg !63 + %256 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !64 + %257 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %249, i64 %256, i1 %197) #6, !dbg !64 + %258 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !64 + %259 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %251, i64 %258, i1 %197) #6, !dbg !64 + %260 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !64 + %261 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %253, i64 %260, i1 %197) #6, !dbg !64 + %262 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !64 + %263 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %255, i64 %262, i1 %197) #6, !dbg !64 + %264 = icmp ne i32 %121, 0, !dbg !65 + %265 = or disjoint i32 %208, %198, !dbg !66 + %266 = or disjoint i32 %208, %199, !dbg !66 + %267 = or disjoint i32 %208, %200, !dbg !66 + %268 = or disjoint i32 %208, %201, !dbg !66 + %269 = sext i32 %265 to i64, !dbg !67 + %270 = getelementptr bfloat, ptr addrspace(1) %2, i64 %269, !dbg !67 + %271 = sext i32 %266 to i64, !dbg !67 + %272 = getelementptr bfloat, ptr addrspace(1) %2, i64 %271, !dbg !67 + %273 = sext i32 %267 to i64, !dbg !67 + %274 = getelementptr bfloat, ptr addrspace(1) %2, i64 %273, !dbg !67 + %275 = sext i32 %268 to i64, !dbg !67 + %276 = getelementptr bfloat, ptr addrspace(1) %2, i64 %275, !dbg !67 + %277 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !68 + %278 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %270, i64 %277, i1 %264) #6, !dbg !68 + %279 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !68 + %280 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %272, i64 %279, i1 %264) #6, !dbg !68 + %281 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !68 + %282 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %274, i64 %281, i1 %264) #6, !dbg !68 + %283 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !68 + %284 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %276, i64 %283, i1 %264) #6, !dbg !68 + %285 = zext nneg i32 %198 to i64, !dbg !69 + %286 = getelementptr bfloat, ptr addrspace(1) %3, i64 %285, !dbg !69 + %287 = zext nneg i32 %199 to i64, !dbg !69 + %288 = getelementptr bfloat, ptr addrspace(1) %3, i64 %287, !dbg !69 + %289 = zext nneg i32 %200 to i64, !dbg !69 + %290 = getelementptr bfloat, ptr addrspace(1) %3, i64 %289, !dbg !69 + %291 = zext nneg i32 %201 to i64, !dbg !69 + %292 = getelementptr bfloat, ptr addrspace(1) %3, i64 %291, !dbg !69 + %293 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !70 + %294 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %286, i64 %293, i1 %264) #6, !dbg !70 + %295 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !70 + %296 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %288, i64 %295, i1 %264) #6, !dbg !70 + %297 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !70 + %298 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %290, i64 %297, i1 %264) #6, !dbg !70 + %299 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !70 + %300 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %292, i64 %299, i1 %264) #6, !dbg !70 + %301 = fmul float %.0.i8, %63, !dbg !71 + %302 = fmul float %.0.i8, %64, !dbg !71 + %303 = fmul float %.0.i8, %65, !dbg !71 + %304 = fmul float %.0.i8, %66, !dbg !71 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !72 + %305 = shl nuw nsw i32 %18, 7, !dbg !72 + %306 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %305, !dbg !72 + %307 = getelementptr inbounds nuw i8, ptr addrspace(3) %306, i32 %164, !dbg !72 + store float %127, ptr addrspace(3) %307, align 4, !dbg !72 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !72 + %308 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %22, !dbg !72 + %309 = load float, ptr addrspace(3) %308, align 4, !dbg !72 + %310 = xor i32 %22, 160, !dbg !72 + %311 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %310, !dbg !72 + %312 = load float, ptr addrspace(3) %311, align 4, !dbg !72 + %313 = xor i32 %22, 320, !dbg !72 + %314 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %313, !dbg !72 + %315 = load float, ptr addrspace(3) %314, align 4, !dbg !72 + %316 = xor i32 %22, 480, !dbg !72 + %317 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %316, !dbg !72 + %318 = load float, ptr addrspace(3) %317, align 4, !dbg !72 + %319 = fmul float %301, %309, !dbg !72 + %320 = fmul float %302, %312, !dbg !72 + %321 = fmul float %303, %315, !dbg !72 + %322 = fmul float %304, %318, !dbg !72 + %323 = fmul float %319, %138, !dbg !73 + %324 = fmul float %320, %139, !dbg !73 + %325 = fmul float %321, %140, !dbg !73 + %326 = fmul float %322, %141, !dbg !73 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !73 + store float %323, ptr addrspace(3) %150, align 4, !dbg !73 + store float %324, ptr addrspace(3) %153, align 4, !dbg !73 + store float %325, ptr addrspace(3) %156, align 4, !dbg !73 + store float %326, ptr addrspace(3) %159, align 4, !dbg !73 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !73 + %327 = load float, ptr addrspace(3) %166, align 4, !dbg !73 + %328 = load float, ptr addrspace(3) %169, align 4, !dbg !73 + %329 = load float, ptr addrspace(3) %172, align 4, !dbg !73 + %330 = load float, ptr addrspace(3) %175, align 4, !dbg !73 + %331 = add i32 %208, 4097, !dbg !74 + %332 = or disjoint i32 %331, %198, !dbg !75 + %333 = add i32 %208, 4129, !dbg !74 + %334 = or disjoint i32 %333, %198, !dbg !75 + %335 = add i32 %208, 4161, !dbg !74 + %336 = or disjoint i32 %335, %198, !dbg !75 + %337 = add i32 %208, 4193, !dbg !74 + %338 = or disjoint i32 %337, %198, !dbg !75 + %339 = sext i32 %332 to i64, !dbg !76 + %340 = getelementptr bfloat, ptr addrspace(1) %2, i64 %339, !dbg !76 + %341 = sext i32 %334 to i64, !dbg !76 + %342 = getelementptr bfloat, ptr addrspace(1) %2, i64 %341, !dbg !76 + %343 = sext i32 %336 to i64, !dbg !76 + %344 = getelementptr bfloat, ptr addrspace(1) %2, i64 %343, !dbg !76 + %345 = sext i32 %338 to i64, !dbg !76 + %346 = getelementptr bfloat, ptr addrspace(1) %2, i64 %345, !dbg !76 + %347 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !77 + %348 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %340, i64 %347, i1 %197) #6, !dbg !77 + %349 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !77 + %350 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %342, i64 %349, i1 %197) #6, !dbg !77 + %351 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !77 + %352 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %344, i64 %351, i1 %197) #6, !dbg !77 + %353 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !77 + %354 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %346, i64 %353, i1 %197) #6, !dbg !77 + %355 = tail call float @llvm.nvvm.div.full(float %97, float 1.280000e+02), !dbg !78 + %356 = fadd float %355, 0x3EB0C6F7A0000000, !dbg !79 + %357 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !80 + %.not.i10 = icmp eq i32 %357, 0, !dbg !80 + br i1 %.not.i10, label %360, label %358, !dbg !80 + +358: ; preds = %__nv_rsqrtf.exit9 + %359 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %356), !dbg !80 + br label %__nv_rsqrtf.exit12, !dbg !80 + +360: ; preds = %__nv_rsqrtf.exit9 + %361 = tail call float @llvm.nvvm.rsqrt.approx.f(float %356), !dbg !80 + br label %__nv_rsqrtf.exit12, !dbg !80 + +__nv_rsqrtf.exit12: ; preds = %358, %360 + %.0.i11 = phi float [ %359, %358 ], [ %361, %360 ], !dbg !80 + %362 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !80 + %363 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !80 + %364 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !80 + %.not.i19 = icmp eq i32 %364, 0, !dbg !80 + br i1 %.not.i19, label %367, label %365, !dbg !80 + +365: ; preds = %__nv_rsqrtf.exit12 + %366 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %356), !dbg !80 + br label %__nv_rsqrtf.exit21, !dbg !80 + +367: ; preds = %__nv_rsqrtf.exit12 + %368 = tail call float @llvm.nvvm.rsqrt.approx.f(float %356), !dbg !80 + br label %__nv_rsqrtf.exit21, !dbg !80 + +__nv_rsqrtf.exit21: ; preds = %365, %367 + %.0.i20 = phi float [ %366, %365 ], [ %368, %367 ], !dbg !80 + %369 = bitcast i16 %354 to bfloat, !dbg !77 + %370 = fpext bfloat %369 to float, !dbg !81 + %371 = bitcast i16 %352 to bfloat, !dbg !77 + %372 = fpext bfloat %371 to float, !dbg !81 + %373 = bitcast i16 %350 to bfloat, !dbg !77 + %374 = fpext bfloat %373 to float, !dbg !81 + %375 = bitcast i16 %348 to bfloat, !dbg !77 + %376 = fpext bfloat %375 to float, !dbg !81 + %377 = bitcast i16 %228 to bfloat, !dbg !58 + %378 = fpext bfloat %377 to float, !dbg !82 + %379 = fmul float %247, %378, !dbg !62 + %380 = bitcast i16 %263 to bfloat, !dbg !64 + %381 = fpext bfloat %380 to float, !dbg !83 + %382 = fmul float %379, %381, !dbg !84 + %383 = fsub float 0.000000e+00, %382, !dbg !85 + %384 = bitcast i16 %284 to bfloat, !dbg !68 + %385 = fpext bfloat %384 to float, !dbg !86 + %386 = fmul float %247, %385, !dbg !87 + %387 = bitcast i16 %300 to bfloat, !dbg !70 + %388 = fpext bfloat %387 to float, !dbg !88 + %389 = fmul float %386, %388, !dbg !89 + %390 = select i1 %197, float %383, float %389, !dbg !90 + %391 = fmul float %191, %390, !dbg !91 + %392 = fadd float %391, %330, !dbg !92 + %393 = bitcast i16 %226 to bfloat, !dbg !58 + %394 = fpext bfloat %393 to float, !dbg !82 + %395 = fmul float %247, %394, !dbg !62 + %396 = bitcast i16 %261 to bfloat, !dbg !64 + %397 = fpext bfloat %396 to float, !dbg !83 + %398 = fmul float %395, %397, !dbg !84 + %399 = fsub float 0.000000e+00, %398, !dbg !85 + %400 = bitcast i16 %282 to bfloat, !dbg !68 + %401 = fpext bfloat %400 to float, !dbg !86 + %402 = fmul float %247, %401, !dbg !87 + %403 = bitcast i16 %298 to bfloat, !dbg !70 + %404 = fpext bfloat %403 to float, !dbg !88 + %405 = fmul float %402, %404, !dbg !89 + %406 = select i1 %197, float %399, float %405, !dbg !90 + %407 = fmul float %190, %406, !dbg !91 + %408 = fadd float %407, %329, !dbg !92 + %409 = bitcast i16 %224 to bfloat, !dbg !58 + %410 = fpext bfloat %409 to float, !dbg !82 + %411 = fmul float %247, %410, !dbg !62 + %412 = bitcast i16 %259 to bfloat, !dbg !64 + %413 = fpext bfloat %412 to float, !dbg !83 + %414 = fmul float %411, %413, !dbg !84 + %415 = fsub float 0.000000e+00, %414, !dbg !85 + %416 = bitcast i16 %280 to bfloat, !dbg !68 + %417 = fpext bfloat %416 to float, !dbg !86 + %418 = fmul float %247, %417, !dbg !87 + %419 = bitcast i16 %296 to bfloat, !dbg !70 + %420 = fpext bfloat %419 to float, !dbg !88 + %421 = fmul float %418, %420, !dbg !89 + %422 = select i1 %197, float %415, float %421, !dbg !90 + %423 = fmul float %189, %422, !dbg !91 + %424 = fadd float %423, %328, !dbg !92 + %425 = bitcast i16 %222 to bfloat, !dbg !58 + %426 = fpext bfloat %425 to float, !dbg !82 + %427 = fmul float %247, %426, !dbg !62 + %428 = bitcast i16 %257 to bfloat, !dbg !64 + %429 = fpext bfloat %428 to float, !dbg !83 + %430 = fmul float %427, %429, !dbg !84 + %431 = fsub float 0.000000e+00, %430, !dbg !85 + %432 = bitcast i16 %278 to bfloat, !dbg !68 + %433 = fpext bfloat %432 to float, !dbg !86 + %434 = fmul float %247, %433, !dbg !87 + %435 = bitcast i16 %294 to bfloat, !dbg !70 + %436 = fpext bfloat %435 to float, !dbg !88 + %437 = fmul float %434, %436, !dbg !89 + %438 = select i1 %197, float %431, float %437, !dbg !90 + %439 = fmul float %188, %438, !dbg !91 + %440 = fadd float %439, %327, !dbg !92 + %441 = bitcast i16 %196 to bfloat, !dbg !51 + %442 = fpext bfloat %441 to float, !dbg !93 + %443 = extractvalue { i32, i32 } %193, 1, !dbg !49 + %444 = bitcast i32 %443 to <2 x bfloat>, !dbg !49 + %445 = extractelement <2 x bfloat> %444, i64 1, !dbg !49 + %446 = fpext bfloat %445 to float, !dbg !94 + %447 = extractelement <2 x bfloat> %444, i64 0, !dbg !49 + %448 = fpext bfloat %447 to float, !dbg !94 + %449 = extractvalue { i32, i32 } %193, 0, !dbg !49 + %450 = bitcast i32 %449 to <2 x bfloat>, !dbg !49 + %451 = extractelement <2 x bfloat> %450, i64 1, !dbg !49 + %452 = fpext bfloat %451 to float, !dbg !94 + %453 = extractelement <2 x bfloat> %450, i64 0, !dbg !49 + %454 = fpext bfloat %453 to float, !dbg !94 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !95 + store float %.0.i11, ptr addrspace(3) %244, align 4, !dbg !95 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !95 + %455 = load float, ptr addrspace(3) %246, align 4, !dbg !95 + %456 = fmul float %455, %376, !dbg !95 + %457 = fmul float %455, %374, !dbg !95 + %458 = fmul float %455, %372, !dbg !95 + %459 = fmul float %455, %370, !dbg !95 + %460 = getelementptr bfloat, ptr addrspace(1) %6, i64 %248, !dbg !96 + %461 = getelementptr bfloat, ptr addrspace(1) %6, i64 %250, !dbg !96 + %462 = getelementptr bfloat, ptr addrspace(1) %6, i64 %252, !dbg !96 + %463 = getelementptr bfloat, ptr addrspace(1) %6, i64 %254, !dbg !96 + %464 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !97 + %465 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %460, i64 %464, i1 %197) #6, !dbg !97 + %466 = bitcast i16 %465 to bfloat, !dbg !97 + %467 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !97 + %468 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %461, i64 %467, i1 %197) #6, !dbg !97 + %469 = bitcast i16 %468 to bfloat, !dbg !97 + %470 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !97 + %471 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %462, i64 %470, i1 %197) #6, !dbg !97 + %472 = bitcast i16 %471 to bfloat, !dbg !97 + %473 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !97 + %474 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %463, i64 %473, i1 %197) #6, !dbg !97 + %475 = bitcast i16 %474 to bfloat, !dbg !97 + %476 = fpext bfloat %466 to float, !dbg !98 + %477 = fpext bfloat %469 to float, !dbg !98 + %478 = fpext bfloat %472 to float, !dbg !98 + %479 = fpext bfloat %475 to float, !dbg !98 + %480 = fmul float %456, %476, !dbg !99 + %481 = fmul float %457, %477, !dbg !99 + %482 = fmul float %458, %478, !dbg !99 + %483 = fmul float %459, %479, !dbg !99 + %484 = fsub float 0.000000e+00, %480, !dbg !100 + %485 = fsub float 0.000000e+00, %481, !dbg !100 + %486 = fsub float 0.000000e+00, %482, !dbg !100 + %487 = fsub float 0.000000e+00, %483, !dbg !100 + %488 = add i32 %208, 4096, !dbg !101 + %489 = or disjoint i32 %488, %198, !dbg !102 + %490 = add i32 %208, 4128, !dbg !101 + %491 = or disjoint i32 %490, %198, !dbg !102 + %492 = add i32 %208, 4160, !dbg !101 + %493 = or disjoint i32 %492, %198, !dbg !102 + %494 = add i32 %208, 4192, !dbg !101 + %495 = or disjoint i32 %494, %198, !dbg !102 + %496 = sext i32 %489 to i64, !dbg !103 + %497 = getelementptr bfloat, ptr addrspace(1) %2, i64 %496, !dbg !103 + %498 = sext i32 %491 to i64, !dbg !103 + %499 = getelementptr bfloat, ptr addrspace(1) %2, i64 %498, !dbg !103 + %500 = sext i32 %493 to i64, !dbg !103 + %501 = getelementptr bfloat, ptr addrspace(1) %2, i64 %500, !dbg !103 + %502 = sext i32 %495 to i64, !dbg !103 + %503 = getelementptr bfloat, ptr addrspace(1) %2, i64 %502, !dbg !103 + %504 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !104 + %505 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %497, i64 %504, i1 %264) #6, !dbg !104 + %506 = bitcast i16 %505 to bfloat, !dbg !104 + %507 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !104 + %508 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %499, i64 %507, i1 %264) #6, !dbg !104 + %509 = bitcast i16 %508 to bfloat, !dbg !104 + %510 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !104 + %511 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %501, i64 %510, i1 %264) #6, !dbg !104 + %512 = bitcast i16 %511 to bfloat, !dbg !104 + %513 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !104 + %514 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %503, i64 %513, i1 %264) #6, !dbg !104 + %515 = bitcast i16 %514 to bfloat, !dbg !104 + %516 = fpext bfloat %506 to float, !dbg !105 + %517 = fpext bfloat %509 to float, !dbg !105 + %518 = fpext bfloat %512 to float, !dbg !105 + %519 = fpext bfloat %515 to float, !dbg !105 + %520 = fmul float %455, %516, !dbg !106 + %521 = fmul float %455, %517, !dbg !106 + %522 = fmul float %455, %518, !dbg !106 + %523 = fmul float %455, %519, !dbg !106 + %524 = getelementptr bfloat, ptr addrspace(1) %6, i64 %285, !dbg !107 + %525 = getelementptr bfloat, ptr addrspace(1) %6, i64 %287, !dbg !107 + %526 = getelementptr bfloat, ptr addrspace(1) %6, i64 %289, !dbg !107 + %527 = getelementptr bfloat, ptr addrspace(1) %6, i64 %291, !dbg !107 + %528 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !108 + %529 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %524, i64 %528, i1 %264) #6, !dbg !108 + %530 = bitcast i16 %529 to bfloat, !dbg !108 + %531 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !108 + %532 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %525, i64 %531, i1 %264) #6, !dbg !108 + %533 = bitcast i16 %532 to bfloat, !dbg !108 + %534 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !108 + %535 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %526, i64 %534, i1 %264) #6, !dbg !108 + %536 = bitcast i16 %535 to bfloat, !dbg !108 + %537 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !108 + %538 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %527, i64 %537, i1 %264) #6, !dbg !108 + %539 = bitcast i16 %538 to bfloat, !dbg !108 + %540 = fpext bfloat %530 to float, !dbg !109 + %541 = fpext bfloat %533 to float, !dbg !109 + %542 = fpext bfloat %536 to float, !dbg !109 + %543 = fpext bfloat %539 to float, !dbg !109 + %544 = fmul float %520, %540, !dbg !110 + %545 = fmul float %521, %541, !dbg !110 + %546 = fmul float %522, %542, !dbg !110 + %547 = fmul float %523, %543, !dbg !110 + %548 = select i1 %197, float %484, float %544, !dbg !90 + %549 = select i1 %197, float %485, float %545, !dbg !90 + %550 = select i1 %197, float %486, float %546, !dbg !90 + %551 = select i1 %197, float %487, float %547, !dbg !90 + %552 = fmul float %.0.i20, %454, !dbg !111 + %553 = fmul float %.0.i20, %452, !dbg !111 + %554 = fmul float %.0.i20, %448, !dbg !111 + %555 = fmul float %.0.i20, %446, !dbg !111 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !111 + store float %552, ptr addrspace(3) %150, align 4, !dbg !111 + store float %553, ptr addrspace(3) %153, align 4, !dbg !111 + store float %554, ptr addrspace(3) %156, align 4, !dbg !111 + store float %555, ptr addrspace(3) %159, align 4, !dbg !111 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !111 + %556 = load float, ptr addrspace(3) %166, align 4, !dbg !111 + %557 = load float, ptr addrspace(3) %169, align 4, !dbg !111 + %558 = load float, ptr addrspace(3) %172, align 4, !dbg !111 + %559 = load float, ptr addrspace(3) %175, align 4, !dbg !111 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !112 + %560 = shl nuw nsw i32 %15, 2, !dbg !112 + %561 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %560, !dbg !112 + store float %442, ptr addrspace(3) %561, align 4, !dbg !112 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !112 + %562 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %23, !dbg !112 + %563 = load float, ptr addrspace(3) %562, align 4, !dbg !112 + %564 = getelementptr inbounds nuw i8, ptr addrspace(3) %562, i32 128, !dbg !112 + %565 = load float, ptr addrspace(3) %564, align 4, !dbg !112 + %566 = getelementptr inbounds nuw i8, ptr addrspace(3) %562, i32 256, !dbg !112 + %567 = load float, ptr addrspace(3) %566, align 4, !dbg !112 + %568 = getelementptr inbounds nuw i8, ptr addrspace(3) %562, i32 384, !dbg !112 + %569 = load float, ptr addrspace(3) %568, align 4, !dbg !112 + %570 = fmul float %556, %563, !dbg !113 + %571 = fmul float %557, %565, !dbg !113 + %572 = fmul float %558, %567, !dbg !113 + %573 = fmul float %559, %569, !dbg !113 + %574 = fmul float %167, %570, !dbg !112 + %575 = fmul float %170, %571, !dbg !112 + %576 = fmul float %173, %572, !dbg !112 + %577 = fmul float %176, %573, !dbg !112 + %578 = fmul float %188, %548, !dbg !114 + %579 = fmul float %189, %549, !dbg !114 + %580 = fmul float %190, %550, !dbg !114 + %581 = fmul float %191, %551, !dbg !114 + %582 = fadd float %578, %574, !dbg !115 + %583 = fadd float %579, %575, !dbg !115 + %584 = fadd float %580, %576, !dbg !115 + %585 = fadd float %581, %577, !dbg !115 + %586 = shl i32 %19, 7, !dbg !116 + %587 = or disjoint i32 %586, %22, !dbg !117 + %588 = sext i32 %587 to i64, !dbg !118 + %589 = getelementptr bfloat, ptr addrspace(1) %0, i64 %588, !dbg !118 + %590 = fptrunc float %440 to bfloat, !dbg !119 + %591 = fptrunc float %424 to bfloat, !dbg !119 + %592 = fptrunc float %408 to bfloat, !dbg !119 + %593 = fptrunc float %392 to bfloat, !dbg !119 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119 + %594 = shl nuw nsw i32 %142, 7, !dbg !119 + %595 = lshr i32 %14, 4, !dbg !119 + %596 = and i32 %595, 2, !dbg !119 + %597 = and i32 %24, 16, !dbg !119 + %598 = or disjoint i32 %594, %596, !dbg !119 + %599 = or disjoint i32 %143, %146, !dbg !119 + %600 = xor i32 %599, %597, !dbg !119 + %601 = or disjoint i32 %600, %598, !dbg !119 + %602 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %601, !dbg !119 + store bfloat %590, ptr addrspace(3) %602, align 2, !dbg !119 + %603 = xor i32 %601, 32, !dbg !119 + %604 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %603, !dbg !119 + store bfloat %591, ptr addrspace(3) %604, align 2, !dbg !119 + %605 = xor i32 %601, 64, !dbg !119 + %606 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %605, !dbg !119 + store bfloat %592, ptr addrspace(3) %606, align 2, !dbg !119 + %607 = xor i32 %601, 96, !dbg !119 + %608 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %607, !dbg !119 + store bfloat %593, ptr addrspace(3) %608, align 2, !dbg !119 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119 + %609 = shl nuw nsw i32 %23, 2, !dbg !119 + %610 = lshr exact i32 %16, 1, !dbg !119 + %611 = shl nuw nsw i32 %14, 3, !dbg !119 + %612 = and i32 %611, 8, !dbg !119 + %613 = and i32 %14, 2, !dbg !119 + %614 = or disjoint i32 %612, %613, !dbg !119 + %615 = xor i32 %609, %610, !dbg !119 + %616 = or disjoint i32 %614, %615, !dbg !119 + %617 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %616, !dbg !119 + %618 = load bfloat, ptr addrspace(3) %617, align 2, !dbg !119 + %619 = getelementptr inbounds nuw i8, ptr addrspace(3) %617, i32 4, !dbg !119 + %620 = load bfloat, ptr addrspace(3) %619, align 2, !dbg !119 + %621 = xor i32 %616, 576, !dbg !119 + %622 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %621, !dbg !119 + %623 = load bfloat, ptr addrspace(3) %622, align 2, !dbg !119 + %624 = getelementptr inbounds nuw i8, ptr addrspace(3) %622, i32 4, !dbg !119 + %625 = load bfloat, ptr addrspace(3) %624, align 2, !dbg !119 + %626 = insertelement <2 x bfloat> poison, bfloat %618, i64 0, !dbg !119 + %627 = insertelement <2 x bfloat> %626, bfloat %623, i64 1, !dbg !119 + %628 = bitcast <2 x bfloat> %627 to i32, !dbg !119 + %629 = insertelement <2 x bfloat> poison, bfloat %620, i64 0, !dbg !119 + %630 = insertelement <2 x bfloat> %629, bfloat %625, i64 1, !dbg !119 + %631 = bitcast <2 x bfloat> %630 to i32, !dbg !119 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %628, i32 %631, ptr addrspace(1) %589, i1 true) #6, !dbg !119 + %632 = getelementptr bfloat, ptr addrspace(1) %1, i64 %588, !dbg !120 + %633 = fptrunc float %582 to bfloat, !dbg !121 + %634 = fptrunc float %583 to bfloat, !dbg !121 + %635 = fptrunc float %584 to bfloat, !dbg !121 + %636 = fptrunc float %585 to bfloat, !dbg !121 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !121 + store bfloat %633, ptr addrspace(3) %602, align 2, !dbg !121 + store bfloat %634, ptr addrspace(3) %604, align 2, !dbg !121 + store bfloat %635, ptr addrspace(3) %606, align 2, !dbg !121 + store bfloat %636, ptr addrspace(3) %608, align 2, !dbg !121 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !121 + %637 = load bfloat, ptr addrspace(3) %617, align 2, !dbg !121 + %638 = load bfloat, ptr addrspace(3) %619, align 2, !dbg !121 + %639 = load bfloat, ptr addrspace(3) %622, align 2, !dbg !121 + %640 = load bfloat, ptr addrspace(3) %624, align 2, !dbg !121 + %641 = insertelement <2 x bfloat> poison, bfloat %637, i64 0, !dbg !121 + %642 = insertelement <2 x bfloat> %641, bfloat %639, i64 1, !dbg !121 + %643 = bitcast <2 x bfloat> %642 to i32, !dbg !121 + %644 = insertelement <2 x bfloat> poison, bfloat %638, i64 0, !dbg !121 + %645 = insertelement <2 x bfloat> %644, bfloat %640, i64 1, !dbg !121 + %646 = bitcast <2 x bfloat> %645 to i32, !dbg !121 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %643, i32 %646, ptr addrspace(1) %632, i1 true) #6, !dbg !121 + ret void, !dbg !122 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", linkageName: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 23, column: 33, scope: !5) +!10 = !DILocation(line: 24, column: 44, scope: !5) +!11 = !DILocation(line: 24, column: 23, scope: !5) +!12 = !DILocation(line: 26, column: 37, scope: !5) +!13 = !DILocation(line: 29, column: 19, scope: !5) +!14 = !DILocation(line: 28, column: 19, scope: !5) +!15 = !DILocation(line: 39, column: 41, scope: !5) +!16 = !DILocation(line: 39, column: 52, scope: !5) +!17 = !DILocation(line: 39, column: 48, scope: !5) +!18 = !DILocation(line: 39, column: 63, scope: !5) +!19 = !DILocation(line: 39, column: 57, scope: !5) +!20 = !DILocation(line: 39, column: 34, scope: !5) +!21 = !DILocation(line: 39, column: 68, scope: !5) +!22 = !DILocation(line: 39, column: 121, scope: !5) +!23 = !DILocation(line: 40, column: 41, scope: !5) +!24 = !DILocation(line: 40, column: 50, scope: !5) +!25 = !DILocation(line: 40, column: 34, scope: !5) +!26 = !DILocation(line: 40, column: 61, scope: !5) +!27 = !DILocation(line: 40, column: 114, scope: !5) +!28 = !DILocation(line: 42, column: 22, scope: !5) +!29 = !DILocation(line: 47, column: 22, scope: !5) +!30 = !DILocation(line: 263, column: 15, scope: !31, inlinedAt: !33) +!31 = distinct !DILexicalBlockFile(scope: !5, file: !32, discriminator: 0) +!32 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!33 = !DILocation(line: 293, column: 36, scope: !31, inlinedAt: !34) +!34 = !DILocation(line: 51, column: 25, scope: !35) +!35 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!36 = !DILocation(line: 263, column: 15, scope: !31, inlinedAt: !37) +!37 = !DILocation(line: 293, column: 36, scope: !31, inlinedAt: !38) +!38 = !DILocation(line: 52, column: 27, scope: !35) +!39 = !DILocation(line: 58, column: 27, scope: !5) +!40 = !DILocation(line: 62, column: 35, scope: !5) +!41 = !DILocation(line: 62, column: 42, scope: !5) +!42 = !DILocation(line: 62, column: 95, scope: !5) +!43 = !DILocation(line: 63, column: 46, scope: !5) +!44 = !DILocation(line: 63, column: 42, scope: !5) +!45 = !DILocation(line: 63, column: 35, scope: !5) +!46 = !DILocation(line: 63, column: 51, scope: !5) +!47 = !DILocation(line: 64, column: 35, scope: !5) +!48 = !DILocation(line: 64, column: 51, scope: !5) +!49 = !DILocation(line: 65, column: 69, scope: !5) +!50 = !DILocation(line: 66, column: 36, scope: !5) +!51 = !DILocation(line: 66, column: 43, scope: !5) +!52 = !DILocation(line: 71, column: 24, scope: !5) +!53 = !DILocation(line: 72, column: 41, scope: !5) +!54 = !DILocation(line: 72, column: 39, scope: !5) +!55 = !DILocation(line: 72, column: 48, scope: !5) +!56 = !DILocation(line: 72, column: 57, scope: !5) +!57 = !DILocation(line: 72, column: 35, scope: !5) +!58 = !DILocation(line: 72, column: 68, scope: !5) +!59 = !DILocation(line: 75, column: 25, scope: !5) +!60 = !DILocation(line: 77, column: 24, scope: !5) +!61 = !DILocation(line: 78, column: 32, scope: !5) +!62 = !DILocation(line: 79, column: 24, scope: !5) +!63 = !DILocation(line: 80, column: 35, scope: !5) +!64 = !DILocation(line: 80, column: 85, scope: !5) +!65 = !DILocation(line: 87, column: 25, scope: !5) +!66 = !DILocation(line: 90, column: 53, scope: !5) +!67 = !DILocation(line: 90, column: 35, scope: !5) +!68 = !DILocation(line: 90, column: 64, scope: !5) +!69 = !DILocation(line: 98, column: 35, scope: !5) +!70 = !DILocation(line: 98, column: 81, scope: !5) +!71 = !DILocation(line: 111, column: 24, scope: !5) +!72 = !DILocation(line: 113, column: 24, scope: !5) +!73 = !DILocation(line: 116, column: 24, scope: !5) +!74 = !DILocation(line: 121, column: 51, scope: !5) +!75 = !DILocation(line: 121, column: 60, scope: !5) +!76 = !DILocation(line: 121, column: 35, scope: !5) +!77 = !DILocation(line: 121, column: 71, scope: !5) +!78 = !DILocation(line: 123, column: 24, scope: !5) +!79 = !DILocation(line: 124, column: 24, scope: !5) +!80 = !DILocation(line: 125, column: 32, scope: !5) +!81 = !DILocation(line: 121, column: 132, scope: !5) +!82 = !DILocation(line: 72, column: 129, scope: !5) +!83 = !DILocation(line: 80, column: 146, scope: !5) +!84 = !DILocation(line: 82, column: 24, scope: !5) +!85 = !DILocation(line: 84, column: 17, scope: !5) +!86 = !DILocation(line: 90, column: 125, scope: !5) +!87 = !DILocation(line: 97, column: 24, scope: !5) +!88 = !DILocation(line: 98, column: 142, scope: !5) +!89 = !DILocation(line: 100, column: 24, scope: !5) +!90 = !DILocation(line: 0, scope: !5) +!91 = !DILocation(line: 118, column: 24, scope: !5) +!92 = !DILocation(line: 119, column: 24, scope: !5) +!93 = !DILocation(line: 66, column: 96, scope: !5) +!94 = !DILocation(line: 65, column: 123, scope: !5) +!95 = !DILocation(line: 126, column: 24, scope: !5) +!96 = !DILocation(line: 127, column: 35, scope: !5) +!97 = !DILocation(line: 127, column: 85, scope: !5) +!98 = !DILocation(line: 127, column: 146, scope: !5) +!99 = !DILocation(line: 129, column: 24, scope: !5) +!100 = !DILocation(line: 131, column: 17, scope: !5) +!101 = !DILocation(line: 134, column: 51, scope: !5) +!102 = !DILocation(line: 134, column: 60, scope: !5) +!103 = !DILocation(line: 134, column: 35, scope: !5) +!104 = !DILocation(line: 134, column: 71, scope: !5) +!105 = !DILocation(line: 134, column: 132, scope: !5) +!106 = !DILocation(line: 139, column: 24, scope: !5) +!107 = !DILocation(line: 140, column: 35, scope: !5) +!108 = !DILocation(line: 140, column: 81, scope: !5) +!109 = !DILocation(line: 140, column: 142, scope: !5) +!110 = !DILocation(line: 142, column: 24, scope: !5) +!111 = !DILocation(line: 151, column: 25, scope: !5) +!112 = !DILocation(line: 156, column: 26, scope: !5) +!113 = !DILocation(line: 153, column: 26, scope: !5) +!114 = !DILocation(line: 158, column: 26, scope: !5) +!115 = !DILocation(line: 159, column: 26, scope: !5) +!116 = !DILocation(line: 161, column: 43, scope: !5) +!117 = !DILocation(line: 161, column: 39, scope: !5) +!118 = !DILocation(line: 161, column: 32, scope: !5) +!119 = !DILocation(line: 161, column: 55, scope: !5) +!120 = !DILocation(line: 162, column: 32, scope: !5) +!121 = !DILocation(line: 162, column: 56, scope: !5) +!122 = !DILocation(line: 53, column: 4, scope: !5) diff --git a/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..16e9f5aa3056e5a206c9254072fb8f955811fff4 --- /dev/null +++ b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx @@ -0,0 +1,1422 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 // -- Begin function triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 +.visible .entry triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0( + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6, + .param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_7, + .param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_8, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_9, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_10 +) +.reqntid 128 +{ + .reg .pred %p<4>; + .reg .b16 %rs<64>; + .reg .b32 %r<320>; + .reg .b64 %rd<96>; + .loc 1 18 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0 + +// %bb.0: // %__nv_rsqrtf.exit + ld.param.b64 %rd80, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0]; + ld.param.b64 %rd81, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:28 + mov.u32 %r20, %ctaid.x; + .loc 1 23 33 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:33 + shl.b32 %r21, %r20, 2; + ld.param.b64 %rd82, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2]; + ld.param.b64 %rd83, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3]; + .loc 1 24 44 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44 + mov.u32 %r22, %tid.x; + and.b32 %r23, %r22, 127; + ld.param.b64 %rd84, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4]; + and.b32 %r24, %r22, 96; + ld.param.b64 %rd85, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5]; + bfe.u32 %r25, %r22, 5, 2; + ld.param.b64 %rd86, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6]; + and.b32 %r26, %r22, 3; + .loc 1 24 23 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23 + or.b32 %r27, %r25, %r21; + or.b32 %r28, %r21, %r26; + .loc 1 26 37 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37 + shl.b32 %r29, %r22, 2; + and.b32 %r30, %r29, 124; + and.b32 %r31, %r22, 124; + shr.u32 %r32, %r22, 2; + .loc 1 29 19 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19 + bfe.s32 %r33, %r20, 29, 1; + shr.u32 %r34, %r33, 27; + add.s32 %r35, %r27, %r34; + shr.s32 %r36, %r35, 5; + .loc 1 28 19 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:28:19 + and.b32 %r37, %r35, 33554400; + sub.s32 %r38, %r27, %r37; + .loc 1 29 19 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19 + add.s32 %r39, %r28, %r34; + .loc 1 39 52 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:52 + shl.b32 %r40, %r38, 7; + .loc 1 39 48 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:48 + or.b32 %r41, %r40, %r30; + mad.lo.s32 %r42, %r36, 36864, %r41; + .loc 1 39 57 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:57 + add.s32 %r43, %r42, 4096; + .loc 1 39 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34 + mad.wide.s32 %rd1, %r43, 2, %rd82; + .loc 1 39 68 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r3, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r1, %r3; + mov.u32 %r2, %r3; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2; + // end inline asm + mov.b32 {%rs36, %rs37}, %r1; + mov.b32 {%rs38, %rs39}, %r2; + .loc 1 39 121 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:121 + cvt.f32.bf16 %r44, %rs36; + cvt.f32.bf16 %r45, %rs37; + cvt.f32.bf16 %r46, %rs38; + cvt.f32.bf16 %r47, %rs39; + .loc 1 40 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34 + mad.wide.s32 %rd3, %r42, 2, %rd82; + .loc 1 40 61 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61 + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r4, %r3; + mov.u32 %r5, %r3; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r4, %r5 }, [ %rd3 + 0 ], %rd4; + // end inline asm + mov.b32 {%rs40, %rs41}, %r4; + mov.b32 {%rs42, %rs43}, %r5; + .loc 1 40 114 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114 + cvt.f32.bf16 %r48, %rs40; + cvt.f32.bf16 %r49, %rs41; + cvt.f32.bf16 %r50, %rs42; + cvt.f32.bf16 %r51, %rs43; + .loc 1 42 22 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:42:22 + mul.f32 %r52, %r45, %r45; + .loc 1 47 22 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:47:22 + mul.f32 %r53, %r49, %r49; +$L__tmp1: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + fma.rn.f32 %r54, %r44, %r44, %r52; + fma.rn.f32 %r55, %r46, %r46, %r54; + fma.rn.f32 %r56, %r47, %r47, %r55; +$L__tmp2: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r57, %r56, 16, 31, -1; +$L__tmp3: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r58, %r56, %r57; +$L__tmp4: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r59, %r58, 8, 31, -1; +$L__tmp5: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r60, %r58, %r59; +$L__tmp6: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r61, %r60, 4, 31, -1; +$L__tmp7: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r62, %r60, %r61; +$L__tmp8: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r63, %r62, 2, 31, -1; +$L__tmp9: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r64, %r62, %r63; +$L__tmp10: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r65, %r64, 1, 31, -1; +$L__tmp11: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r66, %r64, %r65; +$L__tmp12: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + fma.rn.f32 %r67, %r48, %r48, %r53; + fma.rn.f32 %r68, %r50, %r50, %r67; + fma.rn.f32 %r69, %r51, %r51, %r68; +$L__tmp13: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r70, %r69, 16, 31, -1; +$L__tmp14: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r71, %r69, %r70; +$L__tmp15: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r72, %r71, 8, 31, -1; +$L__tmp16: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r73, %r71, %r72; +$L__tmp17: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r74, %r73, 4, 31, -1; +$L__tmp18: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r75, %r73, %r74; +$L__tmp19: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r76, %r75, 2, 31, -1; +$L__tmp20: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r77, %r75, %r76; +$L__tmp21: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r78, %r77, 1, 31, -1; +$L__tmp22: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r79, %r77, %r78; +$L__tmp23: + .loc 1 62 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:35 + mul.wide.u32 %rd87, %r23, 2; + add.s64 %rd5, %rd83, %rd87; + .loc 1 62 42 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:42 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0; + // end inline asm + mov.b16 %rs2, 0; + // begin inline asm + mov.u16 %rs1, %rs2; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs1 }, [ %rd5 + 0 ], %rd6; + // end inline asm + .loc 1 62 95 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:95 + cvt.f32.bf16 %r80, %rs1; + .loc 1 63 46 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:46 + shl.b32 %r81, %r36, 7; + .loc 1 63 42 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:42 + or.b32 %r82, %r81, %r30; + .loc 1 63 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:35 + mul.wide.s32 %rd88, %r82, 4; + add.s64 %rd7, %rd84, %rd88; + .loc 1 63 51 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:51 + // begin inline asm + mov.u64 %rd8, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd8, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r6, %r3; + mov.u32 %r7, %r3; + mov.u32 %r8, %r3; + mov.u32 %r9, %r3; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd7 + 0 ], %rd8; + // end inline asm + and.b32 %r83, %r22, 7; + shl.b32 %r84, %r83, 4; + shl.b32 %r85, %r24, 2; + shr.u32 %r86, %r22, 1; + and.b32 %r87, %r86, 12; + or.b32 %r88, %r84, %r85; + or.b32 %r89, %r87, %r24; + xor.b32 %r90, %r88, %r89; + mov.b32 %r91, global_smem; + add.s32 %r92, %r91, %r90; + st.shared.b32 [%r92], %r6; + xor.b32 %r93, %r90, 4; + add.s32 %r94, %r91, %r93; + st.shared.b32 [%r94+512], %r7; + xor.b32 %r95, %r90, 8; + add.s32 %r96, %r91, %r95; + st.shared.b32 [%r96+1024], %r8; + xor.b32 %r97, %r90, 12; + add.s32 %r98, %r91, %r97; + st.shared.b32 [%r98+1536], %r9; + bar.sync 0; + shl.b32 %r99, %r22, 7; + and.b32 %r100, %r99, 1920; + shl.b32 %r101, %r26, 5; + xor.b32 %r102, %r101, %r31; + or.b32 %r103, %r102, %r100; + add.s32 %r104, %r91, %r103; + ld.shared.b32 %r105, [%r104]; + xor.b32 %r106, %r103, 4; + add.s32 %r107, %r91, %r106; + ld.shared.b32 %r108, [%r107]; + xor.b32 %r109, %r103, 8; + add.s32 %r110, %r91, %r109; + ld.shared.b32 %r111, [%r110]; + xor.b32 %r112, %r103, 12; + add.s32 %r113, %r91, %r112; + ld.shared.b32 %r114, [%r113]; + .loc 1 64 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:35 + add.s64 %rd9, %rd85, %rd88; + .loc 1 64 51 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:51 + // begin inline asm + mov.u64 %rd10, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd10, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r10, %r3; + mov.u32 %r11, %r3; + mov.u32 %r12, %r3; + mov.u32 %r13, %r3; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd9 + 0 ], %rd10; + // end inline asm + bar.sync 0; + st.shared.b32 [%r92], %r10; + st.shared.b32 [%r94+512], %r11; + st.shared.b32 [%r96+1024], %r12; + st.shared.b32 [%r98+1536], %r13; + bar.sync 0; + ld.shared.b32 %r115, [%r104]; + ld.shared.b32 %r116, [%r107]; + ld.shared.b32 %r117, [%r110]; + ld.shared.b32 %r118, [%r113]; + .loc 1 65 69 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69 + // begin inline asm + mov.u64 %rd11, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd11, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r14, %r3; + mov.u32 %r15, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r14, %r15 }, [ %rd1 + 0 ], %rd11; + // end inline asm + .loc 1 66 36 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:36 + add.s64 %rd12, %rd86, %rd87; + .loc 1 66 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:43 + // begin inline asm + mov.u64 %rd13, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd13, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs3, %rs2; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs3 }, [ %rd12 + 0 ], %rd13; + // end inline asm + .loc 1 71 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:71:24 + and.b32 %r119, %r32, 1; + setp.ne.b32 %p3, %r119, 0; + not.pred %p2, %p3; + .loc 1 72 41 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:41 + and.b32 %r120, %r32, 30; + .loc 1 72 48 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:48 + shl.b32 %r121, %r28, 7; + shl.b32 %r122, %r39, 10; + and.b32 %r123, %r122, -32768; + add.s32 %r124, %r123, %r121; + .loc 1 72 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35 + cvt.s64.s32 %rd89, %r124; + cvt.u64.u32 %rd90, %r120; + or.b64 %rd91, %rd89, %rd90; + shl.b64 %rd92, %rd91, 1; + add.s64 %rd93, %rd82, %rd92; + add.s64 %rd14, %rd93, 2; + add.s64 %rd16, %rd93, 66; + add.s64 %rd18, %rd93, 130; + add.s64 %rd20, %rd93, 194; + .loc 1 72 68 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:68 + // begin inline asm + mov.u64 %rd15, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd15, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs4, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs4 }, [ %rd14 + 0 ], %rd15; + // end inline asm + // begin inline asm + mov.u64 %rd17, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs5, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs5 }, [ %rd16 + 0 ], %rd17; + // end inline asm + // begin inline asm + mov.u64 %rd19, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd19, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs6, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs6 }, [ %rd18 + 0 ], %rd19; + // end inline asm + // begin inline asm + mov.u64 %rd21, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd21, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs7, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs7 }, [ %rd20 + 0 ], %rd21; + // end inline asm + mov.b32 %r125, 0f43000000; + .loc 1 75 25 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:75:25 + div.full.f32 %r126, %r79, %r125; + .loc 1 77 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:77:24 + add.f32 %r127, %r126, 0f358637BD; + .loc 1 78 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:78:32 + rsqrt.approx.ftz.f32 %r128, %r127; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + bar.sync 0; + shr.u32 %r129, %r24, 3; + add.s32 %r130, %r91, %r129; + st.shared.b32 [%r130], %r128; + bar.sync 0; + shl.b32 %r131, %r26, 2; + add.s32 %r132, %r91, %r131; + ld.shared.b32 %r133, [%r132]; + .loc 1 80 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:35 + mul.wide.u32 %rd94, %r120, 2; + add.s64 %rd38, %rd83, %rd94; + add.s64 %rd22, %rd38, 2; + add.s64 %rd24, %rd38, 66; + add.s64 %rd26, %rd38, 130; + add.s64 %rd28, %rd38, 194; + .loc 1 80 85 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:85 + // begin inline asm + mov.u64 %rd23, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd23, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs8, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs8 }, [ %rd22 + 0 ], %rd23; + // end inline asm + // begin inline asm + mov.u64 %rd25, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd25, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs9, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd24 + 0 ], %rd25; + // end inline asm + // begin inline asm + mov.u64 %rd27, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd27, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs10, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs10 }, [ %rd26 + 0 ], %rd27; + // end inline asm + // begin inline asm + mov.u64 %rd29, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd29, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs11, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd28 + 0 ], %rd29; + // end inline asm + .loc 1 90 53 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:53 + or.b32 %r134, %r124, %r120; + .loc 1 90 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:35 + mad.wide.s32 %rd30, %r134, 2, %rd82; + add.s64 %rd32, %rd93, 64; + add.s64 %rd34, %rd93, 128; + add.s64 %rd36, %rd93, 192; + .loc 1 90 64 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:64 + // begin inline asm + mov.u64 %rd31, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd31, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs12, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd30 + 0 ], %rd31; + // end inline asm + // begin inline asm + mov.u64 %rd33, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd33, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs13, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd32 + 0 ], %rd33; + // end inline asm + // begin inline asm + mov.u64 %rd35, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd35, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs14, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs14 }, [ %rd34 + 0 ], %rd35; + // end inline asm + // begin inline asm + mov.u64 %rd37, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd37, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs15, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd36 + 0 ], %rd37; + // end inline asm + .loc 1 98 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:35 + add.s64 %rd40, %rd38, 64; + add.s64 %rd42, %rd38, 128; + add.s64 %rd44, %rd38, 192; + .loc 1 98 81 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:81 + // begin inline asm + mov.u64 %rd39, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd39, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs16, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs16 }, [ %rd38 + 0 ], %rd39; + // end inline asm + // begin inline asm + mov.u64 %rd41, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd41, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs17, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd40 + 0 ], %rd41; + // end inline asm + // begin inline asm + mov.u64 %rd43, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd43, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs18, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs18 }, [ %rd42 + 0 ], %rd43; + // end inline asm + // begin inline asm + mov.u64 %rd45, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd45, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs19, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd44 + 0 ], %rd45; + // end inline asm + .loc 1 111 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:111:24 + mul.f32 %r135, %r128, %r48; + mul.f32 %r136, %r128, %r49; + mul.f32 %r137, %r128, %r50; + mul.f32 %r138, %r128, %r51; + .loc 1 113 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:113:24 + bar.sync 0; + mad.lo.s32 %r139, %r26, 124, %r132; + add.s32 %r140, %r139, %r102; + st.shared.b32 [%r140], %r80; + bar.sync 0; + add.s32 %r141, %r91, %r30; + ld.shared.b32 %r142, [%r141]; + xor.b32 %r143, %r30, 32; + add.s32 %r144, %r91, %r143; + ld.shared.b32 %r145, [%r144+128]; + xor.b32 %r146, %r30, 64; + add.s32 %r147, %r91, %r146; + ld.shared.b32 %r148, [%r147+256]; + xor.b32 %r149, %r30, 96; + add.s32 %r150, %r91, %r149; + ld.shared.b32 %r151, [%r150+384]; + mul.f32 %r152, %r135, %r142; + mul.f32 %r153, %r136, %r145; + mul.f32 %r154, %r137, %r148; + mul.f32 %r155, %r138, %r151; + .loc 1 116 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:116:24 + mul.f32 %r156, %r152, %r6; + mul.f32 %r157, %r153, %r7; + mul.f32 %r158, %r154, %r8; + mul.f32 %r159, %r155, %r9; + bar.sync 0; + st.shared.b32 [%r92], %r156; + st.shared.b32 [%r94+512], %r157; + st.shared.b32 [%r96+1024], %r158; + st.shared.b32 [%r98+1536], %r159; + bar.sync 0; + ld.shared.b32 %r160, [%r104]; + ld.shared.b32 %r161, [%r107]; + ld.shared.b32 %r162, [%r110]; + ld.shared.b32 %r163, [%r113]; + .loc 1 121 60 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:60 + add.s32 %r164, %r134, 4097; + add.s32 %r165, %r134, 4129; + add.s32 %r166, %r134, 4161; + add.s32 %r167, %r134, 4193; + .loc 1 121 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:35 + mad.wide.s32 %rd46, %r164, 2, %rd82; + mad.wide.s32 %rd48, %r165, 2, %rd82; + mad.wide.s32 %rd50, %r166, 2, %rd82; + mad.wide.s32 %rd52, %r167, 2, %rd82; + .loc 1 121 71 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:71 + // begin inline asm + mov.u64 %rd47, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd47, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs20, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs20 }, [ %rd46 + 0 ], %rd47; + // end inline asm + // begin inline asm + mov.u64 %rd49, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd49, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs21, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs21 }, [ %rd48 + 0 ], %rd49; + // end inline asm + // begin inline asm + mov.u64 %rd51, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd51, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs22, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs22 }, [ %rd50 + 0 ], %rd51; + // end inline asm + // begin inline asm + mov.u64 %rd53, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd53, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs23, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs23 }, [ %rd52 + 0 ], %rd53; + // end inline asm + .loc 1 123 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:123:24 + div.full.f32 %r168, %r66, %r125; + .loc 1 124 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:124:24 + add.f32 %r169, %r168, 0f358637BD; + .loc 1 125 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:125:32 + rsqrt.approx.ftz.f32 %r170, %r169; + .loc 1 121 132 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:132 + cvt.f32.bf16 %r171, %rs23; + cvt.f32.bf16 %r172, %rs22; + cvt.f32.bf16 %r173, %rs21; + cvt.f32.bf16 %r174, %rs20; + .loc 1 72 129 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129 + cvt.f32.bf16 %r175, %rs7; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + mul.f32 %r176, %r133, %r175; + .loc 1 80 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146 + cvt.f32.bf16 %r177, %rs11; + .loc 1 84 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17 + neg.f32 %r178, %r176; + fma.rn.f32 %r179, %r178, %r177, 0f00000000; + .loc 1 90 125 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125 + cvt.f32.bf16 %r180, %rs15; + .loc 1 97 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24 + mul.f32 %r181, %r133, %r180; + .loc 1 98 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142 + cvt.f32.bf16 %r182, %rs19; + .loc 1 100 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24 + mul.f32 %r183, %r181, %r182; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r184, %r183, %r179, %p3; + .loc 1 119 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24 + fma.rn.f32 %r185, %r118, %r184, %r163; + .loc 1 72 129 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129 + cvt.f32.bf16 %r186, %rs6; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + mul.f32 %r187, %r133, %r186; + .loc 1 80 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146 + cvt.f32.bf16 %r188, %rs10; + .loc 1 84 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17 + neg.f32 %r189, %r187; + fma.rn.f32 %r190, %r189, %r188, 0f00000000; + .loc 1 90 125 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125 + cvt.f32.bf16 %r191, %rs14; + .loc 1 97 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24 + mul.f32 %r192, %r133, %r191; + .loc 1 98 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142 + cvt.f32.bf16 %r193, %rs18; + .loc 1 100 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24 + mul.f32 %r194, %r192, %r193; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r195, %r194, %r190, %p3; + .loc 1 119 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24 + fma.rn.f32 %r196, %r117, %r195, %r162; + .loc 1 72 129 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129 + cvt.f32.bf16 %r197, %rs5; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + mul.f32 %r198, %r133, %r197; + .loc 1 80 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146 + cvt.f32.bf16 %r199, %rs9; + .loc 1 84 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17 + neg.f32 %r200, %r198; + fma.rn.f32 %r201, %r200, %r199, 0f00000000; + .loc 1 90 125 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125 + cvt.f32.bf16 %r202, %rs13; + .loc 1 97 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24 + mul.f32 %r203, %r133, %r202; + .loc 1 98 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142 + cvt.f32.bf16 %r204, %rs17; + .loc 1 100 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24 + mul.f32 %r205, %r203, %r204; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r206, %r205, %r201, %p3; + .loc 1 119 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24 + fma.rn.f32 %r207, %r116, %r206, %r161; + .loc 1 72 129 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129 + cvt.f32.bf16 %r208, %rs4; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + mul.f32 %r209, %r133, %r208; + .loc 1 80 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146 + cvt.f32.bf16 %r210, %rs8; + .loc 1 84 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17 + neg.f32 %r211, %r209; + fma.rn.f32 %r212, %r211, %r210, 0f00000000; + .loc 1 90 125 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125 + cvt.f32.bf16 %r213, %rs12; + .loc 1 97 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24 + mul.f32 %r214, %r133, %r213; + .loc 1 98 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142 + cvt.f32.bf16 %r215, %rs16; + .loc 1 100 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24 + mul.f32 %r216, %r214, %r215; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r217, %r216, %r212, %p3; + .loc 1 119 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24 + fma.rn.f32 %r218, %r115, %r217, %r160; + .loc 1 66 96 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:96 + cvt.f32.bf16 %r219, %rs3; + .loc 1 65 69 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69 + mov.b32 {%rs44, %rs45}, %r15; + .loc 1 65 123 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:123 + cvt.f32.bf16 %r220, %rs45; + cvt.f32.bf16 %r221, %rs44; + .loc 1 65 69 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69 + mov.b32 {%rs46, %rs47}, %r14; + .loc 1 65 123 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:123 + cvt.f32.bf16 %r222, %rs47; + cvt.f32.bf16 %r223, %rs46; + .loc 1 126 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24 + bar.sync 0; + st.shared.b32 [%r130], %r170; + bar.sync 0; + ld.shared.b32 %r224, [%r132]; + mul.f32 %r225, %r224, %r174; + mul.f32 %r226, %r224, %r173; + mul.f32 %r227, %r224, %r172; + mul.f32 %r228, %r224, %r171; + .loc 1 127 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:35 + add.s64 %rd70, %rd86, %rd94; + add.s64 %rd54, %rd70, 2; + add.s64 %rd56, %rd70, 66; + add.s64 %rd58, %rd70, 130; + add.s64 %rd60, %rd70, 194; + .loc 1 127 85 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:85 + // begin inline asm + mov.u64 %rd55, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd55, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs24, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs24 }, [ %rd54 + 0 ], %rd55; + // end inline asm + // begin inline asm + mov.u64 %rd57, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd57, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs25, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs25 }, [ %rd56 + 0 ], %rd57; + // end inline asm + // begin inline asm + mov.u64 %rd59, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd59, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs26, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs26 }, [ %rd58 + 0 ], %rd59; + // end inline asm + // begin inline asm + mov.u64 %rd61, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd61, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs27, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs27 }, [ %rd60 + 0 ], %rd61; + // end inline asm + .loc 1 127 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:146 + cvt.f32.bf16 %r229, %rs24; + cvt.f32.bf16 %r230, %rs25; + cvt.f32.bf16 %r231, %rs26; + cvt.f32.bf16 %r232, %rs27; + .loc 1 131 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:131:17 + neg.f32 %r233, %r225; + fma.rn.f32 %r234, %r233, %r229, 0f00000000; + neg.f32 %r235, %r226; + fma.rn.f32 %r236, %r235, %r230, 0f00000000; + neg.f32 %r237, %r227; + fma.rn.f32 %r238, %r237, %r231, 0f00000000; + neg.f32 %r239, %r228; + fma.rn.f32 %r240, %r239, %r232, 0f00000000; + .loc 1 134 60 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:60 + add.s32 %r241, %r134, 4096; + add.s32 %r242, %r134, 4128; + add.s32 %r243, %r134, 4160; + add.s32 %r244, %r134, 4192; + .loc 1 134 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:35 + mad.wide.s32 %rd62, %r241, 2, %rd82; + mad.wide.s32 %rd64, %r242, 2, %rd82; + mad.wide.s32 %rd66, %r243, 2, %rd82; + mad.wide.s32 %rd68, %r244, 2, %rd82; + .loc 1 134 71 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:71 + // begin inline asm + mov.u64 %rd63, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd63, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs28, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs28 }, [ %rd62 + 0 ], %rd63; + // end inline asm + // begin inline asm + mov.u64 %rd65, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd65, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs29, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs29 }, [ %rd64 + 0 ], %rd65; + // end inline asm + // begin inline asm + mov.u64 %rd67, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd67, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs30, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs30 }, [ %rd66 + 0 ], %rd67; + // end inline asm + // begin inline asm + mov.u64 %rd69, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd69, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs31, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs31 }, [ %rd68 + 0 ], %rd69; + // end inline asm + .loc 1 134 132 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:132 + cvt.f32.bf16 %r245, %rs28; + cvt.f32.bf16 %r246, %rs29; + cvt.f32.bf16 %r247, %rs30; + cvt.f32.bf16 %r248, %rs31; + .loc 1 139 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:139:24 + mul.f32 %r249, %r224, %r245; + mul.f32 %r250, %r224, %r246; + mul.f32 %r251, %r224, %r247; + mul.f32 %r252, %r224, %r248; + .loc 1 140 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:35 + add.s64 %rd72, %rd70, 64; + add.s64 %rd74, %rd70, 128; + add.s64 %rd76, %rd70, 192; + .loc 1 140 81 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:81 + // begin inline asm + mov.u64 %rd71, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd71, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs32, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs32 }, [ %rd70 + 0 ], %rd71; + // end inline asm + // begin inline asm + mov.u64 %rd73, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd73, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs33, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs33 }, [ %rd72 + 0 ], %rd73; + // end inline asm + // begin inline asm + mov.u64 %rd75, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd75, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs34, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs34 }, [ %rd74 + 0 ], %rd75; + // end inline asm + // begin inline asm + mov.u64 %rd77, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd77, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs35, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs35 }, [ %rd76 + 0 ], %rd77; + // end inline asm + .loc 1 140 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:142 + cvt.f32.bf16 %r253, %rs32; + cvt.f32.bf16 %r254, %rs33; + cvt.f32.bf16 %r255, %rs34; + cvt.f32.bf16 %r256, %rs35; + .loc 1 142 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:142:24 + mul.f32 %r257, %r249, %r253; + mul.f32 %r258, %r250, %r254; + mul.f32 %r259, %r251, %r255; + mul.f32 %r260, %r252, %r256; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r261, %r257, %r234, %p3; + selp.f32 %r262, %r258, %r236, %p3; + selp.f32 %r263, %r259, %r238, %p3; + selp.f32 %r264, %r260, %r240, %p3; + .loc 1 151 25 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:151:25 + mul.f32 %r265, %r170, %r223; + mul.f32 %r266, %r170, %r222; + mul.f32 %r267, %r170, %r221; + mul.f32 %r268, %r170, %r220; + bar.sync 0; + st.shared.b32 [%r92], %r265; + st.shared.b32 [%r94+512], %r266; + st.shared.b32 [%r96+1024], %r267; + st.shared.b32 [%r98+1536], %r268; + bar.sync 0; + ld.shared.b32 %r269, [%r104]; + ld.shared.b32 %r270, [%r107]; + ld.shared.b32 %r271, [%r110]; + ld.shared.b32 %r272, [%r113]; + .loc 1 156 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26 + bar.sync 0; + shl.b32 %r273, %r23, 2; + add.s32 %r274, %r91, %r273; + st.shared.b32 [%r274], %r219; + bar.sync 0; + add.s32 %r275, %r91, %r31; + ld.shared.b32 %r276, [%r275]; + ld.shared.b32 %r277, [%r275+128]; + ld.shared.b32 %r278, [%r275+256]; + ld.shared.b32 %r279, [%r275+384]; + .loc 1 153 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:153:26 + mul.f32 %r280, %r269, %r276; + mul.f32 %r281, %r270, %r277; + mul.f32 %r282, %r271, %r278; + mul.f32 %r283, %r272, %r279; + .loc 1 156 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26 + mul.f32 %r284, %r105, %r280; + mul.f32 %r285, %r108, %r281; + mul.f32 %r286, %r111, %r282; + mul.f32 %r287, %r114, %r283; + .loc 1 159 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:159:26 + fma.rn.f32 %r288, %r115, %r261, %r284; + fma.rn.f32 %r289, %r116, %r262, %r285; + fma.rn.f32 %r290, %r117, %r263, %r286; + fma.rn.f32 %r291, %r118, %r264, %r287; + .loc 1 161 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:43 + shl.b32 %r292, %r27, 7; + .loc 1 161 39 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:39 + or.b32 %r293, %r292, %r30; + .loc 1 161 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:32 + mul.wide.s32 %rd95, %r293, 2; + add.s64 %rd78, %rd80, %rd95; + .loc 1 161 55 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:55 + cvt.rn.bf16.f32 %rs48, %r218; + cvt.rn.bf16.f32 %rs49, %r207; + cvt.rn.bf16.f32 %rs50, %r196; + cvt.rn.bf16.f32 %rs51, %r185; + bar.sync 0; + shl.b32 %r294, %r83, 7; + shr.u32 %r295, %r22, 4; + and.b32 %r296, %r295, 2; + and.b32 %r297, %r32, 16; + or.b32 %r298, %r294, %r296; + or.b32 %r299, %r84, %r87; + xor.b32 %r300, %r299, %r297; + or.b32 %r301, %r300, %r298; + add.s32 %r302, %r91, %r301; + st.shared.b16 [%r302], %rs48; + xor.b32 %r303, %r301, 32; + add.s32 %r304, %r91, %r303; + st.shared.b16 [%r304], %rs49; + xor.b32 %r305, %r301, 64; + add.s32 %r306, %r91, %r305; + st.shared.b16 [%r306], %rs50; + xor.b32 %r307, %r301, 96; + add.s32 %r308, %r91, %r307; + st.shared.b16 [%r308], %rs51; + bar.sync 0; + shl.b32 %r309, %r31, 2; + shr.u32 %r310, %r24, 1; + shl.b32 %r311, %r22, 3; + and.b32 %r312, %r311, 8; + and.b32 %r313, %r22, 2; + or.b32 %r314, %r312, %r313; + xor.b32 %r315, %r309, %r310; + or.b32 %r316, %r314, %r315; + add.s32 %r317, %r91, %r316; + ld.shared.b16 %rs52, [%r317]; + ld.shared.b16 %rs53, [%r317+4]; + xor.b32 %r318, %r316, 64; + add.s32 %r319, %r91, %r318; + ld.shared.b16 %rs54, [%r319+512]; + ld.shared.b16 %rs55, [%r319+516]; + mov.b32 %r16, {%rs52, %rs54}; + mov.b32 %r17, {%rs53, %rs55}; + // begin inline asm + @%p1 st.global.v2.b32 [ %rd78 + 0 ], { %r16, %r17 }; + // end inline asm + .loc 1 162 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:32 + add.s64 %rd79, %rd81, %rd95; + .loc 1 162 56 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:56 + cvt.rn.bf16.f32 %rs56, %r288; + cvt.rn.bf16.f32 %rs57, %r289; + cvt.rn.bf16.f32 %rs58, %r290; + cvt.rn.bf16.f32 %rs59, %r291; + bar.sync 0; + st.shared.b16 [%r302], %rs56; + st.shared.b16 [%r304], %rs57; + st.shared.b16 [%r306], %rs58; + st.shared.b16 [%r308], %rs59; + bar.sync 0; + ld.shared.b16 %rs60, [%r317]; + ld.shared.b16 %rs61, [%r317+4]; + ld.shared.b16 %rs62, [%r319+512]; + ld.shared.b16 %rs63, [%r319+516]; + mov.b32 %r18, {%rs60, %rs62}; + mov.b32 %r19, {%rs61, %rs63}; + // begin inline asm + @%p1 st.global.v2.b32 [ %rd79 + 0 ], { %r18, %r19 }; + // end inline asm + .loc 1 53 4 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:4 + ret; +$L__tmp24: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 456 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x1c1 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 98 +.b8 118 +.b8 113 +.b8 104 +.b8 106 +.b8 116 +.b8 121 +.b8 103 +.b8 55 +.b8 102 +.b8 118 +.b8 120 +.b8 122 +.b8 119 +.b8 116 +.b8 98 +.b8 116 +.b8 116 +.b8 52 +.b8 118 +.b8 114 +.b8 100 +.b8 107 +.b8 98 +.b8 110 +.b8 98 +.b8 54 +.b8 110 +.b8 51 +.b8 50 +.b8 102 +.b8 110 +.b8 114 +.b8 105 +.b8 106 +.b8 106 +.b8 112 +.b8 108 +.b8 51 +.b8 118 +.b8 118 +.b8 52 +.b8 99 +.b8 102 +.b8 113 +.b8 100 +.b8 52 +.b8 109 +.b8 122 +.b8 110 +.b8 114 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 98 +.b8 118 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x6d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 114 +.b8 109 +.b8 115 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 100 +.b8 100 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 110 +.b8 101 +.b8 103 +.b8 95 +.b8 115 +.b8 112 +.b8 108 +.b8 105 +.b8 116 +.b8 95 +.b8 115 +.b8 112 +.b8 108 +.b8 105 +.b8 116 +.b8 95 +.b8 119 +.b8 105 +.b8 116 +.b8 104 +.b8 95 +.b8 115 +.b8 105 +.b8 122 +.b8 101 +.b8 115 +.b8 95 +.b8 115 +.b8 116 +.b8 97 +.b8 99 +.b8 107 +.b8 95 +.b8 117 +.b8 110 +.b8 98 +.b8 105 +.b8 110 +.b8 100 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x151:0x7a DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x166:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 51 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x17e:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 4 // Abbrev [4] 0x198:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp12 // DW_AT_low_pc +.b64 $L__tmp23 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 52 // DW_AT_call_line +.b8 27 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x1b0:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp12 // DW_AT_low_pc +.b64 $L__tmp23 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source new file mode 100644 index 0000000000000000000000000000000000000000..10fae842eec251858f3c2a3d2e8882ca13ea4e71 --- /dev/null +++ b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source @@ -0,0 +1,972 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc213 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0) +#loc215 = loc(unknown) +#loc218 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0) +#loc222 = loc("in_out_ptr0"(#loc)) +#loc223 = loc("in_out_ptr1"(#loc)) +#loc224 = loc("in_ptr0"(#loc)) +#loc225 = loc("in_ptr1"(#loc)) +#loc226 = loc("in_ptr2"(#loc)) +#loc227 = loc("in_ptr3"(#loc)) +#loc228 = loc("in_ptr4"(#loc)) +#loc229 = loc("xnumel"(#loc)) +#loc230 = loc("r0_numel"(#loc)) +#loc432 = loc("input"(#loc213)) +#loc433 = loc("a"(#loc218)) +#loc434 = loc("b"(#loc218)) +module { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 73728 : i32 loc(#loc231) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc232) + %xoffset = tt.get_program_id x : i32 loc(#loc233) + %xoffset_2 = arith.constant 4 : i32 loc(#loc234) + %xoffset_3 = arith.constant 4 : i32 loc(#loc234) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc234) + %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc235) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc236) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<4x1xi32> loc(#loc237) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<4x1xi32> loc(#loc237) + %xmask = arith.constant true loc(#loc238) + %xmask_8 = arith.constant dense : tensor<4x128xi1> loc(#loc238) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc239) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc240) + %x0 = arith.constant 32 : i32 loc(#loc241) + %x0_10 = arith.constant 32 : i32 loc(#loc241) + %x0_11 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc241) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<4x1xi32> loc(#loc241) + %x1 = arith.constant 32 : i32 loc(#loc242) + %x1_13 = arith.constant 32 : i32 loc(#loc242) + %x1_14 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc242) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<4x1xi32> loc(#loc242) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc243) + %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc243) + %_tmp10 = arith.constant 0.000000e+00 : f32 loc(#loc244) + %_tmp10_17 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc244) + %c0_i32 = arith.constant 0 : i32 loc(#loc15) + %c128_i32 = arith.constant 128 : i32 loc(#loc15) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15) + %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc15) + %3 = ub.poison : i32 loc(#loc15) + %_tmp10_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_16, %_tmp10_24 = %_tmp10_17) -> (tensor<4x128xf32>, tensor<4x128xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc246) + %r0_index_25 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc246) + %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc247) + %r0_mask_26 = arith.cmpi slt, %r0_index_25, %r0_mask : tensor<1x128xi32> loc(#loc247) + %tmp0 = arith.constant 4096 : i32 loc(#loc248) + %tmp0_27 = arith.constant 4096 : i32 loc(#loc248) + %tmp0_28 = arith.constant dense<4096> : tensor<1x128xi32> loc(#loc248) + %tmp0_29 = arith.addi %tmp0_28, %r0_index_25 : tensor<1x128xi32> loc(#loc248) + %tmp0_30 = arith.constant 128 : i32 loc(#loc249) + %tmp0_31 = arith.constant 128 : i32 loc(#loc249) + %tmp0_32 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc249) + %tmp0_33 = arith.muli %tmp0_32, %x0_12 : tensor<4x1xi32> loc(#loc249) + %tmp0_34 = tt.broadcast %tmp0_29 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc250) + %tmp0_35 = tt.broadcast %tmp0_33 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc250) + %tmp0_36 = arith.addi %tmp0_34, %tmp0_35 : tensor<4x128xi32> loc(#loc250) + %tmp0_37 = arith.constant 36864 : i32 loc(#loc251) + %tmp0_38 = arith.constant 36864 : i32 loc(#loc251) + %tmp0_39 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc251) + %tmp0_40 = arith.muli %tmp0_39, %x1_15 : tensor<4x1xi32> loc(#loc251) + %tmp0_41 = tt.broadcast %tmp0_40 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc252) + %tmp0_42 = arith.addi %tmp0_36, %tmp0_41 : tensor<4x128xi32> loc(#loc252) + %tmp0_43 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc253) + %tmp0_44 = tt.addptr %tmp0_43, %tmp0_42 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc253) + %tmp0_45 = arith.constant 0.000000e+00 : f32 loc(#loc254) + %tmp0_46 = tt.broadcast %r0_mask_26 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc254) + %tmp0_47 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc254) + %tmp0_48 = arith.truncf %tmp0_47 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc254) + %tmp0_49 = tt.load %tmp0_44, %tmp0_46, %tmp0_48 evictionPolicy = evict_last : tensor<4x128x!tt.ptr> loc(#loc254) + %tmp0_50 = arith.extf %tmp0_49 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc255) + %tmp6 = arith.constant 128 : i32 loc(#loc256) + %tmp6_51 = arith.constant 128 : i32 loc(#loc256) + %tmp6_52 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc256) + %tmp6_53 = arith.muli %tmp6_52, %x0_12 : tensor<4x1xi32> loc(#loc256) + %tmp6_54 = tt.broadcast %r0_index_25 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc257) + %tmp6_55 = tt.broadcast %tmp6_53 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc257) + %tmp6_56 = arith.addi %tmp6_54, %tmp6_55 : tensor<4x128xi32> loc(#loc257) + %tmp6_57 = arith.constant 36864 : i32 loc(#loc258) + %tmp6_58 = arith.constant 36864 : i32 loc(#loc258) + %tmp6_59 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc258) + %tmp6_60 = arith.muli %tmp6_59, %x1_15 : tensor<4x1xi32> loc(#loc258) + %tmp6_61 = tt.broadcast %tmp6_60 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc259) + %tmp6_62 = arith.addi %tmp6_56, %tmp6_61 : tensor<4x128xi32> loc(#loc259) + %tmp6_63 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc260) + %tmp6_64 = tt.addptr %tmp6_63, %tmp6_62 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc260) + %tmp6_65 = arith.constant 0.000000e+00 : f32 loc(#loc261) + %tmp6_66 = tt.broadcast %r0_mask_26 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc261) + %tmp6_67 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc261) + %tmp6_68 = arith.truncf %tmp6_67 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc261) + %tmp6_69 = tt.load %tmp6_64, %tmp6_66, %tmp6_68 evictionPolicy = evict_last : tensor<4x128x!tt.ptr> loc(#loc261) + %tmp6_70 = arith.extf %tmp6_69 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc262) + %tmp2 = arith.mulf %tmp0_50, %tmp0_50 : tensor<4x128xf32> loc(#loc263) + %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<4x128xf32> loc(#loc264) + %_tmp4_71 = tt.broadcast %r0_mask_26 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc265) + %_tmp4_72 = arith.select %_tmp4_71, %tmp5, %_tmp4_23 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc265) + %tmp8 = arith.mulf %tmp6_70, %tmp6_70 : tensor<4x128xf32> loc(#loc266) + %tmp11 = arith.addf %_tmp10_24, %tmp8 : tensor<4x128xf32> loc(#loc267) + %_tmp10_73 = tt.broadcast %r0_mask_26 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc268) + %_tmp10_74 = arith.select %_tmp10_73, %tmp11, %_tmp10_24 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc268) + scf.yield %_tmp4_72, %_tmp10_74 : tensor<4x128xf32>, tensor<4x128xf32> loc(#loc39) + } loc(#loc435) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S4_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#0) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc269) + %tmp4_19 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc270) + %tmp10 = tt.call @"triton.language.standard.sum__fp32S4_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#1) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc271) + %tmp10_20 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc272) + %c0_i32_21 = arith.constant 0 : i32 loc(#loc44) + %c128_i32_22 = arith.constant 128 : i32 loc(#loc44) + %4 = arith.bitcast %c0_i32_21 : i32 to i32 loc(#loc44) + %5 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc44) + %6 = arith.bitcast %c128_i32_22 : i32 to i32 loc(#loc44) + %7 = ub.poison : i32 loc(#loc44) + scf.for %r0_offset = %4 to %5 step %6 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc273) + %r0_index_23 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc273) + %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc274) + %r0_mask_24 = arith.cmpi slt, %r0_index_23, %r0_mask : tensor<1x128xi32> loc(#loc274) + %r0_3 = arith.constant 2 : i32 loc(#loc275) + %r0_3_25 = arith.constant 2 : i32 loc(#loc275) + %r0_3_26 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc275) + %r0_3_27 = arith.remsi %r0_index_23, %r0_3_26 : tensor<1x128xi32> loc(#loc275) + %r0_4 = arith.constant 2 : i32 loc(#loc276) + %r0_4_28 = arith.constant 2 : i32 loc(#loc276) + %r0_4_29 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc276) + %r0_4_30 = arith.divsi %r0_index_23, %r0_4_29 : tensor<1x128xi32> loc(#loc276) + %tmp50 = arith.constant 128 : i32 loc(#loc277) + %tmp50_31 = arith.constant 128 : i32 loc(#loc277) + %tmp50_32 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc277) + %tmp50_33 = arith.muli %tmp50_32, %x0_12 : tensor<4x1xi32> loc(#loc277) + %tmp50_34 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc278) + %tmp50_35 = tt.broadcast %tmp50_33 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc278) + %tmp50_36 = arith.addi %tmp50_34, %tmp50_35 : tensor<4x128xi32> loc(#loc278) + %tmp50_37 = arith.constant 36864 : i32 loc(#loc279) + %tmp50_38 = arith.constant 36864 : i32 loc(#loc279) + %tmp50_39 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc279) + %tmp50_40 = arith.muli %tmp50_39, %x1_15 : tensor<4x1xi32> loc(#loc279) + %tmp50_41 = tt.broadcast %tmp50_40 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc280) + %tmp50_42 = arith.addi %tmp50_36, %tmp50_41 : tensor<4x128xi32> loc(#loc280) + %tmp50_43 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc281) + %tmp50_44 = tt.addptr %tmp50_43, %tmp50_42 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc281) + %tmp50_45 = arith.constant 0.000000e+00 : f32 loc(#loc282) + %tmp50_46 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc282) + %tmp50_47 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc282) + %tmp50_48 = arith.truncf %tmp50_47 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc282) + %tmp50_49 = tt.load %tmp50_44, %tmp50_46, %tmp50_48 evictionPolicy = evict_last : tensor<4x128x!tt.ptr> loc(#loc282) + %tmp50_50 = arith.extf %tmp50_49 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc283) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc284) + %tmp58_51 = tt.addptr %tmp58, %r0_index_23 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc284) + %tmp58_52 = arith.constant 0.000000e+00 : f32 loc(#loc285) + %tmp58_53 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> loc(#loc285) + %tmp58_54 = arith.truncf %tmp58_53 : tensor<1x128xf32> to tensor<1x128xbf16> loc(#loc285) + %tmp58_55 = tt.load %tmp58_51, %r0_mask_24, %tmp58_54 evictionPolicy = evict_last : tensor<1x128x!tt.ptr> loc(#loc285) + %tmp58_56 = arith.extf %tmp58_55 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc286) + %tmp63 = arith.constant 128 : i32 loc(#loc287) + %tmp63_57 = arith.constant 128 : i32 loc(#loc287) + %tmp63_58 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc287) + %tmp63_59 = arith.muli %tmp63_58, %x1_15 : tensor<4x1xi32> loc(#loc287) + %tmp63_60 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc288) + %tmp63_61 = tt.broadcast %tmp63_59 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc288) + %tmp63_62 = arith.addi %tmp63_60, %tmp63_61 : tensor<4x128xi32> loc(#loc288) + %tmp63_63 = tt.splat %in_ptr2 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc289) + %tmp63_64 = tt.addptr %tmp63_63, %tmp63_62 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc289) + %tmp63_65 = arith.constant 0.000000e+00 : f32 loc(#loc290) + %tmp63_66 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc290) + %tmp63_67 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc290) + %tmp63_68 = tt.load %tmp63_64, %tmp63_66, %tmp63_67 evictionPolicy = evict_last : tensor<4x128x!tt.ptr> loc(#loc290) + %tmp66 = arith.constant 128 : i32 loc(#loc291) + %tmp66_69 = arith.constant 128 : i32 loc(#loc291) + %tmp66_70 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc291) + %tmp66_71 = arith.muli %tmp66_70, %x1_15 : tensor<4x1xi32> loc(#loc291) + %tmp66_72 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc292) + %tmp66_73 = tt.broadcast %tmp66_71 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc292) + %tmp66_74 = arith.addi %tmp66_72, %tmp66_73 : tensor<4x128xi32> loc(#loc292) + %tmp66_75 = tt.splat %in_ptr3 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc293) + %tmp66_76 = tt.addptr %tmp66_75, %tmp66_74 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc293) + %tmp66_77 = arith.constant 0.000000e+00 : f32 loc(#loc294) + %tmp66_78 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc294) + %tmp66_79 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc294) + %tmp66_80 = tt.load %tmp66_76, %tmp66_78, %tmp66_79 evictionPolicy = evict_last : tensor<4x128x!tt.ptr> loc(#loc294) + %tmp96 = arith.constant 4096 : i32 loc(#loc295) + %tmp96_81 = arith.constant 4096 : i32 loc(#loc295) + %tmp96_82 = arith.constant dense<4096> : tensor<1x128xi32> loc(#loc295) + %tmp96_83 = arith.addi %tmp96_82, %r0_index_23 : tensor<1x128xi32> loc(#loc295) + %tmp96_84 = arith.constant 128 : i32 loc(#loc296) + %tmp96_85 = arith.constant 128 : i32 loc(#loc296) + %tmp96_86 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc296) + %tmp96_87 = arith.muli %tmp96_86, %x0_12 : tensor<4x1xi32> loc(#loc296) + %tmp96_88 = tt.broadcast %tmp96_83 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc297) + %tmp96_89 = tt.broadcast %tmp96_87 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc297) + %tmp96_90 = arith.addi %tmp96_88, %tmp96_89 : tensor<4x128xi32> loc(#loc297) + %tmp96_91 = arith.constant 36864 : i32 loc(#loc298) + %tmp96_92 = arith.constant 36864 : i32 loc(#loc298) + %tmp96_93 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc298) + %tmp96_94 = arith.muli %tmp96_93, %x1_15 : tensor<4x1xi32> loc(#loc298) + %tmp96_95 = tt.broadcast %tmp96_94 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc299) + %tmp96_96 = arith.addi %tmp96_90, %tmp96_95 : tensor<4x128xi32> loc(#loc299) + %tmp96_97 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc300) + %tmp96_98 = tt.addptr %tmp96_97, %tmp96_96 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc300) + %tmp96_99 = arith.constant 0.000000e+00 : f32 loc(#loc301) + %tmp96_100 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc301) + %tmp96_101 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc301) + %tmp96_102 = arith.truncf %tmp96_101 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc301) + %tmp96_103 = tt.load %tmp96_98, %tmp96_100, %tmp96_102 evictionPolicy = evict_first : tensor<4x128x!tt.ptr> loc(#loc301) + %tmp96_104 = arith.extf %tmp96_103 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc302) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc303) + %tmp102_105 = tt.addptr %tmp102, %r0_index_23 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc303) + %tmp102_106 = arith.constant 0.000000e+00 : f32 loc(#loc304) + %tmp102_107 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> loc(#loc304) + %tmp102_108 = arith.truncf %tmp102_107 : tensor<1x128xf32> to tensor<1x128xbf16> loc(#loc304) + %tmp102_109 = tt.load %tmp102_105, %r0_mask_24, %tmp102_108 evictionPolicy = evict_last : tensor<1x128x!tt.ptr> loc(#loc304) + %tmp102_110 = arith.extf %tmp102_109 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc305) + %tmp13 = arith.constant 0 : i64 loc(#loc306) + %tmp13_111 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc306) + %tmp14 = arith.extsi %r0_3_27 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc307) + %tmp14_112 = arith.constant dense<0> : tensor<1x128xi64> loc(#loc307) + %tmp14_113 = arith.cmpi sge, %tmp14, %tmp14_112 : tensor<1x128xi64> loc(#loc307) + %tmp15 = arith.constant 1 : i64 loc(#loc308) + %tmp15_114 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc308) + %tmp16 = arith.extsi %r0_3_27 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc309) + %tmp16_115 = arith.constant dense<1> : tensor<1x128xi64> loc(#loc309) + %tmp16_116 = arith.cmpi slt, %tmp16, %tmp16_115 : tensor<1x128xi64> loc(#loc309) + %tmp17 = arith.constant 2 : i32 loc(#loc310) + %tmp17_117 = arith.constant 2 : i32 loc(#loc310) + %tmp17_118 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc310) + %tmp17_119 = arith.muli %tmp17_118, %r0_4_30 : tensor<1x128xi32> loc(#loc310) + %tmp17_120 = arith.constant 1 : i32 loc(#loc311) + %tmp17_121 = arith.constant 1 : i32 loc(#loc311) + %tmp17_122 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc311) + %tmp17_123 = arith.addi %tmp17_122, %tmp17_119 : tensor<1x128xi32> loc(#loc311) + %tmp17_124 = arith.constant 128 : i32 loc(#loc312) + %tmp17_125 = arith.constant 128 : i32 loc(#loc312) + %tmp17_126 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc312) + %tmp17_127 = arith.muli %tmp17_126, %x0_12 : tensor<4x1xi32> loc(#loc312) + %tmp17_128 = tt.broadcast %tmp17_123 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc313) + %tmp17_129 = tt.broadcast %tmp17_127 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc313) + %tmp17_130 = arith.addi %tmp17_128, %tmp17_129 : tensor<4x128xi32> loc(#loc313) + %tmp17_131 = arith.constant 36864 : i32 loc(#loc314) + %tmp17_132 = arith.constant 36864 : i32 loc(#loc314) + %tmp17_133 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc314) + %tmp17_134 = arith.muli %tmp17_133, %x1_15 : tensor<4x1xi32> loc(#loc314) + %tmp17_135 = tt.broadcast %tmp17_134 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc315) + %tmp17_136 = arith.addi %tmp17_130, %tmp17_135 : tensor<4x128xi32> loc(#loc315) + %tmp17_137 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc316) + %tmp17_138 = tt.addptr %tmp17_137, %tmp17_136 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc316) + %tmp17_139 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x128xi1> loc(#loc317) + %tmp17_140 = arith.constant 0.000000e+00 : f32 loc(#loc318) + %tmp17_141 = tt.broadcast %tmp17_139 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc318) + %tmp17_142 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc318) + %tmp17_143 = arith.truncf %tmp17_142 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc318) + %tmp17_144 = tt.load %tmp17_138, %tmp17_141, %tmp17_143 evictionPolicy = evict_last : tensor<4x128x!tt.ptr> loc(#loc318) + %tmp17_145 = arith.extf %tmp17_144 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc319) + %tmp19 = arith.constant 1.280000e+02 : f32 loc(#loc320) + %tmp20 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc321) + %tmp20_146 = arith.divf %tmp10_20, %tmp20 : tensor<4x1xf32> loc(#loc321) + %tmp21 = arith.constant 9.99999997E-7 : f32 loc(#loc322) + %tmp22 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc323) + %tmp22_147 = arith.addf %tmp20_146, %tmp22 : tensor<4x1xf32> loc(#loc323) + %tmp23 = tt.extern_elementwise %tmp22_147 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc324) + %tmp24 = tt.broadcast %tmp23 : tensor<4x1xf32> -> tensor<4x128xf32> loc(#loc325) + %tmp24_148 = arith.mulf %tmp17_145, %tmp24 : tensor<4x128xf32> loc(#loc325) + %tmp25 = arith.constant 2 : i32 loc(#loc326) + %tmp25_149 = arith.constant 2 : i32 loc(#loc326) + %tmp25_150 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc326) + %tmp25_151 = arith.muli %tmp25_150, %r0_4_30 : tensor<1x128xi32> loc(#loc326) + %tmp25_152 = arith.constant 1 : i32 loc(#loc327) + %tmp25_153 = arith.constant 1 : i32 loc(#loc327) + %tmp25_154 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc327) + %tmp25_155 = arith.addi %tmp25_154, %tmp25_151 : tensor<1x128xi32> loc(#loc327) + %tmp25_156 = tt.broadcast %tmp25_155 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc328) + %tmp25_157 = tt.splat %in_ptr1 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc329) + %tmp25_158 = tt.addptr %tmp25_157, %tmp25_156 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc329) + %tmp25_159 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x128xi1> loc(#loc330) + %tmp25_160 = arith.constant 0.000000e+00 : f32 loc(#loc331) + %tmp25_161 = tt.broadcast %tmp25_159 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc331) + %tmp25_162 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc331) + %tmp25_163 = arith.truncf %tmp25_162 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc331) + %tmp25_164 = tt.load %tmp25_158, %tmp25_161, %tmp25_163 evictionPolicy = evict_last : tensor<4x128x!tt.ptr> loc(#loc331) + %tmp25_165 = arith.extf %tmp25_164 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc332) + %tmp27 = arith.mulf %tmp24_148, %tmp25_165 : tensor<4x128xf32> loc(#loc333) + %tmp29 = arith.constant 0.000000e+00 : f32 loc(#loc334) + %tmp29_166 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc334) + %tmp29_167 = arith.subf %tmp29_166, %tmp27 : tensor<4x128xf32> loc(#loc334) + %tmp30 = arith.constant 0.000000e+00 : f32 loc(#loc335) + %tmp30_168 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc335) + %tmp31 = tt.broadcast %tmp16_116 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc336) + %tmp31_169 = arith.select %tmp31, %tmp29_167, %tmp30_168 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc336) + %tmp32 = arith.extsi %r0_3_27 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc337) + %tmp32_170 = arith.constant dense<1> : tensor<1x128xi64> loc(#loc337) + %tmp32_171 = arith.cmpi sge, %tmp32, %tmp32_170 : tensor<1x128xi64> loc(#loc337) + %tmp33 = arith.constant 2 : i64 loc(#loc338) + %tmp33_172 = arith.constant dense<2> : tensor<1x1xi64> loc(#loc338) + %tmp34 = arith.extsi %r0_3_27 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc339) + %tmp34_173 = arith.constant dense<2> : tensor<1x128xi64> loc(#loc339) + %tmp34_174 = arith.cmpi slt, %tmp34, %tmp34_173 : tensor<1x128xi64> loc(#loc339) + %tmp35 = arith.constant 2 : i32 loc(#loc340) + %tmp35_175 = arith.constant 2 : i32 loc(#loc340) + %tmp35_176 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc340) + %tmp35_177 = arith.muli %tmp35_176, %r0_4_30 : tensor<1x128xi32> loc(#loc340) + %tmp35_178 = arith.constant 128 : i32 loc(#loc341) + %tmp35_179 = arith.constant 128 : i32 loc(#loc341) + %tmp35_180 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc341) + %tmp35_181 = arith.muli %tmp35_180, %x0_12 : tensor<4x1xi32> loc(#loc341) + %tmp35_182 = tt.broadcast %tmp35_177 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc342) + %tmp35_183 = tt.broadcast %tmp35_181 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc342) + %tmp35_184 = arith.addi %tmp35_182, %tmp35_183 : tensor<4x128xi32> loc(#loc342) + %tmp35_185 = arith.constant 36864 : i32 loc(#loc343) + %tmp35_186 = arith.constant 36864 : i32 loc(#loc343) + %tmp35_187 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc343) + %tmp35_188 = arith.muli %tmp35_187, %x1_15 : tensor<4x1xi32> loc(#loc343) + %tmp35_189 = tt.broadcast %tmp35_188 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc344) + %tmp35_190 = arith.addi %tmp35_184, %tmp35_189 : tensor<4x128xi32> loc(#loc344) + %tmp35_191 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc345) + %tmp35_192 = tt.addptr %tmp35_191, %tmp35_190 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc345) + %tmp35_193 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x128xi1> loc(#loc346) + %tmp35_194 = arith.constant 0.000000e+00 : f32 loc(#loc347) + %tmp35_195 = tt.broadcast %tmp35_193 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc347) + %tmp35_196 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc347) + %tmp35_197 = arith.truncf %tmp35_196 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc347) + %tmp35_198 = tt.load %tmp35_192, %tmp35_195, %tmp35_197 evictionPolicy = evict_last : tensor<4x128x!tt.ptr> loc(#loc347) + %tmp35_199 = arith.extf %tmp35_198 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc348) + %tmp37 = arith.constant 1.280000e+02 : f32 loc(#loc349) + %tmp38 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc350) + %tmp38_200 = arith.divf %tmp10_20, %tmp38 : tensor<4x1xf32> loc(#loc350) + %tmp39 = arith.constant 9.99999997E-7 : f32 loc(#loc351) + %tmp40 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc352) + %tmp40_201 = arith.addf %tmp38_200, %tmp40 : tensor<4x1xf32> loc(#loc352) + %tmp41 = tt.extern_elementwise %tmp40_201 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc353) + %tmp42 = tt.broadcast %tmp41 : tensor<4x1xf32> -> tensor<4x128xf32> loc(#loc354) + %tmp42_202 = arith.mulf %tmp35_199, %tmp42 : tensor<4x128xf32> loc(#loc354) + %tmp43 = arith.constant 2 : i32 loc(#loc355) + %tmp43_203 = arith.constant 2 : i32 loc(#loc355) + %tmp43_204 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc355) + %tmp43_205 = arith.muli %tmp43_204, %r0_4_30 : tensor<1x128xi32> loc(#loc355) + %tmp43_206 = tt.broadcast %tmp43_205 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc356) + %tmp43_207 = tt.splat %in_ptr1 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc357) + %tmp43_208 = tt.addptr %tmp43_207, %tmp43_206 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc357) + %tmp43_209 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x128xi1> loc(#loc358) + %tmp43_210 = arith.constant 0.000000e+00 : f32 loc(#loc359) + %tmp43_211 = tt.broadcast %tmp43_209 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc359) + %tmp43_212 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc359) + %tmp43_213 = arith.truncf %tmp43_212 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc359) + %tmp43_214 = tt.load %tmp43_208, %tmp43_211, %tmp43_213 evictionPolicy = evict_last : tensor<4x128x!tt.ptr> loc(#loc359) + %tmp43_215 = arith.extf %tmp43_214 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc360) + %tmp45 = arith.mulf %tmp42_202, %tmp43_215 : tensor<4x128xf32> loc(#loc361) + %tmp47 = arith.constant 0.000000e+00 : f32 loc(#loc362) + %tmp47_216 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc362) + %tmp48 = tt.broadcast %tmp32_171 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc363) + %tmp48_217 = arith.select %tmp48, %tmp45, %tmp47_216 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc363) + %tmp49 = tt.broadcast %tmp16_116 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc364) + %tmp49_218 = arith.select %tmp49, %tmp31_169, %tmp48_217 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc364) + %tmp52 = arith.constant 1.280000e+02 : f32 loc(#loc365) + %tmp53 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc366) + %tmp53_219 = arith.divf %tmp10_20, %tmp53 : tensor<4x1xf32> loc(#loc366) + %tmp54 = arith.constant 9.99999997E-7 : f32 loc(#loc367) + %tmp55 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc368) + %tmp55_220 = arith.addf %tmp53_219, %tmp55 : tensor<4x1xf32> loc(#loc368) + %tmp56 = tt.extern_elementwise %tmp55_220 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc369) + %tmp57 = tt.broadcast %tmp56 : tensor<4x1xf32> -> tensor<4x128xf32> loc(#loc370) + %tmp57_221 = arith.mulf %tmp50_50, %tmp57 : tensor<4x128xf32> loc(#loc370) + %tmp60 = tt.broadcast %tmp58_56 : tensor<1x128xf32> -> tensor<4x128xf32> loc(#loc371) + %tmp60_222 = arith.mulf %tmp57_221, %tmp60 : tensor<4x128xf32> loc(#loc371) + %tmp64 = arith.mulf %tmp60_222, %tmp63_68 : tensor<4x128xf32> loc(#loc372) + %tmp67 = arith.mulf %tmp49_218, %tmp66_80 : tensor<4x128xf32> loc(#loc373) + %tmp68 = arith.addf %tmp64, %tmp67 : tensor<4x128xf32> loc(#loc374) + %tmp70 = arith.constant 2 : i32 loc(#loc375) + %tmp70_223 = arith.constant 2 : i32 loc(#loc375) + %tmp70_224 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc375) + %tmp70_225 = arith.muli %tmp70_224, %r0_4_30 : tensor<1x128xi32> loc(#loc375) + %tmp70_226 = arith.constant 4097 : i32 loc(#loc376) + %tmp70_227 = arith.constant 4097 : i32 loc(#loc376) + %tmp70_228 = arith.constant dense<4097> : tensor<1x128xi32> loc(#loc376) + %tmp70_229 = arith.addi %tmp70_228, %tmp70_225 : tensor<1x128xi32> loc(#loc376) + %tmp70_230 = arith.constant 128 : i32 loc(#loc377) + %tmp70_231 = arith.constant 128 : i32 loc(#loc377) + %tmp70_232 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc377) + %tmp70_233 = arith.muli %tmp70_232, %x0_12 : tensor<4x1xi32> loc(#loc377) + %tmp70_234 = tt.broadcast %tmp70_229 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc378) + %tmp70_235 = tt.broadcast %tmp70_233 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc378) + %tmp70_236 = arith.addi %tmp70_234, %tmp70_235 : tensor<4x128xi32> loc(#loc378) + %tmp70_237 = arith.constant 36864 : i32 loc(#loc379) + %tmp70_238 = arith.constant 36864 : i32 loc(#loc379) + %tmp70_239 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc379) + %tmp70_240 = arith.muli %tmp70_239, %x1_15 : tensor<4x1xi32> loc(#loc379) + %tmp70_241 = tt.broadcast %tmp70_240 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc380) + %tmp70_242 = arith.addi %tmp70_236, %tmp70_241 : tensor<4x128xi32> loc(#loc380) + %tmp70_243 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc381) + %tmp70_244 = tt.addptr %tmp70_243, %tmp70_242 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc381) + %tmp70_245 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x128xi1> loc(#loc382) + %tmp70_246 = arith.constant 0.000000e+00 : f32 loc(#loc383) + %tmp70_247 = tt.broadcast %tmp70_245 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc383) + %tmp70_248 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc383) + %tmp70_249 = arith.truncf %tmp70_248 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc383) + %tmp70_250 = tt.load %tmp70_244, %tmp70_247, %tmp70_249 evictionPolicy = evict_last : tensor<4x128x!tt.ptr> loc(#loc383) + %tmp70_251 = arith.extf %tmp70_250 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc384) + %tmp72 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc385) + %tmp72_252 = arith.divf %tmp4_19, %tmp72 : tensor<4x1xf32> loc(#loc385) + %tmp73 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc386) + %tmp73_253 = arith.addf %tmp72_252, %tmp73 : tensor<4x1xf32> loc(#loc386) + %tmp74 = tt.extern_elementwise %tmp73_253 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc387) + %tmp75 = tt.broadcast %tmp74 : tensor<4x1xf32> -> tensor<4x128xf32> loc(#loc388) + %tmp75_254 = arith.mulf %tmp70_251, %tmp75 : tensor<4x128xf32> loc(#loc388) + %tmp76 = arith.constant 2 : i32 loc(#loc389) + %tmp76_255 = arith.constant 2 : i32 loc(#loc389) + %tmp76_256 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc389) + %tmp76_257 = arith.muli %tmp76_256, %r0_4_30 : tensor<1x128xi32> loc(#loc389) + %tmp76_258 = arith.constant 1 : i32 loc(#loc390) + %tmp76_259 = arith.constant 1 : i32 loc(#loc390) + %tmp76_260 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc390) + %tmp76_261 = arith.addi %tmp76_260, %tmp76_257 : tensor<1x128xi32> loc(#loc390) + %tmp76_262 = tt.broadcast %tmp76_261 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc391) + %tmp76_263 = tt.splat %in_ptr4 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc392) + %tmp76_264 = tt.addptr %tmp76_263, %tmp76_262 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc392) + %tmp76_265 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x128xi1> loc(#loc393) + %tmp76_266 = arith.constant 0.000000e+00 : f32 loc(#loc394) + %tmp76_267 = tt.broadcast %tmp76_265 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc394) + %tmp76_268 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc394) + %tmp76_269 = arith.truncf %tmp76_268 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc394) + %tmp76_270 = tt.load %tmp76_264, %tmp76_267, %tmp76_269 evictionPolicy = evict_last : tensor<4x128x!tt.ptr> loc(#loc394) + %tmp76_271 = arith.extf %tmp76_270 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc395) + %tmp78 = arith.mulf %tmp75_254, %tmp76_271 : tensor<4x128xf32> loc(#loc396) + %tmp80 = arith.constant 0.000000e+00 : f32 loc(#loc397) + %tmp80_272 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc397) + %tmp80_273 = arith.subf %tmp80_272, %tmp78 : tensor<4x128xf32> loc(#loc397) + %tmp81 = arith.constant 0.000000e+00 : f32 loc(#loc398) + %tmp81_274 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc398) + %tmp82 = tt.broadcast %tmp16_116 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc399) + %tmp82_275 = arith.select %tmp82, %tmp80_273, %tmp81_274 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc399) + %tmp83 = arith.constant 2 : i32 loc(#loc400) + %tmp83_276 = arith.constant 2 : i32 loc(#loc400) + %tmp83_277 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc400) + %tmp83_278 = arith.muli %tmp83_277, %r0_4_30 : tensor<1x128xi32> loc(#loc400) + %tmp83_279 = arith.constant 4096 : i32 loc(#loc401) + %tmp83_280 = arith.constant 4096 : i32 loc(#loc401) + %tmp83_281 = arith.constant dense<4096> : tensor<1x128xi32> loc(#loc401) + %tmp83_282 = arith.addi %tmp83_281, %tmp83_278 : tensor<1x128xi32> loc(#loc401) + %tmp83_283 = arith.constant 128 : i32 loc(#loc402) + %tmp83_284 = arith.constant 128 : i32 loc(#loc402) + %tmp83_285 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc402) + %tmp83_286 = arith.muli %tmp83_285, %x0_12 : tensor<4x1xi32> loc(#loc402) + %tmp83_287 = tt.broadcast %tmp83_282 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc403) + %tmp83_288 = tt.broadcast %tmp83_286 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc403) + %tmp83_289 = arith.addi %tmp83_287, %tmp83_288 : tensor<4x128xi32> loc(#loc403) + %tmp83_290 = arith.constant 36864 : i32 loc(#loc404) + %tmp83_291 = arith.constant 36864 : i32 loc(#loc404) + %tmp83_292 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc404) + %tmp83_293 = arith.muli %tmp83_292, %x1_15 : tensor<4x1xi32> loc(#loc404) + %tmp83_294 = tt.broadcast %tmp83_293 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc405) + %tmp83_295 = arith.addi %tmp83_289, %tmp83_294 : tensor<4x128xi32> loc(#loc405) + %tmp83_296 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc406) + %tmp83_297 = tt.addptr %tmp83_296, %tmp83_295 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc406) + %tmp83_298 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x128xi1> loc(#loc407) + %tmp83_299 = arith.constant 0.000000e+00 : f32 loc(#loc408) + %tmp83_300 = tt.broadcast %tmp83_298 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc408) + %tmp83_301 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc408) + %tmp83_302 = arith.truncf %tmp83_301 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc408) + %tmp83_303 = tt.load %tmp83_297, %tmp83_300, %tmp83_302 evictionPolicy = evict_last : tensor<4x128x!tt.ptr> loc(#loc408) + %tmp83_304 = arith.extf %tmp83_303 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc409) + %tmp85 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc410) + %tmp85_305 = arith.divf %tmp4_19, %tmp85 : tensor<4x1xf32> loc(#loc410) + %tmp86 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc411) + %tmp86_306 = arith.addf %tmp85_305, %tmp86 : tensor<4x1xf32> loc(#loc411) + %tmp87 = tt.extern_elementwise %tmp86_306 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc412) + %tmp88 = tt.broadcast %tmp87 : tensor<4x1xf32> -> tensor<4x128xf32> loc(#loc413) + %tmp88_307 = arith.mulf %tmp83_304, %tmp88 : tensor<4x128xf32> loc(#loc413) + %tmp89 = arith.constant 2 : i32 loc(#loc414) + %tmp89_308 = arith.constant 2 : i32 loc(#loc414) + %tmp89_309 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc414) + %tmp89_310 = arith.muli %tmp89_309, %r0_4_30 : tensor<1x128xi32> loc(#loc414) + %tmp89_311 = tt.broadcast %tmp89_310 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc415) + %tmp89_312 = tt.splat %in_ptr4 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc416) + %tmp89_313 = tt.addptr %tmp89_312, %tmp89_311 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc416) + %tmp89_314 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x128xi1> loc(#loc417) + %tmp89_315 = arith.constant 0.000000e+00 : f32 loc(#loc418) + %tmp89_316 = tt.broadcast %tmp89_314 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc418) + %tmp89_317 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc418) + %tmp89_318 = arith.truncf %tmp89_317 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc418) + %tmp89_319 = tt.load %tmp89_313, %tmp89_316, %tmp89_318 evictionPolicy = evict_last : tensor<4x128x!tt.ptr> loc(#loc418) + %tmp89_320 = arith.extf %tmp89_319 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc419) + %tmp91 = arith.mulf %tmp88_307, %tmp89_320 : tensor<4x128xf32> loc(#loc420) + %tmp93 = arith.constant 0.000000e+00 : f32 loc(#loc421) + %tmp93_321 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc421) + %tmp94 = tt.broadcast %tmp32_171 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc422) + %tmp94_322 = arith.select %tmp94, %tmp91, %tmp93_321 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc422) + %tmp95 = tt.broadcast %tmp16_116 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc423) + %tmp95_323 = arith.select %tmp95, %tmp82_275, %tmp94_322 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc423) + %tmp98 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc424) + %tmp98_324 = arith.divf %tmp4_19, %tmp98 : tensor<4x1xf32> loc(#loc424) + %tmp99 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc425) + %tmp99_325 = arith.addf %tmp98_324, %tmp99 : tensor<4x1xf32> loc(#loc425) + %tmp100 = tt.extern_elementwise %tmp99_325 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc426) + %tmp101 = tt.broadcast %tmp100 : tensor<4x1xf32> -> tensor<4x128xf32> loc(#loc427) + %tmp101_326 = arith.mulf %tmp96_104, %tmp101 : tensor<4x128xf32> loc(#loc427) + %tmp104 = tt.broadcast %tmp102_110 : tensor<1x128xf32> -> tensor<4x128xf32> loc(#loc428) + %tmp104_327 = arith.mulf %tmp101_326, %tmp104 : tensor<4x128xf32> loc(#loc428) + %tmp107 = arith.mulf %tmp104_327, %tmp63_68 : tensor<4x128xf32> loc(#loc429) + %tmp109 = arith.mulf %tmp95_323, %tmp66_80 : tensor<4x128xf32> loc(#loc430) + %tmp110 = arith.addf %tmp107, %tmp109 : tensor<4x128xf32> loc(#loc431) + %c128_i32_328 = arith.constant 128 : i32 loc(#loc204) + %c128_i32_329 = arith.constant 128 : i32 loc(#loc204) + %cst = arith.constant dense<128> : tensor<4x1xi32> loc(#loc204) + %8 = arith.muli %cst, %xindex_7 : tensor<4x1xi32> loc(#loc204) + %9 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc205) + %10 = tt.broadcast %8 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc205) + %11 = arith.addi %9, %10 : tensor<4x128xi32> loc(#loc205) + %12 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc206) + %13 = tt.addptr %12, %11 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc206) + %14 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc207) + %15 = arith.truncf %tmp68 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc207) + tt.store %13, %15, %14 : tensor<4x128x!tt.ptr> loc(#loc207) + %c128_i32_330 = arith.constant 128 : i32 loc(#loc208) + %c128_i32_331 = arith.constant 128 : i32 loc(#loc208) + %cst_332 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc208) + %16 = arith.muli %cst_332, %xindex_7 : tensor<4x1xi32> loc(#loc208) + %17 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc209) + %18 = tt.broadcast %16 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc209) + %19 = arith.addi %17, %18 : tensor<4x128xi32> loc(#loc209) + %20 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc210) + %21 = tt.addptr %20, %19 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc210) + %22 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc211) + %23 = arith.truncf %tmp110 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc211) + tt.store %21, %23, %22 : tensor<4x128x!tt.ptr> loc(#loc211) + } loc(#loc44) + tt.return loc(#loc212) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S4_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<4x128xf32> loc("input"(#loc213))) -> tensor<4xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc214) + tt.reduce.return %2 : f32 loc(#loc214) + }) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc214) + tt.return %0 : tensor<4xf32> loc(#loc216) + ^bb1: // no predecessors + %1 = ub.poison : tensor<4xf32> loc(#loc217) + tt.return %1 : tensor<4xf32> loc(#loc217) + } loc(#loc213) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc218)), %b: f32 loc("b"(#loc218))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc219) + tt.return %0 : f32 loc(#loc220) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc221) + tt.return %1 : f32 loc(#loc221) + } loc(#loc218) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":25:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":30:43) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":32:44) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:45) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:56) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:46) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:42) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:53) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:64) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":68:35) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":69:25) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":70:35) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:52) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:63) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":74:16) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":76:16) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:57) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:55) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:63) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:95) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":85:42) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":88:35) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":89:24) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:37) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:48) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:59) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":92:16) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":93:25) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":94:16) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":95:24) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":96:32) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:53) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:59) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:91) +#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":102:42) +#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":106:16) +#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":107:25) +#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":108:16) +#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":109:24) +#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":110:32) +#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:44) +#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc149 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:55) +#loc150 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc151 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:66) +#loc152 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc153 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc154 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:81) +#loc155 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc156 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc157 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc158 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc159 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc160 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc161 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:57) +#loc162 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:55) +#loc163 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:63) +#loc164 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc165 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:95) +#loc166 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc167 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc168 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc169 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc170 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":132:42) +#loc171 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc172 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:44) +#loc173 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc174 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:55) +#loc175 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc176 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:66) +#loc177 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc178 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc179 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:81) +#loc180 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc181 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc182 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":136:24) +#loc183 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":137:24) +#loc184 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":138:32) +#loc185 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc186 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:53) +#loc187 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:59) +#loc188 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc189 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:91) +#loc190 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc191 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc192 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc193 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":144:42) +#loc194 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc195 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc196 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":148:24) +#loc197 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":149:24) +#loc198 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":150:33) +#loc199 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc200 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc201 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc202 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc203 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc204 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc205 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc206 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc207 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc208 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:43) +#loc209 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:39) +#loc210 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc211 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc212 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc214 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc216 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11) +#loc217 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4) +#loc219 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc220 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11) +#loc221 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4) +#loc231 = loc("xnumel"(#loc1)) +#loc232 = loc("r0_numel"(#loc2)) +#loc233 = loc("xoffset"(#loc3)) +#loc234 = loc("xoffset"(#loc4)) +#loc235 = loc("xindex"(#loc5)) +#loc236 = loc("xindex"(#loc6)) +#loc237 = loc("xindex"(#loc7)) +#loc238 = loc("xmask"(#loc8)) +#loc239 = loc("r0_base"(#loc9)) +#loc240 = loc("r0_base"(#loc10)) +#loc241 = loc("x0"(#loc11)) +#loc242 = loc("x1"(#loc12)) +#loc243 = loc("_tmp4"(#loc13)) +#loc244 = loc("_tmp10"(#loc14)) +#loc245 = loc("_tmp4"(#loc15)) +#loc246 = loc("r0_index"(#loc16)) +#loc247 = loc("r0_mask"(#loc17)) +#loc248 = loc("tmp0"(#loc18)) +#loc249 = loc("tmp0"(#loc19)) +#loc250 = loc("tmp0"(#loc20)) +#loc251 = loc("tmp0"(#loc21)) +#loc252 = loc("tmp0"(#loc22)) +#loc253 = loc("tmp0"(#loc23)) +#loc254 = loc("tmp0"(#loc24)) +#loc255 = loc("tmp0"(#loc25)) +#loc256 = loc("tmp6"(#loc26)) +#loc257 = loc("tmp6"(#loc27)) +#loc258 = loc("tmp6"(#loc28)) +#loc259 = loc("tmp6"(#loc29)) +#loc260 = loc("tmp6"(#loc30)) +#loc261 = loc("tmp6"(#loc31)) +#loc262 = loc("tmp6"(#loc32)) +#loc263 = loc("tmp2"(#loc33)) +#loc264 = loc("tmp5"(#loc34)) +#loc265 = loc("_tmp4"(#loc35)) +#loc266 = loc("tmp8"(#loc36)) +#loc267 = loc("tmp11"(#loc37)) +#loc268 = loc("_tmp10"(#loc38)) +#loc269 = loc("tmp4"(#loc40)) +#loc270 = loc("tmp4"(#loc41)) +#loc271 = loc("tmp10"(#loc42)) +#loc272 = loc("tmp10"(#loc43)) +#loc273 = loc("r0_index"(#loc45)) +#loc274 = loc("r0_mask"(#loc46)) +#loc275 = loc("r0_3"(#loc47)) +#loc276 = loc("r0_4"(#loc48)) +#loc277 = loc("tmp50"(#loc49)) +#loc278 = loc("tmp50"(#loc50)) +#loc279 = loc("tmp50"(#loc51)) +#loc280 = loc("tmp50"(#loc52)) +#loc281 = loc("tmp50"(#loc53)) +#loc282 = loc("tmp50"(#loc54)) +#loc283 = loc("tmp50"(#loc55)) +#loc284 = loc("tmp58"(#loc56)) +#loc285 = loc("tmp58"(#loc57)) +#loc286 = loc("tmp58"(#loc58)) +#loc287 = loc("tmp63"(#loc59)) +#loc288 = loc("tmp63"(#loc60)) +#loc289 = loc("tmp63"(#loc61)) +#loc290 = loc("tmp63"(#loc62)) +#loc291 = loc("tmp66"(#loc63)) +#loc292 = loc("tmp66"(#loc64)) +#loc293 = loc("tmp66"(#loc65)) +#loc294 = loc("tmp66"(#loc66)) +#loc295 = loc("tmp96"(#loc67)) +#loc296 = loc("tmp96"(#loc68)) +#loc297 = loc("tmp96"(#loc69)) +#loc298 = loc("tmp96"(#loc70)) +#loc299 = loc("tmp96"(#loc71)) +#loc300 = loc("tmp96"(#loc72)) +#loc301 = loc("tmp96"(#loc73)) +#loc302 = loc("tmp96"(#loc74)) +#loc303 = loc("tmp102"(#loc75)) +#loc304 = loc("tmp102"(#loc76)) +#loc305 = loc("tmp102"(#loc77)) +#loc306 = loc("tmp13"(#loc78)) +#loc307 = loc("tmp14"(#loc79)) +#loc308 = loc("tmp15"(#loc80)) +#loc309 = loc("tmp16"(#loc81)) +#loc310 = loc("tmp17"(#loc82)) +#loc311 = loc("tmp17"(#loc83)) +#loc312 = loc("tmp17"(#loc84)) +#loc313 = loc("tmp17"(#loc85)) +#loc314 = loc("tmp17"(#loc86)) +#loc315 = loc("tmp17"(#loc87)) +#loc316 = loc("tmp17"(#loc88)) +#loc317 = loc("tmp17"(#loc89)) +#loc318 = loc("tmp17"(#loc90)) +#loc319 = loc("tmp17"(#loc91)) +#loc320 = loc("tmp19"(#loc92)) +#loc321 = loc("tmp20"(#loc93)) +#loc322 = loc("tmp21"(#loc94)) +#loc323 = loc("tmp22"(#loc95)) +#loc324 = loc("tmp23"(#loc96)) +#loc325 = loc("tmp24"(#loc97)) +#loc326 = loc("tmp25"(#loc98)) +#loc327 = loc("tmp25"(#loc99)) +#loc328 = loc("tmp25"(#loc100)) +#loc329 = loc("tmp25"(#loc101)) +#loc330 = loc("tmp25"(#loc102)) +#loc331 = loc("tmp25"(#loc103)) +#loc332 = loc("tmp25"(#loc104)) +#loc333 = loc("tmp27"(#loc105)) +#loc334 = loc("tmp29"(#loc106)) +#loc335 = loc("tmp30"(#loc107)) +#loc336 = loc("tmp31"(#loc108)) +#loc337 = loc("tmp32"(#loc109)) +#loc338 = loc("tmp33"(#loc110)) +#loc339 = loc("tmp34"(#loc111)) +#loc340 = loc("tmp35"(#loc112)) +#loc341 = loc("tmp35"(#loc113)) +#loc342 = loc("tmp35"(#loc114)) +#loc343 = loc("tmp35"(#loc115)) +#loc344 = loc("tmp35"(#loc116)) +#loc345 = loc("tmp35"(#loc117)) +#loc346 = loc("tmp35"(#loc118)) +#loc347 = loc("tmp35"(#loc119)) +#loc348 = loc("tmp35"(#loc120)) +#loc349 = loc("tmp37"(#loc121)) +#loc350 = loc("tmp38"(#loc122)) +#loc351 = loc("tmp39"(#loc123)) +#loc352 = loc("tmp40"(#loc124)) +#loc353 = loc("tmp41"(#loc125)) +#loc354 = loc("tmp42"(#loc126)) +#loc355 = loc("tmp43"(#loc127)) +#loc356 = loc("tmp43"(#loc128)) +#loc357 = loc("tmp43"(#loc129)) +#loc358 = loc("tmp43"(#loc130)) +#loc359 = loc("tmp43"(#loc131)) +#loc360 = loc("tmp43"(#loc132)) +#loc361 = loc("tmp45"(#loc133)) +#loc362 = loc("tmp47"(#loc134)) +#loc363 = loc("tmp48"(#loc135)) +#loc364 = loc("tmp49"(#loc136)) +#loc365 = loc("tmp52"(#loc137)) +#loc366 = loc("tmp53"(#loc138)) +#loc367 = loc("tmp54"(#loc139)) +#loc368 = loc("tmp55"(#loc140)) +#loc369 = loc("tmp56"(#loc141)) +#loc370 = loc("tmp57"(#loc142)) +#loc371 = loc("tmp60"(#loc143)) +#loc372 = loc("tmp64"(#loc144)) +#loc373 = loc("tmp67"(#loc145)) +#loc374 = loc("tmp68"(#loc146)) +#loc375 = loc("tmp70"(#loc147)) +#loc376 = loc("tmp70"(#loc148)) +#loc377 = loc("tmp70"(#loc149)) +#loc378 = loc("tmp70"(#loc150)) +#loc379 = loc("tmp70"(#loc151)) +#loc380 = loc("tmp70"(#loc152)) +#loc381 = loc("tmp70"(#loc153)) +#loc382 = loc("tmp70"(#loc154)) +#loc383 = loc("tmp70"(#loc155)) +#loc384 = loc("tmp70"(#loc156)) +#loc385 = loc("tmp72"(#loc157)) +#loc386 = loc("tmp73"(#loc158)) +#loc387 = loc("tmp74"(#loc159)) +#loc388 = loc("tmp75"(#loc160)) +#loc389 = loc("tmp76"(#loc161)) +#loc390 = loc("tmp76"(#loc162)) +#loc391 = loc("tmp76"(#loc163)) +#loc392 = loc("tmp76"(#loc164)) +#loc393 = loc("tmp76"(#loc165)) +#loc394 = loc("tmp76"(#loc166)) +#loc395 = loc("tmp76"(#loc167)) +#loc396 = loc("tmp78"(#loc168)) +#loc397 = loc("tmp80"(#loc169)) +#loc398 = loc("tmp81"(#loc170)) +#loc399 = loc("tmp82"(#loc171)) +#loc400 = loc("tmp83"(#loc172)) +#loc401 = loc("tmp83"(#loc173)) +#loc402 = loc("tmp83"(#loc174)) +#loc403 = loc("tmp83"(#loc175)) +#loc404 = loc("tmp83"(#loc176)) +#loc405 = loc("tmp83"(#loc177)) +#loc406 = loc("tmp83"(#loc178)) +#loc407 = loc("tmp83"(#loc179)) +#loc408 = loc("tmp83"(#loc180)) +#loc409 = loc("tmp83"(#loc181)) +#loc410 = loc("tmp85"(#loc182)) +#loc411 = loc("tmp86"(#loc183)) +#loc412 = loc("tmp87"(#loc184)) +#loc413 = loc("tmp88"(#loc185)) +#loc414 = loc("tmp89"(#loc186)) +#loc415 = loc("tmp89"(#loc187)) +#loc416 = loc("tmp89"(#loc188)) +#loc417 = loc("tmp89"(#loc189)) +#loc418 = loc("tmp89"(#loc190)) +#loc419 = loc("tmp89"(#loc191)) +#loc420 = loc("tmp91"(#loc192)) +#loc421 = loc("tmp93"(#loc193)) +#loc422 = loc("tmp94"(#loc194)) +#loc423 = loc("tmp95"(#loc195)) +#loc424 = loc("tmp98"(#loc196)) +#loc425 = loc("tmp99"(#loc197)) +#loc426 = loc("tmp100"(#loc198)) +#loc427 = loc("tmp101"(#loc199)) +#loc428 = loc("tmp104"(#loc200)) +#loc429 = loc("tmp107"(#loc201)) +#loc430 = loc("tmp109"(#loc202)) +#loc431 = loc("tmp110"(#loc203)) +#loc435 = loc("_tmp10"(#loc245)) diff --git a/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..2cefa8cb793bb6248b8e1d562ebf076b666bfce3 --- /dev/null +++ b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir @@ -0,0 +1,495 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked2 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc1 = loc(unknown) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc130 = loc("in_out_ptr0"(#loc)) +#loc131 = loc("in_out_ptr1"(#loc)) +#loc132 = loc("in_ptr0"(#loc)) +#loc133 = loc("in_ptr1"(#loc)) +#loc134 = loc("in_ptr2"(#loc)) +#loc135 = loc("in_ptr3"(#loc)) +#loc136 = loc("in_ptr4"(#loc)) +#loc137 = loc("xnumel"(#loc)) +#loc138 = loc("r0_numel"(#loc)) +#loc166 = loc("tmp4"(#loc30)) +#loc168 = loc("tmp10"(#loc33)) +#loc259 = loc(callsite(#loc1 at #loc166)) +#loc261 = loc(callsite(#loc1 at #loc168)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4097> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<1> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1x128xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<2> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<36864> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<36864> : tensor<4x1xi32, #blocked1> loc(#loc1) + %cst_5 = arith.constant dense<128> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<128> : tensor<4x1xi32, #blocked1> loc(#loc1) + %cst_7 = arith.constant dense<4096> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_8 = arith.constant dense<4096> : tensor<1x128xi32, #blocked1> loc(#loc1) + %cst_9 = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_10 = arith.constant dense<128> : tensor<1x128xi32, #blocked1> loc(#loc1) + %cst_11 = arith.constant dense<32> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_12 = arith.constant dense<32> : tensor<4x1xi32, #blocked1> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %cst_13 = arith.constant dense<0.000000e+00> : tensor<4x128xbf16, #blocked1> loc(#loc1) + %cst_14 = arith.constant dense<0.000000e+00> : tensor<4x128xbf16, #blocked> loc(#loc1) + %cst_15 = arith.constant dense<128> : tensor<1x128xi32, #blocked2> loc(#loc1) + %cst_16 = arith.constant dense<0.000000e+00> : tensor<1x128xbf16, #blocked2> loc(#loc1) + %cst_17 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32, #blocked1> loc(#loc1) + %cst_18 = arith.constant dense<1.280000e+02> : tensor<4x1xf32, #blocked1> loc(#loc1) + %cst_19 = arith.constant dense<0.000000e+00> : tensor<4x128xf32, #blocked> loc(#loc1) + %cst_20 = arith.constant dense<0.000000e+00> : tensor<4x128xf32, #blocked1> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc139) + %xoffset_21 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc140) + %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc141) + %xindex_22 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc141) + %xindex_23 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc141) + %xindex_24 = tt.expand_dims %xindex_22 {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4x1xi32, #blocked> loc(#loc141) + %xindex_25 = tt.splat %xoffset_21 : i32 -> tensor<4x1xi32, #blocked1> loc(#loc142) + %xindex_26 = tt.splat %xoffset_21 : i32 -> tensor<4x1xi32, #blocked> loc(#loc142) + %xindex_27 = arith.addi %xindex_25, %xindex_23 : tensor<4x1xi32, #blocked1> loc(#loc142) + %xindex_28 = arith.addi %xindex_26, %xindex_24 : tensor<4x1xi32, #blocked> loc(#loc142) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc143) + %r0_base_29 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc143) + %r0_base_30 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> loc(#loc143) + %r0_base_31 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x128xi32, #blocked1> loc(#loc143) + %r0_base_32 = tt.expand_dims %r0_base_29 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc143) + %r0_base_33 = tt.expand_dims %r0_base_30 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x128xi32, #blocked2> loc(#loc143) + %x0 = arith.remsi %xindex_27, %cst_12 : tensor<4x1xi32, #blocked1> loc(#loc144) + %x0_34 = arith.remsi %xindex_28, %cst_11 : tensor<4x1xi32, #blocked> loc(#loc144) + %x1 = arith.divsi %xindex_27, %cst_12 : tensor<4x1xi32, #blocked1> loc(#loc145) + %x1_35 = arith.divsi %xindex_28, %cst_11 : tensor<4x1xi32, #blocked> loc(#loc145) + %r0_mask = arith.cmpi slt, %r0_base_31, %cst_10 : tensor<1x128xi32, #blocked1> loc(#loc146) + %r0_mask_36 = arith.cmpi slt, %r0_base_32, %cst_9 : tensor<1x128xi32, #blocked> loc(#loc146) + %r0_mask_37 = arith.cmpi slt, %r0_base_33, %cst_15 : tensor<1x128xi32, #blocked2> loc(#loc146) + %tmp0 = arith.addi %r0_base_31, %cst_8 : tensor<1x128xi32, #blocked1> loc(#loc147) + %tmp0_38 = arith.muli %x0, %cst_6 : tensor<4x1xi32, #blocked1> loc(#loc148) + %tmp0_39 = arith.muli %x0_34, %cst_5 : tensor<4x1xi32, #blocked> loc(#loc148) + %tmp0_40 = tt.broadcast %tmp0 : tensor<1x128xi32, #blocked1> -> tensor<4x128xi32, #blocked1> loc(#loc149) + %tmp0_41 = tt.broadcast %tmp0_38 : tensor<4x1xi32, #blocked1> -> tensor<4x128xi32, #blocked1> loc(#loc149) + %tmp0_42 = tt.broadcast %tmp0_39 : tensor<4x1xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc149) + %tmp0_43 = arith.addi %tmp0_40, %tmp0_41 : tensor<4x128xi32, #blocked1> loc(#loc149) + %tmp0_44 = arith.muli %x1, %cst_4 : tensor<4x1xi32, #blocked1> loc(#loc150) + %tmp0_45 = arith.muli %x1_35, %cst_3 : tensor<4x1xi32, #blocked> loc(#loc150) + %tmp0_46 = tt.broadcast %tmp0_44 : tensor<4x1xi32, #blocked1> -> tensor<4x128xi32, #blocked1> loc(#loc151) + %tmp0_47 = tt.broadcast %tmp0_45 : tensor<4x1xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc151) + %tmp0_48 = arith.addi %tmp0_43, %tmp0_46 : tensor<4x128xi32, #blocked1> loc(#loc151) + %tmp0_49 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr, #blocked1> loc(#loc152) + %tmp0_50 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr, #blocked> loc(#loc152) + %tmp0_51 = tt.addptr %tmp0_49, %tmp0_48 : tensor<4x128x!tt.ptr, #blocked1>, tensor<4x128xi32, #blocked1> loc(#loc152) + %tmp0_52 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked1> -> tensor<4x128xi1, #blocked1> loc(#loc153) + %tmp0_53 = tt.load %tmp0_51, %tmp0_52, %cst_13 evictionPolicy = evict_last : tensor<4x128x!tt.ptr, #blocked1> loc(#loc153) + %tmp0_54 = arith.extf %tmp0_53 : tensor<4x128xbf16, #blocked1> to tensor<4x128xf32, #blocked1> loc(#loc154) + %tmp6 = tt.broadcast %r0_base_31 : tensor<1x128xi32, #blocked1> -> tensor<4x128xi32, #blocked1> loc(#loc155) + %tmp6_55 = arith.addi %tmp6, %tmp0_41 : tensor<4x128xi32, #blocked1> loc(#loc155) + %tmp6_56 = arith.addi %tmp6_55, %tmp0_46 : tensor<4x128xi32, #blocked1> loc(#loc156) + %tmp6_57 = tt.addptr %tmp0_49, %tmp6_56 : tensor<4x128x!tt.ptr, #blocked1>, tensor<4x128xi32, #blocked1> loc(#loc157) + %tmp6_58 = tt.load %tmp6_57, %tmp0_52, %cst_13 evictionPolicy = evict_last : tensor<4x128x!tt.ptr, #blocked1> loc(#loc158) + %tmp6_59 = arith.extf %tmp6_58 : tensor<4x128xbf16, #blocked1> to tensor<4x128xf32, #blocked1> loc(#loc159) + %tmp2 = arith.mulf %tmp0_54, %tmp0_54 : tensor<4x128xf32, #blocked1> loc(#loc160) + %tmp5 = arith.addf %tmp2, %cst_20 : tensor<4x128xf32, #blocked1> loc(#loc161) + %_tmp4 = arith.select %tmp0_52, %tmp5, %cst_20 : tensor<4x128xi1, #blocked1>, tensor<4x128xf32, #blocked1> loc(#loc162) + %tmp8 = arith.mulf %tmp6_59, %tmp6_59 : tensor<4x128xf32, #blocked1> loc(#loc163) + %tmp11 = arith.addf %tmp8, %cst_20 : tensor<4x128xf32, #blocked1> loc(#loc164) + %_tmp10 = arith.select %tmp0_52, %tmp11, %cst_20 : tensor<4x128xi1, #blocked1>, tensor<4x128xf32, #blocked1> loc(#loc165) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_134: f32 loc(callsite(#loc1 at #loc166)), %tmp4_135: f32 loc(callsite(#loc1 at #loc166))): + %tmp4_136 = arith.addf %tmp4_134, %tmp4_135 : f32 loc(#loc264) + tt.reduce.return %tmp4_136 : f32 loc(#loc258) + }) : (tensor<4x128xf32, #blocked1>) -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc258) + %tmp4_60 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xf32, #blocked1> loc(#loc167) + %tmp10 = "tt.reduce"(%_tmp10) <{axis = 1 : i32}> ({ + ^bb0(%tmp10_134: f32 loc(callsite(#loc1 at #loc168)), %tmp10_135: f32 loc(callsite(#loc1 at #loc168))): + %tmp10_136 = arith.addf %tmp10_134, %tmp10_135 : f32 loc(#loc265) + tt.reduce.return %tmp10_136 : f32 loc(#loc260) + }) : (tensor<4x128xf32, #blocked1>) -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc260) + %tmp10_61 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xf32, #blocked1> loc(#loc169) + %r0_3 = arith.remsi %r0_base_32, %cst_2 : tensor<1x128xi32, #blocked> loc(#loc170) + %r0_4 = arith.divsi %r0_base_32, %cst_2 : tensor<1x128xi32, #blocked> loc(#loc171) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x128x!tt.ptr, #blocked> loc(#loc172) + %tmp58_62 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x128x!tt.ptr, #blocked2> loc(#loc172) + %tmp58_63 = tt.addptr %tmp58_62, %r0_base_33 : tensor<1x128x!tt.ptr, #blocked2>, tensor<1x128xi32, #blocked2> loc(#loc172) + %tmp58_64 = tt.load %tmp58_63, %r0_mask_37, %cst_16 evictionPolicy = evict_last : tensor<1x128x!tt.ptr, #blocked2> loc(#loc173) + %tmp58_65 = arith.extf %tmp58_64 : tensor<1x128xbf16, #blocked2> to tensor<1x128xf32, #blocked2> loc(#loc174) + %tmp63 = arith.muli %x1, %cst_6 : tensor<4x1xi32, #blocked1> loc(#loc175) + %tmp63_66 = tt.broadcast %tmp63 : tensor<4x1xi32, #blocked1> -> tensor<4x128xi32, #blocked1> loc(#loc176) + %tmp63_67 = arith.addi %tmp6, %tmp63_66 : tensor<4x128xi32, #blocked1> loc(#loc176) + %tmp63_68 = tt.splat %in_ptr2 : !tt.ptr -> tensor<4x128x!tt.ptr, #blocked1> loc(#loc177) + %tmp63_69 = tt.addptr %tmp63_68, %tmp63_67 : tensor<4x128x!tt.ptr, #blocked1>, tensor<4x128xi32, #blocked1> loc(#loc177) + %tmp63_70 = tt.load %tmp63_69, %tmp0_52, %cst_20 evictionPolicy = evict_last : tensor<4x128x!tt.ptr, #blocked1> loc(#loc178) + %tmp63_71 = ttg.convert_layout %tmp63_70 : tensor<4x128xf32, #blocked1> -> tensor<4x128xf32, #blocked> loc(#loc178) + %tmp66 = tt.splat %in_ptr3 : !tt.ptr -> tensor<4x128x!tt.ptr, #blocked1> loc(#loc179) + %tmp66_72 = tt.addptr %tmp66, %tmp63_67 : tensor<4x128x!tt.ptr, #blocked1>, tensor<4x128xi32, #blocked1> loc(#loc179) + %tmp66_73 = tt.load %tmp66_72, %tmp0_52, %cst_20 evictionPolicy = evict_last : tensor<4x128x!tt.ptr, #blocked1> loc(#loc180) + %tmp66_74 = ttg.convert_layout %tmp66_73 : tensor<4x128xf32, #blocked1> -> tensor<4x128xf32, #blocked> loc(#loc180) + %tmp96 = tt.load %tmp0_51, %tmp0_52, %cst_13 evictionPolicy = evict_first : tensor<4x128x!tt.ptr, #blocked1> loc(#loc181) + %tmp96_75 = arith.extf %tmp96 : tensor<4x128xbf16, #blocked1> to tensor<4x128xf32, #blocked1> loc(#loc182) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x128x!tt.ptr, #blocked> loc(#loc183) + %tmp102_76 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x128x!tt.ptr, #blocked2> loc(#loc183) + %tmp102_77 = tt.addptr %tmp102_76, %r0_base_33 : tensor<1x128x!tt.ptr, #blocked2>, tensor<1x128xi32, #blocked2> loc(#loc183) + %tmp102_78 = tt.load %tmp102_77, %r0_mask_37, %cst_16 evictionPolicy = evict_last : tensor<1x128x!tt.ptr, #blocked2> loc(#loc184) + %tmp102_79 = arith.extf %tmp102_78 : tensor<1x128xbf16, #blocked2> to tensor<1x128xf32, #blocked2> loc(#loc185) + %tmp16 = arith.extsi %r0_3 : tensor<1x128xi32, #blocked> to tensor<1x128xi64, #blocked> loc(#loc186) + %tmp16_80 = arith.cmpi slt, %tmp16, %cst_1 : tensor<1x128xi64, #blocked> loc(#loc186) + %tmp17 = arith.muli %r0_4, %cst_2 : tensor<1x128xi32, #blocked> loc(#loc187) + %tmp17_81 = arith.addi %tmp17, %cst_0 : tensor<1x128xi32, #blocked> loc(#loc188) + %tmp17_82 = tt.broadcast %tmp17_81 : tensor<1x128xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc189) + %tmp17_83 = arith.addi %tmp17_82, %tmp0_42 : tensor<4x128xi32, #blocked> loc(#loc189) + %tmp17_84 = arith.addi %tmp17_83, %tmp0_47 : tensor<4x128xi32, #blocked> loc(#loc190) + %tmp17_85 = tt.addptr %tmp0_50, %tmp17_84 : tensor<4x128x!tt.ptr, #blocked>, tensor<4x128xi32, #blocked> loc(#loc191) + %tmp17_86 = arith.andi %r0_mask_36, %tmp16_80 : tensor<1x128xi1, #blocked> loc(#loc192) + %tmp17_87 = tt.broadcast %tmp17_86 : tensor<1x128xi1, #blocked> -> tensor<4x128xi1, #blocked> loc(#loc193) + %tmp17_88 = tt.load %tmp17_85, %tmp17_87, %cst_14 evictionPolicy = evict_last : tensor<4x128x!tt.ptr, #blocked> loc(#loc193) + %tmp17_89 = arith.extf %tmp17_88 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc194) + %tmp20 = arith.divf %tmp10_61, %cst_18 : tensor<4x1xf32, #blocked1> loc(#loc195) + %tmp22 = arith.addf %tmp20, %cst_17 : tensor<4x1xf32, #blocked1> loc(#loc196) + %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32, #blocked1>) -> tensor<4x1xf32, #blocked1> loc(#loc197) + %tmp24 = ttg.convert_layout %tmp23 : tensor<4x1xf32, #blocked1> -> tensor<4x1xf32, #blocked> loc(#loc198) + %tmp24_90 = tt.broadcast %tmp24 : tensor<4x1xf32, #blocked> -> tensor<4x128xf32, #blocked> loc(#loc198) + %tmp24_91 = tt.broadcast %tmp23 : tensor<4x1xf32, #blocked1> -> tensor<4x128xf32, #blocked1> loc(#loc198) + %tmp24_92 = arith.mulf %tmp17_89, %tmp24_90 : tensor<4x128xf32, #blocked> loc(#loc198) + %tmp25 = tt.addptr %tmp58, %tmp17_81 : tensor<1x128x!tt.ptr, #blocked>, tensor<1x128xi32, #blocked> loc(#loc199) + %tmp25_93 = tt.broadcast %tmp25 : tensor<1x128x!tt.ptr, #blocked> -> tensor<4x128x!tt.ptr, #blocked> loc(#loc199) + %tmp25_94 = tt.load %tmp25_93, %tmp17_87, %cst_14 evictionPolicy = evict_last : tensor<4x128x!tt.ptr, #blocked> loc(#loc200) + %tmp25_95 = arith.extf %tmp25_94 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc201) + %tmp27 = arith.mulf %tmp24_92, %tmp25_95 : tensor<4x128xf32, #blocked> loc(#loc202) + %tmp29 = arith.subf %cst_19, %tmp27 : tensor<4x128xf32, #blocked> loc(#loc203) + %tmp31 = tt.broadcast %tmp16_80 : tensor<1x128xi1, #blocked> -> tensor<4x128xi1, #blocked> loc(#loc204) + %tmp32 = arith.cmpi sge, %tmp16, %cst_1 : tensor<1x128xi64, #blocked> loc(#loc205) + %tmp35 = tt.broadcast %tmp17 : tensor<1x128xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc206) + %tmp35_96 = arith.addi %tmp35, %tmp0_42 : tensor<4x128xi32, #blocked> loc(#loc206) + %tmp35_97 = arith.addi %tmp35_96, %tmp0_47 : tensor<4x128xi32, #blocked> loc(#loc207) + %tmp35_98 = tt.addptr %tmp0_50, %tmp35_97 : tensor<4x128x!tt.ptr, #blocked>, tensor<4x128xi32, #blocked> loc(#loc208) + %tmp35_99 = arith.andi %r0_mask_36, %tmp32 : tensor<1x128xi1, #blocked> loc(#loc209) + %tmp35_100 = tt.broadcast %tmp35_99 : tensor<1x128xi1, #blocked> -> tensor<4x128xi1, #blocked> loc(#loc210) + %tmp35_101 = tt.load %tmp35_98, %tmp35_100, %cst_14 evictionPolicy = evict_last : tensor<4x128x!tt.ptr, #blocked> loc(#loc210) + %tmp35_102 = arith.extf %tmp35_101 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc211) + %tmp42 = arith.mulf %tmp35_102, %tmp24_90 : tensor<4x128xf32, #blocked> loc(#loc212) + %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x128x!tt.ptr, #blocked>, tensor<1x128xi32, #blocked> loc(#loc213) + %tmp43_103 = tt.broadcast %tmp43 : tensor<1x128x!tt.ptr, #blocked> -> tensor<4x128x!tt.ptr, #blocked> loc(#loc213) + %tmp43_104 = tt.load %tmp43_103, %tmp35_100, %cst_14 evictionPolicy = evict_last : tensor<4x128x!tt.ptr, #blocked> loc(#loc214) + %tmp43_105 = arith.extf %tmp43_104 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc215) + %tmp45 = arith.mulf %tmp42, %tmp43_105 : tensor<4x128xf32, #blocked> loc(#loc216) + %tmp48 = tt.broadcast %tmp32 : tensor<1x128xi1, #blocked> -> tensor<4x128xi1, #blocked> loc(#loc217) + %tmp48_106 = arith.select %tmp48, %tmp45, %cst_19 : tensor<4x128xi1, #blocked>, tensor<4x128xf32, #blocked> loc(#loc217) + %tmp49 = arith.select %tmp31, %tmp29, %tmp48_106 : tensor<4x128xi1, #blocked>, tensor<4x128xf32, #blocked> loc(#loc262) + %tmp57 = arith.mulf %tmp6_59, %tmp24_91 : tensor<4x128xf32, #blocked1> loc(#loc219) + %tmp60 = ttg.convert_layout %tmp58_65 : tensor<1x128xf32, #blocked2> -> tensor<1x128xf32, #blocked1> loc(#loc220) + %tmp60_107 = tt.broadcast %tmp60 : tensor<1x128xf32, #blocked1> -> tensor<4x128xf32, #blocked1> loc(#loc220) + %tmp60_108 = arith.mulf %tmp57, %tmp60_107 : tensor<4x128xf32, #blocked1> loc(#loc220) + %tmp64 = arith.mulf %tmp60_108, %tmp63_70 : tensor<4x128xf32, #blocked1> loc(#loc221) + %tmp64_109 = ttg.convert_layout %tmp64 : tensor<4x128xf32, #blocked1> -> tensor<4x128xf32, #blocked> loc(#loc221) + %tmp67 = arith.mulf %tmp49, %tmp66_74 : tensor<4x128xf32, #blocked> loc(#loc222) + %tmp68 = arith.addf %tmp64_109, %tmp67 : tensor<4x128xf32, #blocked> loc(#loc223) + %tmp70 = arith.addi %tmp17, %cst : tensor<1x128xi32, #blocked> loc(#loc224) + %tmp70_110 = tt.broadcast %tmp70 : tensor<1x128xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc225) + %tmp70_111 = arith.addi %tmp70_110, %tmp0_42 : tensor<4x128xi32, #blocked> loc(#loc225) + %tmp70_112 = arith.addi %tmp70_111, %tmp0_47 : tensor<4x128xi32, #blocked> loc(#loc226) + %tmp70_113 = tt.addptr %tmp0_50, %tmp70_112 : tensor<4x128x!tt.ptr, #blocked>, tensor<4x128xi32, #blocked> loc(#loc227) + %tmp70_114 = tt.load %tmp70_113, %tmp17_87, %cst_14 evictionPolicy = evict_last : tensor<4x128x!tt.ptr, #blocked> loc(#loc228) + %tmp70_115 = arith.extf %tmp70_114 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc229) + %tmp72 = arith.divf %tmp4_60, %cst_18 : tensor<4x1xf32, #blocked1> loc(#loc230) + %tmp73 = arith.addf %tmp72, %cst_17 : tensor<4x1xf32, #blocked1> loc(#loc231) + %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32, #blocked1>) -> tensor<4x1xf32, #blocked1> loc(#loc232) + %tmp75 = ttg.convert_layout %tmp74 : tensor<4x1xf32, #blocked1> -> tensor<4x1xf32, #blocked> loc(#loc233) + %tmp75_116 = tt.broadcast %tmp75 : tensor<4x1xf32, #blocked> -> tensor<4x128xf32, #blocked> loc(#loc233) + %tmp75_117 = tt.broadcast %tmp74 : tensor<4x1xf32, #blocked1> -> tensor<4x128xf32, #blocked1> loc(#loc233) + %tmp75_118 = arith.mulf %tmp70_115, %tmp75_116 : tensor<4x128xf32, #blocked> loc(#loc233) + %tmp76 = tt.addptr %tmp102, %tmp17_81 : tensor<1x128x!tt.ptr, #blocked>, tensor<1x128xi32, #blocked> loc(#loc234) + %tmp76_119 = tt.broadcast %tmp76 : tensor<1x128x!tt.ptr, #blocked> -> tensor<4x128x!tt.ptr, #blocked> loc(#loc234) + %tmp76_120 = tt.load %tmp76_119, %tmp17_87, %cst_14 evictionPolicy = evict_last : tensor<4x128x!tt.ptr, #blocked> loc(#loc235) + %tmp76_121 = arith.extf %tmp76_120 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc236) + %tmp78 = arith.mulf %tmp75_118, %tmp76_121 : tensor<4x128xf32, #blocked> loc(#loc237) + %tmp80 = arith.subf %cst_19, %tmp78 : tensor<4x128xf32, #blocked> loc(#loc238) + %tmp83 = arith.addi %tmp17, %cst_7 : tensor<1x128xi32, #blocked> loc(#loc239) + %tmp83_122 = tt.broadcast %tmp83 : tensor<1x128xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc240) + %tmp83_123 = arith.addi %tmp83_122, %tmp0_42 : tensor<4x128xi32, #blocked> loc(#loc240) + %tmp83_124 = arith.addi %tmp83_123, %tmp0_47 : tensor<4x128xi32, #blocked> loc(#loc241) + %tmp83_125 = tt.addptr %tmp0_50, %tmp83_124 : tensor<4x128x!tt.ptr, #blocked>, tensor<4x128xi32, #blocked> loc(#loc242) + %tmp83_126 = tt.load %tmp83_125, %tmp35_100, %cst_14 evictionPolicy = evict_last : tensor<4x128x!tt.ptr, #blocked> loc(#loc243) + %tmp83_127 = arith.extf %tmp83_126 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc244) + %tmp88 = arith.mulf %tmp83_127, %tmp75_116 : tensor<4x128xf32, #blocked> loc(#loc245) + %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x128x!tt.ptr, #blocked>, tensor<1x128xi32, #blocked> loc(#loc246) + %tmp89_128 = tt.broadcast %tmp89 : tensor<1x128x!tt.ptr, #blocked> -> tensor<4x128x!tt.ptr, #blocked> loc(#loc246) + %tmp89_129 = tt.load %tmp89_128, %tmp35_100, %cst_14 evictionPolicy = evict_last : tensor<4x128x!tt.ptr, #blocked> loc(#loc247) + %tmp89_130 = arith.extf %tmp89_129 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc248) + %tmp91 = arith.mulf %tmp88, %tmp89_130 : tensor<4x128xf32, #blocked> loc(#loc249) + %tmp94 = arith.select %tmp48, %tmp91, %cst_19 : tensor<4x128xi1, #blocked>, tensor<4x128xf32, #blocked> loc(#loc250) + %tmp95 = arith.select %tmp31, %tmp80, %tmp94 : tensor<4x128xi1, #blocked>, tensor<4x128xf32, #blocked> loc(#loc263) + %tmp101 = arith.mulf %tmp96_75, %tmp75_117 : tensor<4x128xf32, #blocked1> loc(#loc253) + %tmp101_131 = ttg.convert_layout %tmp101 : tensor<4x128xf32, #blocked1> -> tensor<4x128xf32, #blocked> loc(#loc253) + %tmp107 = ttg.convert_layout %tmp102_79 : tensor<1x128xf32, #blocked2> -> tensor<1x128xf32, #blocked> loc(#loc254) + %tmp104 = tt.broadcast %tmp107 : tensor<1x128xf32, #blocked> -> tensor<4x128xf32, #blocked> loc(#loc255) + %tmp104_132 = arith.mulf %tmp101_131, %tmp104 : tensor<4x128xf32, #blocked> loc(#loc255) + %tmp107_133 = arith.mulf %tmp104_132, %tmp63_71 : tensor<4x128xf32, #blocked> loc(#loc254) + %tmp109 = arith.mulf %tmp95, %tmp66_74 : tensor<4x128xf32, #blocked> loc(#loc256) + %tmp110 = arith.addf %tmp107_133, %tmp109 : tensor<4x128xf32, #blocked> loc(#loc257) + %0 = arith.muli %xindex_27, %cst_6 : tensor<4x1xi32, #blocked1> loc(#loc123) + %1 = tt.broadcast %0 : tensor<4x1xi32, #blocked1> -> tensor<4x128xi32, #blocked1> loc(#loc124) + %2 = arith.addi %tmp6, %1 : tensor<4x128xi32, #blocked1> loc(#loc124) + %3 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr, #blocked1> loc(#loc125) + %4 = tt.addptr %3, %2 : tensor<4x128x!tt.ptr, #blocked1>, tensor<4x128xi32, #blocked1> loc(#loc125) + %5 = arith.truncf %tmp68 : tensor<4x128xf32, #blocked> to tensor<4x128xbf16, #blocked> loc(#loc126) + %6 = ttg.convert_layout %5 : tensor<4x128xbf16, #blocked> -> tensor<4x128xbf16, #blocked1> loc(#loc126) + tt.store %4, %6, %tmp0_52 : tensor<4x128x!tt.ptr, #blocked1> loc(#loc126) + %7 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<4x128x!tt.ptr, #blocked1> loc(#loc127) + %8 = tt.addptr %7, %2 : tensor<4x128x!tt.ptr, #blocked1>, tensor<4x128xi32, #blocked1> loc(#loc127) + %9 = arith.truncf %tmp110 : tensor<4x128xf32, #blocked> to tensor<4x128xbf16, #blocked> loc(#loc128) + %10 = ttg.convert_layout %9 : tensor<4x128xbf16, #blocked> -> tensor<4x128xbf16, #blocked1> loc(#loc128) + tt.store %8, %10, %tmp0_52 : tensor<4x128x!tt.ptr, #blocked1> loc(#loc128) + tt.return loc(#loc129) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc29 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc31 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc139 = loc("xoffset"(#loc2)) +#loc140 = loc("xoffset"(#loc3)) +#loc141 = loc("xindex"(#loc4)) +#loc142 = loc("xindex"(#loc5)) +#loc143 = loc("r0_base"(#loc6)) +#loc144 = loc("x0"(#loc7)) +#loc145 = loc("x1"(#loc8)) +#loc146 = loc("r0_mask"(#loc9)) +#loc147 = loc("tmp0"(#loc10)) +#loc148 = loc("tmp0"(#loc11)) +#loc149 = loc("tmp0"(#loc12)) +#loc150 = loc("tmp0"(#loc13)) +#loc151 = loc("tmp0"(#loc14)) +#loc152 = loc("tmp0"(#loc15)) +#loc153 = loc("tmp0"(#loc16)) +#loc154 = loc("tmp0"(#loc17)) +#loc155 = loc("tmp6"(#loc18)) +#loc156 = loc("tmp6"(#loc19)) +#loc157 = loc("tmp6"(#loc20)) +#loc158 = loc("tmp6"(#loc21)) +#loc159 = loc("tmp6"(#loc22)) +#loc160 = loc("tmp2"(#loc23)) +#loc161 = loc("tmp5"(#loc24)) +#loc162 = loc("_tmp4"(#loc25)) +#loc163 = loc("tmp8"(#loc26)) +#loc164 = loc("tmp11"(#loc27)) +#loc165 = loc("_tmp10"(#loc28)) +#loc167 = loc("tmp4"(#loc32)) +#loc169 = loc("tmp10"(#loc34)) +#loc170 = loc("r0_3"(#loc35)) +#loc171 = loc("r0_4"(#loc36)) +#loc172 = loc("tmp58"(#loc37)) +#loc173 = loc("tmp58"(#loc38)) +#loc174 = loc("tmp58"(#loc39)) +#loc175 = loc("tmp63"(#loc40)) +#loc176 = loc("tmp63"(#loc41)) +#loc177 = loc("tmp63"(#loc42)) +#loc178 = loc("tmp63"(#loc43)) +#loc179 = loc("tmp66"(#loc44)) +#loc180 = loc("tmp66"(#loc45)) +#loc181 = loc("tmp96"(#loc46)) +#loc182 = loc("tmp96"(#loc47)) +#loc183 = loc("tmp102"(#loc48)) +#loc184 = loc("tmp102"(#loc49)) +#loc185 = loc("tmp102"(#loc50)) +#loc186 = loc("tmp16"(#loc51)) +#loc187 = loc("tmp17"(#loc52)) +#loc188 = loc("tmp17"(#loc53)) +#loc189 = loc("tmp17"(#loc54)) +#loc190 = loc("tmp17"(#loc55)) +#loc191 = loc("tmp17"(#loc56)) +#loc192 = loc("tmp17"(#loc57)) +#loc193 = loc("tmp17"(#loc58)) +#loc194 = loc("tmp17"(#loc59)) +#loc195 = loc("tmp20"(#loc60)) +#loc196 = loc("tmp22"(#loc61)) +#loc197 = loc("tmp23"(#loc62)) +#loc198 = loc("tmp24"(#loc63)) +#loc199 = loc("tmp25"(#loc64)) +#loc200 = loc("tmp25"(#loc65)) +#loc201 = loc("tmp25"(#loc66)) +#loc202 = loc("tmp27"(#loc67)) +#loc203 = loc("tmp29"(#loc68)) +#loc204 = loc("tmp31"(#loc69)) +#loc205 = loc("tmp32"(#loc70)) +#loc206 = loc("tmp35"(#loc71)) +#loc207 = loc("tmp35"(#loc72)) +#loc208 = loc("tmp35"(#loc73)) +#loc209 = loc("tmp35"(#loc74)) +#loc210 = loc("tmp35"(#loc75)) +#loc211 = loc("tmp35"(#loc76)) +#loc212 = loc("tmp42"(#loc77)) +#loc213 = loc("tmp43"(#loc78)) +#loc214 = loc("tmp43"(#loc79)) +#loc215 = loc("tmp43"(#loc80)) +#loc216 = loc("tmp45"(#loc81)) +#loc217 = loc("tmp48"(#loc82)) +#loc218 = loc("tmp49"(#loc83)) +#loc219 = loc("tmp57"(#loc84)) +#loc220 = loc("tmp60"(#loc85)) +#loc221 = loc("tmp64"(#loc86)) +#loc222 = loc("tmp67"(#loc87)) +#loc223 = loc("tmp68"(#loc88)) +#loc224 = loc("tmp70"(#loc89)) +#loc225 = loc("tmp70"(#loc90)) +#loc226 = loc("tmp70"(#loc91)) +#loc227 = loc("tmp70"(#loc92)) +#loc228 = loc("tmp70"(#loc93)) +#loc229 = loc("tmp70"(#loc94)) +#loc230 = loc("tmp72"(#loc95)) +#loc231 = loc("tmp73"(#loc96)) +#loc232 = loc("tmp74"(#loc97)) +#loc233 = loc("tmp75"(#loc98)) +#loc234 = loc("tmp76"(#loc99)) +#loc235 = loc("tmp76"(#loc100)) +#loc236 = loc("tmp76"(#loc101)) +#loc237 = loc("tmp78"(#loc102)) +#loc238 = loc("tmp80"(#loc103)) +#loc239 = loc("tmp83"(#loc104)) +#loc240 = loc("tmp83"(#loc105)) +#loc241 = loc("tmp83"(#loc106)) +#loc242 = loc("tmp83"(#loc107)) +#loc243 = loc("tmp83"(#loc108)) +#loc244 = loc("tmp83"(#loc109)) +#loc245 = loc("tmp88"(#loc110)) +#loc246 = loc("tmp89"(#loc111)) +#loc247 = loc("tmp89"(#loc112)) +#loc248 = loc("tmp89"(#loc113)) +#loc249 = loc("tmp91"(#loc114)) +#loc250 = loc("tmp94"(#loc115)) +#loc251 = loc("tmp95"(#loc116)) +#loc252 = loc("tmp82"(#loc117)) +#loc253 = loc("tmp101"(#loc118)) +#loc254 = loc("tmp107"(#loc119)) +#loc255 = loc("tmp104"(#loc120)) +#loc256 = loc("tmp109"(#loc121)) +#loc257 = loc("tmp110"(#loc122)) +#loc258 = loc(callsite(#loc29 at #loc166)) +#loc260 = loc(callsite(#loc29 at #loc168)) +#loc262 = loc(fused[#loc218, #loc204]) +#loc263 = loc(fused[#loc251, #loc252]) +#loc264 = loc(callsite(#loc31 at #loc258)) +#loc265 = loc(callsite(#loc31 at #loc260)) diff --git a/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..f12e63b8c007fc3a601413263d614941e61bf54b --- /dev/null +++ b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir @@ -0,0 +1,457 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc1 = loc(unknown) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc132 = loc("in_out_ptr0"(#loc)) +#loc133 = loc("in_out_ptr1"(#loc)) +#loc134 = loc("in_ptr0"(#loc)) +#loc135 = loc("in_ptr1"(#loc)) +#loc136 = loc("in_ptr2"(#loc)) +#loc137 = loc("in_ptr3"(#loc)) +#loc138 = loc("in_ptr4"(#loc)) +#loc139 = loc("xnumel"(#loc)) +#loc140 = loc("r0_numel"(#loc)) +#loc170 = loc("tmp4"(#loc32)) +#loc172 = loc("tmp10"(#loc35)) +#loc263 = loc(callsite(#loc1 at #loc170)) +#loc265 = loc(callsite(#loc1 at #loc172)) +module { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<1x128xbf16> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<4x128xbf16> loc(#loc1) + %cst_1 = arith.constant dense<4097> : tensor<1x128xi32> loc(#loc1) + %cst_2 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc1) + %cst_3 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc1) + %cst_5 = arith.constant dense<1> : tensor<1x128xi64> loc(#loc1) + %cst_6 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc1) + %cst_7 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc1) + %cst_8 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc1) + %cst_9 = arith.constant dense<4096> : tensor<1x128xi32> loc(#loc1) + %cst_10 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc1) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc1) + %cst_12 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc141) + %xoffset_13 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc142) + %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc143) + %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc144) + %xindex_15 = tt.splat %xoffset_13 : i32 -> tensor<4x1xi32> loc(#loc145) + %xindex_16 = arith.addi %xindex_15, %xindex_14 : tensor<4x1xi32> loc(#loc145) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc146) + %r0_base_17 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc147) + %x0 = arith.remsi %xindex_16, %cst_12 : tensor<4x1xi32> loc(#loc148) + %x1 = arith.divsi %xindex_16, %cst_12 : tensor<4x1xi32> loc(#loc149) + %r0_mask = arith.cmpi slt, %r0_base_17, %cst_10 : tensor<1x128xi32> loc(#loc150) + %tmp0 = arith.addi %r0_base_17, %cst_9 : tensor<1x128xi32> loc(#loc151) + %tmp0_18 = arith.muli %x0, %cst_8 : tensor<4x1xi32> loc(#loc152) + %tmp0_19 = tt.broadcast %tmp0 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc153) + %tmp0_20 = tt.broadcast %tmp0_18 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc153) + %tmp0_21 = arith.addi %tmp0_19, %tmp0_20 : tensor<4x128xi32> loc(#loc153) + %tmp0_22 = arith.muli %x1, %cst_7 : tensor<4x1xi32> loc(#loc154) + %tmp0_23 = tt.broadcast %tmp0_22 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc155) + %tmp0_24 = arith.addi %tmp0_21, %tmp0_23 : tensor<4x128xi32> loc(#loc155) + %tmp0_25 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc156) + %tmp0_26 = tt.addptr %tmp0_25, %tmp0_24 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc156) + %tmp0_27 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc157) + %tmp0_28 = tt.load %tmp0_26, %tmp0_27, %cst_0 evictionPolicy = evict_last : tensor<4x128x!tt.ptr> loc(#loc157) + %tmp0_29 = arith.extf %tmp0_28 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc158) + %tmp6 = tt.broadcast %r0_base_17 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc159) + %tmp6_30 = arith.addi %tmp6, %tmp0_20 : tensor<4x128xi32> loc(#loc159) + %tmp6_31 = arith.addi %tmp6_30, %tmp0_23 : tensor<4x128xi32> loc(#loc160) + %tmp6_32 = tt.addptr %tmp0_25, %tmp6_31 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc161) + %tmp6_33 = tt.load %tmp6_32, %tmp0_27, %cst_0 evictionPolicy = evict_last : tensor<4x128x!tt.ptr> loc(#loc162) + %tmp6_34 = arith.extf %tmp6_33 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc163) + %tmp2 = arith.mulf %tmp0_29, %tmp0_29 : tensor<4x128xf32> loc(#loc164) + %tmp5 = arith.addf %tmp2, %cst_11 : tensor<4x128xf32> loc(#loc165) + %_tmp4 = arith.select %tmp0_27, %tmp5, %cst_11 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc166) + %tmp8 = arith.mulf %tmp6_34, %tmp6_34 : tensor<4x128xf32> loc(#loc167) + %tmp11 = arith.addf %tmp8, %cst_11 : tensor<4x128xf32> loc(#loc168) + %_tmp10 = arith.select %tmp0_27, %tmp11, %cst_11 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc169) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_98: f32 loc(callsite(#loc1 at #loc170)), %tmp4_99: f32 loc(callsite(#loc1 at #loc170))): + %tmp4_100 = arith.addf %tmp4_98, %tmp4_99 : f32 loc(#loc266) + tt.reduce.return %tmp4_100 : f32 loc(#loc262) + }) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc262) + %tmp4_35 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc171) + %tmp10 = "tt.reduce"(%_tmp10) <{axis = 1 : i32}> ({ + ^bb0(%tmp10_98: f32 loc(callsite(#loc1 at #loc172)), %tmp10_99: f32 loc(callsite(#loc1 at #loc172))): + %tmp10_100 = arith.addf %tmp10_98, %tmp10_99 : f32 loc(#loc267) + tt.reduce.return %tmp10_100 : f32 loc(#loc264) + }) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc264) + %tmp10_36 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc173) + %r0_3 = arith.remsi %r0_base_17, %cst_6 : tensor<1x128xi32> loc(#loc174) + %r0_4 = arith.divsi %r0_base_17, %cst_6 : tensor<1x128xi32> loc(#loc175) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc176) + %tmp58_37 = tt.addptr %tmp58, %r0_base_17 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc176) + %tmp58_38 = tt.load %tmp58_37, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x128x!tt.ptr> loc(#loc177) + %tmp58_39 = arith.extf %tmp58_38 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc178) + %tmp63 = arith.muli %x1, %cst_8 : tensor<4x1xi32> loc(#loc179) + %tmp63_40 = tt.broadcast %tmp63 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc180) + %tmp63_41 = arith.addi %tmp6, %tmp63_40 : tensor<4x128xi32> loc(#loc180) + %tmp63_42 = tt.splat %in_ptr2 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc181) + %tmp63_43 = tt.addptr %tmp63_42, %tmp63_41 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc181) + %tmp63_44 = tt.load %tmp63_43, %tmp0_27, %cst_11 evictionPolicy = evict_last : tensor<4x128x!tt.ptr> loc(#loc182) + %tmp66 = tt.splat %in_ptr3 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc183) + %tmp66_45 = tt.addptr %tmp66, %tmp63_41 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc183) + %tmp66_46 = tt.load %tmp66_45, %tmp0_27, %cst_11 evictionPolicy = evict_last : tensor<4x128x!tt.ptr> loc(#loc184) + %tmp96 = tt.load %tmp0_26, %tmp0_27, %cst_0 evictionPolicy = evict_first : tensor<4x128x!tt.ptr> loc(#loc185) + %tmp96_47 = arith.extf %tmp96 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc186) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc187) + %tmp102_48 = tt.addptr %tmp102, %r0_base_17 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc187) + %tmp102_49 = tt.load %tmp102_48, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x128x!tt.ptr> loc(#loc188) + %tmp102_50 = arith.extf %tmp102_49 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc189) + %tmp16 = arith.extsi %r0_3 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc190) + %tmp16_51 = arith.cmpi slt, %tmp16, %cst_5 : tensor<1x128xi64> loc(#loc190) + %tmp17 = arith.muli %r0_4, %cst_6 : tensor<1x128xi32> loc(#loc191) + %tmp17_52 = arith.addi %tmp17, %cst_4 : tensor<1x128xi32> loc(#loc192) + %tmp17_53 = tt.broadcast %tmp17_52 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc193) + %tmp17_54 = arith.addi %tmp17_53, %tmp0_20 : tensor<4x128xi32> loc(#loc193) + %tmp17_55 = arith.addi %tmp17_54, %tmp0_23 : tensor<4x128xi32> loc(#loc194) + %tmp17_56 = tt.addptr %tmp0_25, %tmp17_55 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc195) + %tmp17_57 = arith.andi %r0_mask, %tmp16_51 : tensor<1x128xi1> loc(#loc196) + %tmp17_58 = tt.broadcast %tmp17_57 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc197) + %tmp17_59 = tt.load %tmp17_56, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<4x128x!tt.ptr> loc(#loc197) + %tmp17_60 = arith.extf %tmp17_59 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc198) + %tmp20 = arith.divf %tmp10_36, %cst_3 : tensor<4x1xf32> loc(#loc199) + %tmp22 = arith.addf %tmp20, %cst_2 : tensor<4x1xf32> loc(#loc200) + %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc201) + %tmp24 = tt.broadcast %tmp23 : tensor<4x1xf32> -> tensor<4x128xf32> loc(#loc202) + %tmp24_61 = arith.mulf %tmp17_60, %tmp24 : tensor<4x128xf32> loc(#loc202) + %tmp25 = tt.addptr %tmp58, %tmp17_52 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc203) + %tmp25_62 = tt.broadcast %tmp25 : tensor<1x128x!tt.ptr> -> tensor<4x128x!tt.ptr> loc(#loc203) + %tmp25_63 = tt.load %tmp25_62, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<4x128x!tt.ptr> loc(#loc204) + %tmp25_64 = arith.extf %tmp25_63 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc205) + %tmp27 = arith.mulf %tmp24_61, %tmp25_64 : tensor<4x128xf32> loc(#loc206) + %tmp29 = arith.subf %cst_11, %tmp27 : tensor<4x128xf32> loc(#loc207) + %tmp31 = tt.broadcast %tmp16_51 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc208) + %tmp31_65 = arith.select %tmp31, %tmp29, %cst_11 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc208) + %tmp32 = arith.cmpi sge, %tmp16, %cst_5 : tensor<1x128xi64> loc(#loc209) + %tmp35 = tt.broadcast %tmp17 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc210) + %tmp35_66 = arith.addi %tmp35, %tmp0_20 : tensor<4x128xi32> loc(#loc210) + %tmp35_67 = arith.addi %tmp35_66, %tmp0_23 : tensor<4x128xi32> loc(#loc211) + %tmp35_68 = tt.addptr %tmp0_25, %tmp35_67 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc212) + %tmp35_69 = arith.andi %r0_mask, %tmp32 : tensor<1x128xi1> loc(#loc213) + %tmp35_70 = tt.broadcast %tmp35_69 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc214) + %tmp35_71 = tt.load %tmp35_68, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<4x128x!tt.ptr> loc(#loc214) + %tmp35_72 = arith.extf %tmp35_71 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc215) + %tmp42 = arith.mulf %tmp35_72, %tmp24 : tensor<4x128xf32> loc(#loc216) + %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc217) + %tmp43_73 = tt.broadcast %tmp43 : tensor<1x128x!tt.ptr> -> tensor<4x128x!tt.ptr> loc(#loc217) + %tmp43_74 = tt.load %tmp43_73, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<4x128x!tt.ptr> loc(#loc218) + %tmp43_75 = arith.extf %tmp43_74 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc219) + %tmp45 = arith.mulf %tmp42, %tmp43_75 : tensor<4x128xf32> loc(#loc220) + %tmp48 = tt.broadcast %tmp32 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc221) + %tmp48_76 = arith.select %tmp48, %tmp45, %cst_11 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc221) + %tmp49 = arith.select %tmp31, %tmp31_65, %tmp48_76 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc222) + %tmp57 = arith.mulf %tmp6_34, %tmp24 : tensor<4x128xf32> loc(#loc223) + %tmp60 = tt.broadcast %tmp58_39 : tensor<1x128xf32> -> tensor<4x128xf32> loc(#loc224) + %tmp60_77 = arith.mulf %tmp57, %tmp60 : tensor<4x128xf32> loc(#loc224) + %tmp64 = arith.mulf %tmp60_77, %tmp63_44 : tensor<4x128xf32> loc(#loc225) + %tmp67 = arith.mulf %tmp49, %tmp66_46 : tensor<4x128xf32> loc(#loc226) + %tmp68 = arith.addf %tmp64, %tmp67 : tensor<4x128xf32> loc(#loc227) + %tmp70 = arith.addi %tmp17, %cst_1 : tensor<1x128xi32> loc(#loc228) + %tmp70_78 = tt.broadcast %tmp70 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc229) + %tmp70_79 = arith.addi %tmp70_78, %tmp0_20 : tensor<4x128xi32> loc(#loc229) + %tmp70_80 = arith.addi %tmp70_79, %tmp0_23 : tensor<4x128xi32> loc(#loc230) + %tmp70_81 = tt.addptr %tmp0_25, %tmp70_80 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc231) + %tmp70_82 = tt.load %tmp70_81, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<4x128x!tt.ptr> loc(#loc232) + %tmp70_83 = arith.extf %tmp70_82 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc233) + %tmp72 = arith.divf %tmp4_35, %cst_3 : tensor<4x1xf32> loc(#loc234) + %tmp73 = arith.addf %tmp72, %cst_2 : tensor<4x1xf32> loc(#loc235) + %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc236) + %tmp75 = tt.broadcast %tmp74 : tensor<4x1xf32> -> tensor<4x128xf32> loc(#loc237) + %tmp75_84 = arith.mulf %tmp70_83, %tmp75 : tensor<4x128xf32> loc(#loc237) + %tmp76 = tt.addptr %tmp102, %tmp17_52 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc238) + %tmp76_85 = tt.broadcast %tmp76 : tensor<1x128x!tt.ptr> -> tensor<4x128x!tt.ptr> loc(#loc238) + %tmp76_86 = tt.load %tmp76_85, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<4x128x!tt.ptr> loc(#loc239) + %tmp76_87 = arith.extf %tmp76_86 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc240) + %tmp78 = arith.mulf %tmp75_84, %tmp76_87 : tensor<4x128xf32> loc(#loc241) + %tmp80 = arith.subf %cst_11, %tmp78 : tensor<4x128xf32> loc(#loc242) + %tmp82 = arith.select %tmp31, %tmp80, %cst_11 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc243) + %tmp83 = arith.addi %tmp17, %cst_9 : tensor<1x128xi32> loc(#loc244) + %tmp83_88 = tt.broadcast %tmp83 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc245) + %tmp83_89 = arith.addi %tmp83_88, %tmp0_20 : tensor<4x128xi32> loc(#loc245) + %tmp83_90 = arith.addi %tmp83_89, %tmp0_23 : tensor<4x128xi32> loc(#loc246) + %tmp83_91 = tt.addptr %tmp0_25, %tmp83_90 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc247) + %tmp83_92 = tt.load %tmp83_91, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<4x128x!tt.ptr> loc(#loc248) + %tmp83_93 = arith.extf %tmp83_92 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc249) + %tmp88 = arith.mulf %tmp83_93, %tmp75 : tensor<4x128xf32> loc(#loc250) + %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc251) + %tmp89_94 = tt.broadcast %tmp89 : tensor<1x128x!tt.ptr> -> tensor<4x128x!tt.ptr> loc(#loc251) + %tmp89_95 = tt.load %tmp89_94, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<4x128x!tt.ptr> loc(#loc252) + %tmp89_96 = arith.extf %tmp89_95 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc253) + %tmp91 = arith.mulf %tmp88, %tmp89_96 : tensor<4x128xf32> loc(#loc254) + %tmp94 = arith.select %tmp48, %tmp91, %cst_11 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc255) + %tmp95 = arith.select %tmp31, %tmp82, %tmp94 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc256) + %tmp101 = arith.mulf %tmp96_47, %tmp75 : tensor<4x128xf32> loc(#loc257) + %tmp104 = tt.broadcast %tmp102_50 : tensor<1x128xf32> -> tensor<4x128xf32> loc(#loc258) + %tmp104_97 = arith.mulf %tmp101, %tmp104 : tensor<4x128xf32> loc(#loc258) + %tmp107 = arith.mulf %tmp104_97, %tmp63_44 : tensor<4x128xf32> loc(#loc259) + %tmp109 = arith.mulf %tmp95, %tmp66_46 : tensor<4x128xf32> loc(#loc260) + %tmp110 = arith.addf %tmp107, %tmp109 : tensor<4x128xf32> loc(#loc261) + %0 = arith.muli %xindex_16, %cst_8 : tensor<4x1xi32> loc(#loc125) + %1 = tt.broadcast %0 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc126) + %2 = arith.addi %tmp6, %1 : tensor<4x128xi32> loc(#loc126) + %3 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc127) + %4 = tt.addptr %3, %2 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc127) + %5 = arith.truncf %tmp68 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc128) + tt.store %4, %5, %tmp0_27 : tensor<4x128x!tt.ptr> loc(#loc128) + %6 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc129) + %7 = tt.addptr %6, %2 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc129) + %8 = arith.truncf %tmp110 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc130) + tt.store %7, %8, %tmp0_27 : tensor<4x128x!tt.ptr> loc(#loc130) + tt.return loc(#loc131) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc31 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc141 = loc("xoffset"(#loc2)) +#loc142 = loc("xoffset"(#loc3)) +#loc143 = loc("xindex"(#loc4)) +#loc144 = loc("xindex"(#loc5)) +#loc145 = loc("xindex"(#loc6)) +#loc146 = loc("r0_base"(#loc7)) +#loc147 = loc("r0_base"(#loc8)) +#loc148 = loc("x0"(#loc9)) +#loc149 = loc("x1"(#loc10)) +#loc150 = loc("r0_mask"(#loc11)) +#loc151 = loc("tmp0"(#loc12)) +#loc152 = loc("tmp0"(#loc13)) +#loc153 = loc("tmp0"(#loc14)) +#loc154 = loc("tmp0"(#loc15)) +#loc155 = loc("tmp0"(#loc16)) +#loc156 = loc("tmp0"(#loc17)) +#loc157 = loc("tmp0"(#loc18)) +#loc158 = loc("tmp0"(#loc19)) +#loc159 = loc("tmp6"(#loc20)) +#loc160 = loc("tmp6"(#loc21)) +#loc161 = loc("tmp6"(#loc22)) +#loc162 = loc("tmp6"(#loc23)) +#loc163 = loc("tmp6"(#loc24)) +#loc164 = loc("tmp2"(#loc25)) +#loc165 = loc("tmp5"(#loc26)) +#loc166 = loc("_tmp4"(#loc27)) +#loc167 = loc("tmp8"(#loc28)) +#loc168 = loc("tmp11"(#loc29)) +#loc169 = loc("_tmp10"(#loc30)) +#loc171 = loc("tmp4"(#loc34)) +#loc173 = loc("tmp10"(#loc36)) +#loc174 = loc("r0_3"(#loc37)) +#loc175 = loc("r0_4"(#loc38)) +#loc176 = loc("tmp58"(#loc39)) +#loc177 = loc("tmp58"(#loc40)) +#loc178 = loc("tmp58"(#loc41)) +#loc179 = loc("tmp63"(#loc42)) +#loc180 = loc("tmp63"(#loc43)) +#loc181 = loc("tmp63"(#loc44)) +#loc182 = loc("tmp63"(#loc45)) +#loc183 = loc("tmp66"(#loc46)) +#loc184 = loc("tmp66"(#loc47)) +#loc185 = loc("tmp96"(#loc48)) +#loc186 = loc("tmp96"(#loc49)) +#loc187 = loc("tmp102"(#loc50)) +#loc188 = loc("tmp102"(#loc51)) +#loc189 = loc("tmp102"(#loc52)) +#loc190 = loc("tmp16"(#loc53)) +#loc191 = loc("tmp17"(#loc54)) +#loc192 = loc("tmp17"(#loc55)) +#loc193 = loc("tmp17"(#loc56)) +#loc194 = loc("tmp17"(#loc57)) +#loc195 = loc("tmp17"(#loc58)) +#loc196 = loc("tmp17"(#loc59)) +#loc197 = loc("tmp17"(#loc60)) +#loc198 = loc("tmp17"(#loc61)) +#loc199 = loc("tmp20"(#loc62)) +#loc200 = loc("tmp22"(#loc63)) +#loc201 = loc("tmp23"(#loc64)) +#loc202 = loc("tmp24"(#loc65)) +#loc203 = loc("tmp25"(#loc66)) +#loc204 = loc("tmp25"(#loc67)) +#loc205 = loc("tmp25"(#loc68)) +#loc206 = loc("tmp27"(#loc69)) +#loc207 = loc("tmp29"(#loc70)) +#loc208 = loc("tmp31"(#loc71)) +#loc209 = loc("tmp32"(#loc72)) +#loc210 = loc("tmp35"(#loc73)) +#loc211 = loc("tmp35"(#loc74)) +#loc212 = loc("tmp35"(#loc75)) +#loc213 = loc("tmp35"(#loc76)) +#loc214 = loc("tmp35"(#loc77)) +#loc215 = loc("tmp35"(#loc78)) +#loc216 = loc("tmp42"(#loc79)) +#loc217 = loc("tmp43"(#loc80)) +#loc218 = loc("tmp43"(#loc81)) +#loc219 = loc("tmp43"(#loc82)) +#loc220 = loc("tmp45"(#loc83)) +#loc221 = loc("tmp48"(#loc84)) +#loc222 = loc("tmp49"(#loc85)) +#loc223 = loc("tmp57"(#loc86)) +#loc224 = loc("tmp60"(#loc87)) +#loc225 = loc("tmp64"(#loc88)) +#loc226 = loc("tmp67"(#loc89)) +#loc227 = loc("tmp68"(#loc90)) +#loc228 = loc("tmp70"(#loc91)) +#loc229 = loc("tmp70"(#loc92)) +#loc230 = loc("tmp70"(#loc93)) +#loc231 = loc("tmp70"(#loc94)) +#loc232 = loc("tmp70"(#loc95)) +#loc233 = loc("tmp70"(#loc96)) +#loc234 = loc("tmp72"(#loc97)) +#loc235 = loc("tmp73"(#loc98)) +#loc236 = loc("tmp74"(#loc99)) +#loc237 = loc("tmp75"(#loc100)) +#loc238 = loc("tmp76"(#loc101)) +#loc239 = loc("tmp76"(#loc102)) +#loc240 = loc("tmp76"(#loc103)) +#loc241 = loc("tmp78"(#loc104)) +#loc242 = loc("tmp80"(#loc105)) +#loc243 = loc("tmp82"(#loc106)) +#loc244 = loc("tmp83"(#loc107)) +#loc245 = loc("tmp83"(#loc108)) +#loc246 = loc("tmp83"(#loc109)) +#loc247 = loc("tmp83"(#loc110)) +#loc248 = loc("tmp83"(#loc111)) +#loc249 = loc("tmp83"(#loc112)) +#loc250 = loc("tmp88"(#loc113)) +#loc251 = loc("tmp89"(#loc114)) +#loc252 = loc("tmp89"(#loc115)) +#loc253 = loc("tmp89"(#loc116)) +#loc254 = loc("tmp91"(#loc117)) +#loc255 = loc("tmp94"(#loc118)) +#loc256 = loc("tmp95"(#loc119)) +#loc257 = loc("tmp101"(#loc120)) +#loc258 = loc("tmp104"(#loc121)) +#loc259 = loc("tmp107"(#loc122)) +#loc260 = loc("tmp109"(#loc123)) +#loc261 = loc("tmp110"(#loc124)) +#loc262 = loc(callsite(#loc31 at #loc170)) +#loc264 = loc(callsite(#loc31 at #loc172)) +#loc266 = loc(callsite(#loc33 at #loc262)) +#loc267 = loc(callsite(#loc33 at #loc264)) diff --git a/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json new file mode 100644 index 0000000000000000000000000000000000000000..87a9287a16b727ccbefa80c12edc1145ced3dc8d --- /dev/null +++ b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused__fused_rms_norm_cat_view_2.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.source", "triton_poi_fused__fused_rms_norm_cat_view_2.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.ttir", "triton_poi_fused__fused_rms_norm_cat_view_2.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir", "triton_poi_fused__fused_rms_norm_cat_view_2.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.llir", "triton_poi_fused__fused_rms_norm_cat_view_2.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.ptx", "triton_poi_fused__fused_rms_norm_cat_view_2.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.cubin", "triton_poi_fused__fused_rms_norm_cat_view_2.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.json"}} \ No newline at end of file diff --git a/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.cubin b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..693d9afb8c5a403bd215eafdff67ac38688b25a1 Binary files /dev/null and b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.cubin differ diff --git a/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.json b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2f70ae834ed89b91859a6198234a96601eb158f5 --- /dev/null +++ b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.json @@ -0,0 +1 @@ +{"hash": "8b5ac7dad97c70bc8c6fdc1abfc7b2afc9899bf6ba289559402189c0d11cd710", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 16384, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__fused_rms_norm_cat_view_2"} \ No newline at end of file diff --git a/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.llir b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..a922420883645796bcc237e1fe85fc1542c99b56 --- /dev/null +++ b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.llir @@ -0,0 +1,1329 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused__fused_rms_norm_cat_view_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !dbg !8 + %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z(), !dbg !9 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y(), !dbg !10 + %15 = mul nuw i32 %13, %14, !dbg !11 + %16 = add nuw i32 %15, %12, !dbg !12 + %17 = shl i32 %16, 6, !dbg !13 + %18 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !14 + %19 = lshr i32 %18, 3, !dbg !14 + %20 = and i32 %19, 31, !dbg !14 + %21 = shl nuw nsw i32 %18, 2, !dbg !14 + %22 = and i32 %21, 60, !dbg !14 + %23 = or disjoint i32 %17, %20, !dbg !15 + %24 = or disjoint i32 %23, 32, !dbg !15 + %25 = or disjoint i32 %17, %22, !dbg !15 + %26 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !16 + %27 = shl i32 %26, 6, !dbg !17 + %28 = and i32 %18, 7, !dbg !18 + %29 = shl nuw nsw i32 %28, 3, !dbg !18 + %30 = lshr i32 %18, 4, !dbg !18 + %31 = and i32 %30, 15, !dbg !18 + %32 = or disjoint i32 %29, %27, !dbg !19 + %33 = or disjoint i32 %31, %27, !dbg !19 + %34 = icmp slt i32 %32, 128, !dbg !20 + %35 = icmp slt i32 %33, 128, !dbg !20 + %36 = sdiv i32 %23, 32, !dbg !21 + %37 = sdiv i32 %24, 32, !dbg !21 + %38 = sdiv i32 %25, 32, !dbg !21 + %39 = mul i32 %36, 32, !dbg !22 + %.decomposed = sub i32 %23, %39, !dbg !22 + %40 = mul i32 %38, 32, !dbg !22 + %.decomposed109 = sub i32 %25, %40, !dbg !22 + %41 = icmp slt i32 %23, 8192, !dbg !23 + %42 = icmp slt i32 %25, 8192, !dbg !23 + %43 = shl nsw i32 %.decomposed, 7, !dbg !24 + %44 = add i32 %43, %32, !dbg !25 + %45 = mul i32 %36, 12288, !dbg !26 + %46 = mul i32 %37, 12288, !dbg !26 + %47 = add i32 %44, %45, !dbg !27 + %48 = add i32 %44, %46, !dbg !27 + %49 = sext i32 %47 to i64, !dbg !28 + %50 = getelementptr bfloat, ptr addrspace(1) %0, i64 %49, !dbg !28 + %51 = sext i32 %48 to i64, !dbg !28 + %52 = getelementptr bfloat, ptr addrspace(1) %0, i64 %51, !dbg !28 + %53 = and i1 %34, %41, !dbg !29 + %54 = and i1 %35, %42, !dbg !29 + %55 = icmp slt i32 %23, 8160, !dbg !30 + %56 = and i1 %34, %55, !dbg !30 + %57 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !31 + %58 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %50, i64 %57, i1 %53) #5, !dbg !31 + %59 = extractvalue { i32, i32, i32, i32 } %58, 0, !dbg !31 + %60 = extractvalue { i32, i32, i32, i32 } %58, 1, !dbg !31 + %61 = extractvalue { i32, i32, i32, i32 } %58, 2, !dbg !31 + %62 = extractvalue { i32, i32, i32, i32 } %58, 3, !dbg !31 + %63 = insertelement <2 x i32> poison, i32 %59, i64 0, !dbg !31 + %64 = insertelement <2 x i32> %63, i32 %60, i64 1, !dbg !31 + %65 = lshr <2 x i32> %64, splat (i32 16), !dbg !31 + %66 = trunc nuw <2 x i32> %65 to <2 x i16>, !dbg !31 + %67 = insertelement <2 x i32> poison, i32 %61, i64 0, !dbg !31 + %68 = insertelement <2 x i32> %67, i32 %62, i64 1, !dbg !31 + %69 = lshr <2 x i32> %68, splat (i32 16), !dbg !31 + %70 = trunc nuw <2 x i32> %69 to <2 x i16>, !dbg !31 + %71 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !31 + %72 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %52, i64 %71, i1 %56) #5, !dbg !31 + %73 = extractvalue { i32, i32, i32, i32 } %72, 0, !dbg !31 + %74 = extractvalue { i32, i32, i32, i32 } %72, 1, !dbg !31 + %75 = extractvalue { i32, i32, i32, i32 } %72, 2, !dbg !31 + %76 = extractvalue { i32, i32, i32, i32 } %72, 3, !dbg !31 + %77 = insertelement <2 x i32> poison, i32 %73, i64 0, !dbg !31 + %78 = insertelement <2 x i32> %77, i32 %74, i64 1, !dbg !31 + %79 = lshr <2 x i32> %78, splat (i32 16), !dbg !31 + %80 = trunc nuw <2 x i32> %79 to <2 x i16>, !dbg !31 + %81 = insertelement <2 x i32> poison, i32 %75, i64 0, !dbg !31 + %82 = insertelement <2 x i32> %81, i32 %76, i64 1, !dbg !31 + %83 = lshr <2 x i32> %82, splat (i32 16), !dbg !31 + %84 = trunc nuw <2 x i32> %83 to <2 x i16>, !dbg !31 + %85 = and i32 %18, 24, !dbg !32 + %86 = shl nuw nsw i32 %85, 5, !dbg !32 + %87 = shl nuw nsw i32 %28, 4, !dbg !32 + %88 = lshr exact i32 %85, 1, !dbg !32 + %89 = and i32 %18, 96, !dbg !32 + %90 = lshr exact i32 %89, 3, !dbg !32 + %91 = and i32 %18, 128, !dbg !32 + %92 = icmp eq i32 %91, 0, !dbg !32 + %93 = select i1 %92, i32 0, i32 1040, !dbg !32 + %94 = xor i32 %88, %90, !dbg !32 + %95 = or disjoint i32 %94, %86, !dbg !32 + %96 = or disjoint i32 %95, %87, !dbg !32 + %97 = xor i32 %96, %93, !dbg !32 + %98 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %97, !dbg !32 + %99 = trunc i32 %59 to i16, !dbg !32 + %100 = trunc i32 %60 to i16, !dbg !32 + %101 = insertelement <2 x i16> poison, i16 %99, i64 0, !dbg !32 + %102 = insertelement <2 x i16> %101, i16 %100, i64 1, !dbg !32 + store <2 x i16> %102, ptr addrspace(3) %98, align 4, !dbg !32 + %103 = getelementptr inbounds nuw i8, ptr addrspace(3) %98, i32 128, !dbg !32 + %104 = trunc i32 %61 to i16, !dbg !32 + %105 = trunc i32 %62 to i16, !dbg !32 + %106 = insertelement <2 x i16> poison, i16 %104, i64 0, !dbg !32 + %107 = insertelement <2 x i16> %106, i16 %105, i64 1, !dbg !32 + store <2 x i16> %107, ptr addrspace(3) %103, align 4, !dbg !32 + %108 = xor i32 %97, 4160, !dbg !32 + %109 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %108, !dbg !32 + store <2 x i16> %66, ptr addrspace(3) %109, align 4, !dbg !32 + %110 = getelementptr inbounds nuw i8, ptr addrspace(3) %109, i32 128, !dbg !32 + store <2 x i16> %70, ptr addrspace(3) %110, align 4, !dbg !32 + %111 = xor i32 %97, 2080, !dbg !32 + %112 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %111, !dbg !32 + %113 = trunc i32 %73 to i16, !dbg !32 + %114 = trunc i32 %74 to i16, !dbg !32 + %115 = insertelement <2 x i16> poison, i16 %113, i64 0, !dbg !32 + %116 = insertelement <2 x i16> %115, i16 %114, i64 1, !dbg !32 + store <2 x i16> %116, ptr addrspace(3) %112, align 4, !dbg !32 + %117 = getelementptr inbounds nuw i8, ptr addrspace(3) %112, i32 128, !dbg !32 + %118 = trunc i32 %75 to i16, !dbg !32 + %119 = trunc i32 %76 to i16, !dbg !32 + %120 = insertelement <2 x i16> poison, i16 %118, i64 0, !dbg !32 + %121 = insertelement <2 x i16> %120, i16 %119, i64 1, !dbg !32 + store <2 x i16> %121, ptr addrspace(3) %117, align 4, !dbg !32 + %122 = xor i32 %97, 6240, !dbg !32 + %123 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %122, !dbg !32 + store <2 x i16> %80, ptr addrspace(3) %123, align 4, !dbg !32 + %124 = getelementptr inbounds nuw i8, ptr addrspace(3) %123, i32 128, !dbg !32 + store <2 x i16> %84, ptr addrspace(3) %124, align 4, !dbg !32 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !32 + %125 = and i32 %18, 28, !dbg !32 + %126 = shl nuw nsw i32 %125, 8, !dbg !32 + %127 = and i32 %21, 124, !dbg !32 + %128 = and i32 %30, 2, !dbg !32 + %129 = shl nuw nsw i32 %18, 1, !dbg !32 + %130 = and i32 %129, 128, !dbg !32 + %131 = lshr exact i32 %91, 3, !dbg !32 + %132 = or disjoint i32 %128, %130, !dbg !32 + %133 = or disjoint i32 %126, %127, !dbg !32 + %134 = xor i32 %133, %131, !dbg !32 + %135 = or disjoint i32 %132, %134, !dbg !32 + %136 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %135, !dbg !32 + %137 = load bfloat, ptr addrspace(3) %136, align 2, !dbg !32 + %138 = xor i32 %135, 260, !dbg !32 + %139 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %138, !dbg !32 + %140 = load bfloat, ptr addrspace(3) %139, align 2, !dbg !32 + %141 = xor i32 %135, 520, !dbg !32 + %142 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %141, !dbg !32 + %143 = load bfloat, ptr addrspace(3) %142, align 2, !dbg !32 + %144 = xor i32 %135, 780, !dbg !32 + %145 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %144, !dbg !32 + %146 = load bfloat, ptr addrspace(3) %145, align 2, !dbg !32 + %147 = xor i32 %135, 32, !dbg !32 + %148 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %147, !dbg !32 + %149 = load bfloat, ptr addrspace(3) %148, align 2, !dbg !32 + %150 = xor i32 %135, 292, !dbg !32 + %151 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %150, !dbg !32 + %152 = load bfloat, ptr addrspace(3) %151, align 2, !dbg !32 + %153 = xor i32 %135, 552, !dbg !32 + %154 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %153, !dbg !32 + %155 = load bfloat, ptr addrspace(3) %154, align 2, !dbg !32 + %156 = xor i32 %135, 812, !dbg !32 + %157 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %156, !dbg !32 + %158 = load bfloat, ptr addrspace(3) %157, align 2, !dbg !32 + %159 = xor i32 %135, 64, !dbg !32 + %160 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %159, !dbg !32 + %161 = load bfloat, ptr addrspace(3) %160, align 2, !dbg !32 + %162 = xor i32 %135, 324, !dbg !32 + %163 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %162, !dbg !32 + %164 = load bfloat, ptr addrspace(3) %163, align 2, !dbg !32 + %165 = xor i32 %135, 584, !dbg !32 + %166 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %165, !dbg !32 + %167 = load bfloat, ptr addrspace(3) %166, align 2, !dbg !32 + %168 = xor i32 %135, 844, !dbg !32 + %169 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %168, !dbg !32 + %170 = load bfloat, ptr addrspace(3) %169, align 2, !dbg !32 + %171 = xor i32 %135, 96, !dbg !32 + %172 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %171, !dbg !32 + %173 = load bfloat, ptr addrspace(3) %172, align 2, !dbg !32 + %174 = xor i32 %135, 356, !dbg !32 + %175 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %174, !dbg !32 + %176 = load bfloat, ptr addrspace(3) %175, align 2, !dbg !32 + %177 = xor i32 %135, 616, !dbg !32 + %178 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %177, !dbg !32 + %179 = load bfloat, ptr addrspace(3) %178, align 2, !dbg !32 + %180 = xor i32 %135, 876, !dbg !32 + %181 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %180, !dbg !32 + %182 = load bfloat, ptr addrspace(3) %181, align 2, !dbg !32 + %183 = fpext bfloat %137 to float, !dbg !32 + %184 = fpext bfloat %140 to float, !dbg !32 + %185 = fpext bfloat %143 to float, !dbg !32 + %186 = fpext bfloat %146 to float, !dbg !32 + %187 = fpext bfloat %149 to float, !dbg !32 + %188 = fpext bfloat %152 to float, !dbg !32 + %189 = fpext bfloat %155 to float, !dbg !32 + %190 = fpext bfloat %158 to float, !dbg !32 + %191 = fpext bfloat %161 to float, !dbg !32 + %192 = fpext bfloat %164 to float, !dbg !32 + %193 = fpext bfloat %167 to float, !dbg !32 + %194 = fpext bfloat %170 to float, !dbg !32 + %195 = fpext bfloat %173 to float, !dbg !32 + %196 = fpext bfloat %176 to float, !dbg !32 + %197 = fpext bfloat %179 to float, !dbg !32 + %198 = fpext bfloat %182 to float, !dbg !32 + %199 = sext i32 %25 to i64, !dbg !33 + %200 = getelementptr float, ptr addrspace(1) %1, i64 %199, !dbg !33 + %201 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !34 + %202 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %200, i64 %201, i1 %54) #5, !dbg !34 + %203 = extractvalue { i32, i32, i32, i32 } %202, 0, !dbg !34 + %204 = extractvalue { i32, i32, i32, i32 } %202, 1, !dbg !34 + %205 = extractvalue { i32, i32, i32, i32 } %202, 2, !dbg !34 + %206 = extractvalue { i32, i32, i32, i32 } %202, 3, !dbg !34 + %207 = bitcast i32 %203 to float, !dbg !34 + %208 = bitcast i32 %204 to float, !dbg !34 + %209 = bitcast i32 %205 to float, !dbg !34 + %210 = bitcast i32 %206 to float, !dbg !34 + %211 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !34 + %212 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %200, i64 %211, i1 %54) #5, !dbg !34 + %213 = extractvalue { i32, i32, i32, i32 } %212, 0, !dbg !34 + %214 = extractvalue { i32, i32, i32, i32 } %212, 1, !dbg !34 + %215 = extractvalue { i32, i32, i32, i32 } %212, 2, !dbg !34 + %216 = extractvalue { i32, i32, i32, i32 } %212, 3, !dbg !34 + %217 = bitcast i32 %213 to float, !dbg !34 + %218 = bitcast i32 %214 to float, !dbg !34 + %219 = bitcast i32 %215 to float, !dbg !34 + %220 = bitcast i32 %216 to float, !dbg !34 + %221 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !34 + %222 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %200, i64 %221, i1 %54) #5, !dbg !34 + %223 = extractvalue { i32, i32, i32, i32 } %222, 0, !dbg !34 + %224 = extractvalue { i32, i32, i32, i32 } %222, 1, !dbg !34 + %225 = extractvalue { i32, i32, i32, i32 } %222, 2, !dbg !34 + %226 = extractvalue { i32, i32, i32, i32 } %222, 3, !dbg !34 + %227 = bitcast i32 %223 to float, !dbg !34 + %228 = bitcast i32 %224 to float, !dbg !34 + %229 = bitcast i32 %225 to float, !dbg !34 + %230 = bitcast i32 %226 to float, !dbg !34 + %231 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !34 + %232 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %200, i64 %231, i1 %54) #5, !dbg !34 + %233 = extractvalue { i32, i32, i32, i32 } %232, 0, !dbg !34 + %234 = extractvalue { i32, i32, i32, i32 } %232, 1, !dbg !34 + %235 = extractvalue { i32, i32, i32, i32 } %232, 2, !dbg !34 + %236 = extractvalue { i32, i32, i32, i32 } %232, 3, !dbg !34 + %237 = bitcast i32 %233 to float, !dbg !34 + %238 = bitcast i32 %234 to float, !dbg !34 + %239 = bitcast i32 %235 to float, !dbg !34 + %240 = bitcast i32 %236 to float, !dbg !34 + %241 = tail call float @llvm.nvvm.div.full(float %207, float 1.280000e+02), !dbg !35 + %242 = tail call float @llvm.nvvm.div.full(float %208, float 1.280000e+02), !dbg !35 + %243 = tail call float @llvm.nvvm.div.full(float %209, float 1.280000e+02), !dbg !35 + %244 = tail call float @llvm.nvvm.div.full(float %210, float 1.280000e+02), !dbg !35 + %245 = tail call float @llvm.nvvm.div.full(float %217, float 1.280000e+02), !dbg !35 + %246 = tail call float @llvm.nvvm.div.full(float %218, float 1.280000e+02), !dbg !35 + %247 = tail call float @llvm.nvvm.div.full(float %219, float 1.280000e+02), !dbg !35 + %248 = tail call float @llvm.nvvm.div.full(float %220, float 1.280000e+02), !dbg !35 + %249 = tail call float @llvm.nvvm.div.full(float %227, float 1.280000e+02), !dbg !35 + %250 = tail call float @llvm.nvvm.div.full(float %228, float 1.280000e+02), !dbg !35 + %251 = tail call float @llvm.nvvm.div.full(float %229, float 1.280000e+02), !dbg !35 + %252 = tail call float @llvm.nvvm.div.full(float %230, float 1.280000e+02), !dbg !35 + %253 = tail call float @llvm.nvvm.div.full(float %237, float 1.280000e+02), !dbg !35 + %254 = tail call float @llvm.nvvm.div.full(float %238, float 1.280000e+02), !dbg !35 + %255 = tail call float @llvm.nvvm.div.full(float %239, float 1.280000e+02), !dbg !35 + %256 = tail call float @llvm.nvvm.div.full(float %240, float 1.280000e+02), !dbg !35 + %257 = fadd float %241, 0x3EB0C6F7A0000000, !dbg !36 + %258 = fadd float %242, 0x3EB0C6F7A0000000, !dbg !36 + %259 = fadd float %243, 0x3EB0C6F7A0000000, !dbg !36 + %260 = fadd float %244, 0x3EB0C6F7A0000000, !dbg !36 + %261 = fadd float %245, 0x3EB0C6F7A0000000, !dbg !36 + %262 = fadd float %246, 0x3EB0C6F7A0000000, !dbg !36 + %263 = fadd float %247, 0x3EB0C6F7A0000000, !dbg !36 + %264 = fadd float %248, 0x3EB0C6F7A0000000, !dbg !36 + %265 = fadd float %249, 0x3EB0C6F7A0000000, !dbg !36 + %266 = fadd float %250, 0x3EB0C6F7A0000000, !dbg !36 + %267 = fadd float %251, 0x3EB0C6F7A0000000, !dbg !36 + %268 = fadd float %252, 0x3EB0C6F7A0000000, !dbg !36 + %269 = fadd float %253, 0x3EB0C6F7A0000000, !dbg !36 + %270 = fadd float %254, 0x3EB0C6F7A0000000, !dbg !36 + %271 = fadd float %255, 0x3EB0C6F7A0000000, !dbg !36 + %272 = fadd float %256, 0x3EB0C6F7A0000000, !dbg !36 + %273 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37 + %.not.i = icmp eq i32 %273, 0, !dbg !37 + br i1 %.not.i, label %276, label %274, !dbg !37 + +274: ; preds = %11 + %275 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %257), !dbg !37 + br label %__nv_rsqrtf.exit, !dbg !37 + +276: ; preds = %11 + %277 = tail call float @llvm.nvvm.rsqrt.approx.f(float %257), !dbg !37 + br label %__nv_rsqrtf.exit, !dbg !37 + +__nv_rsqrtf.exit: ; preds = %274, %276 + %.0.i = phi float [ %275, %274 ], [ %277, %276 ], !dbg !37 + %278 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37 + %.not.i16 = icmp eq i32 %278, 0, !dbg !37 + br i1 %.not.i16, label %281, label %279, !dbg !37 + +279: ; preds = %__nv_rsqrtf.exit + %280 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %258), !dbg !37 + br label %__nv_rsqrtf.exit18, !dbg !37 + +281: ; preds = %__nv_rsqrtf.exit + %282 = tail call float @llvm.nvvm.rsqrt.approx.f(float %258), !dbg !37 + br label %__nv_rsqrtf.exit18, !dbg !37 + +__nv_rsqrtf.exit18: ; preds = %279, %281 + %.0.i17 = phi float [ %280, %279 ], [ %282, %281 ], !dbg !37 + %283 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37 + %.not.i19 = icmp eq i32 %283, 0, !dbg !37 + br i1 %.not.i19, label %286, label %284, !dbg !37 + +284: ; preds = %__nv_rsqrtf.exit18 + %285 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %259), !dbg !37 + br label %__nv_rsqrtf.exit21, !dbg !37 + +286: ; preds = %__nv_rsqrtf.exit18 + %287 = tail call float @llvm.nvvm.rsqrt.approx.f(float %259), !dbg !37 + br label %__nv_rsqrtf.exit21, !dbg !37 + +__nv_rsqrtf.exit21: ; preds = %284, %286 + %.0.i20 = phi float [ %285, %284 ], [ %287, %286 ], !dbg !37 + %288 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37 + %.not.i22 = icmp eq i32 %288, 0, !dbg !37 + br i1 %.not.i22, label %291, label %289, !dbg !37 + +289: ; preds = %__nv_rsqrtf.exit21 + %290 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %260), !dbg !37 + br label %__nv_rsqrtf.exit24, !dbg !37 + +291: ; preds = %__nv_rsqrtf.exit21 + %292 = tail call float @llvm.nvvm.rsqrt.approx.f(float %260), !dbg !37 + br label %__nv_rsqrtf.exit24, !dbg !37 + +__nv_rsqrtf.exit24: ; preds = %289, %291 + %.0.i23 = phi float [ %290, %289 ], [ %292, %291 ], !dbg !37 + %293 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37 + %.not.i25 = icmp eq i32 %293, 0, !dbg !37 + br i1 %.not.i25, label %296, label %294, !dbg !37 + +294: ; preds = %__nv_rsqrtf.exit24 + %295 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %261), !dbg !37 + br label %__nv_rsqrtf.exit27, !dbg !37 + +296: ; preds = %__nv_rsqrtf.exit24 + %297 = tail call float @llvm.nvvm.rsqrt.approx.f(float %261), !dbg !37 + br label %__nv_rsqrtf.exit27, !dbg !37 + +__nv_rsqrtf.exit27: ; preds = %294, %296 + %.0.i26 = phi float [ %295, %294 ], [ %297, %296 ], !dbg !37 + %298 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37 + %.not.i28 = icmp eq i32 %298, 0, !dbg !37 + br i1 %.not.i28, label %301, label %299, !dbg !37 + +299: ; preds = %__nv_rsqrtf.exit27 + %300 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %262), !dbg !37 + br label %__nv_rsqrtf.exit30, !dbg !37 + +301: ; preds = %__nv_rsqrtf.exit27 + %302 = tail call float @llvm.nvvm.rsqrt.approx.f(float %262), !dbg !37 + br label %__nv_rsqrtf.exit30, !dbg !37 + +__nv_rsqrtf.exit30: ; preds = %299, %301 + %.0.i29 = phi float [ %300, %299 ], [ %302, %301 ], !dbg !37 + %303 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37 + %.not.i31 = icmp eq i32 %303, 0, !dbg !37 + br i1 %.not.i31, label %306, label %304, !dbg !37 + +304: ; preds = %__nv_rsqrtf.exit30 + %305 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %263), !dbg !37 + br label %__nv_rsqrtf.exit33, !dbg !37 + +306: ; preds = %__nv_rsqrtf.exit30 + %307 = tail call float @llvm.nvvm.rsqrt.approx.f(float %263), !dbg !37 + br label %__nv_rsqrtf.exit33, !dbg !37 + +__nv_rsqrtf.exit33: ; preds = %304, %306 + %.0.i32 = phi float [ %305, %304 ], [ %307, %306 ], !dbg !37 + %308 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37 + %.not.i34 = icmp eq i32 %308, 0, !dbg !37 + br i1 %.not.i34, label %311, label %309, !dbg !37 + +309: ; preds = %__nv_rsqrtf.exit33 + %310 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %264), !dbg !37 + br label %__nv_rsqrtf.exit36, !dbg !37 + +311: ; preds = %__nv_rsqrtf.exit33 + %312 = tail call float @llvm.nvvm.rsqrt.approx.f(float %264), !dbg !37 + br label %__nv_rsqrtf.exit36, !dbg !37 + +__nv_rsqrtf.exit36: ; preds = %309, %311 + %.0.i35 = phi float [ %310, %309 ], [ %312, %311 ], !dbg !37 + %313 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37 + %.not.i37 = icmp eq i32 %313, 0, !dbg !37 + br i1 %.not.i37, label %316, label %314, !dbg !37 + +314: ; preds = %__nv_rsqrtf.exit36 + %315 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %265), !dbg !37 + br label %__nv_rsqrtf.exit39, !dbg !37 + +316: ; preds = %__nv_rsqrtf.exit36 + %317 = tail call float @llvm.nvvm.rsqrt.approx.f(float %265), !dbg !37 + br label %__nv_rsqrtf.exit39, !dbg !37 + +__nv_rsqrtf.exit39: ; preds = %314, %316 + %.0.i38 = phi float [ %315, %314 ], [ %317, %316 ], !dbg !37 + %318 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37 + %.not.i40 = icmp eq i32 %318, 0, !dbg !37 + br i1 %.not.i40, label %321, label %319, !dbg !37 + +319: ; preds = %__nv_rsqrtf.exit39 + %320 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %266), !dbg !37 + br label %__nv_rsqrtf.exit42, !dbg !37 + +321: ; preds = %__nv_rsqrtf.exit39 + %322 = tail call float @llvm.nvvm.rsqrt.approx.f(float %266), !dbg !37 + br label %__nv_rsqrtf.exit42, !dbg !37 + +__nv_rsqrtf.exit42: ; preds = %319, %321 + %.0.i41 = phi float [ %320, %319 ], [ %322, %321 ], !dbg !37 + %323 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37 + %.not.i43 = icmp eq i32 %323, 0, !dbg !37 + br i1 %.not.i43, label %326, label %324, !dbg !37 + +324: ; preds = %__nv_rsqrtf.exit42 + %325 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %267), !dbg !37 + br label %__nv_rsqrtf.exit45, !dbg !37 + +326: ; preds = %__nv_rsqrtf.exit42 + %327 = tail call float @llvm.nvvm.rsqrt.approx.f(float %267), !dbg !37 + br label %__nv_rsqrtf.exit45, !dbg !37 + +__nv_rsqrtf.exit45: ; preds = %324, %326 + %.0.i44 = phi float [ %325, %324 ], [ %327, %326 ], !dbg !37 + %328 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37 + %.not.i46 = icmp eq i32 %328, 0, !dbg !37 + br i1 %.not.i46, label %331, label %329, !dbg !37 + +329: ; preds = %__nv_rsqrtf.exit45 + %330 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %268), !dbg !37 + br label %__nv_rsqrtf.exit48, !dbg !37 + +331: ; preds = %__nv_rsqrtf.exit45 + %332 = tail call float @llvm.nvvm.rsqrt.approx.f(float %268), !dbg !37 + br label %__nv_rsqrtf.exit48, !dbg !37 + +__nv_rsqrtf.exit48: ; preds = %329, %331 + %.0.i47 = phi float [ %330, %329 ], [ %332, %331 ], !dbg !37 + %333 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37 + %.not.i49 = icmp eq i32 %333, 0, !dbg !37 + br i1 %.not.i49, label %336, label %334, !dbg !37 + +334: ; preds = %__nv_rsqrtf.exit48 + %335 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %269), !dbg !37 + br label %__nv_rsqrtf.exit51, !dbg !37 + +336: ; preds = %__nv_rsqrtf.exit48 + %337 = tail call float @llvm.nvvm.rsqrt.approx.f(float %269), !dbg !37 + br label %__nv_rsqrtf.exit51, !dbg !37 + +__nv_rsqrtf.exit51: ; preds = %334, %336 + %.0.i50 = phi float [ %335, %334 ], [ %337, %336 ], !dbg !37 + %338 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37 + %.not.i52 = icmp eq i32 %338, 0, !dbg !37 + br i1 %.not.i52, label %341, label %339, !dbg !37 + +339: ; preds = %__nv_rsqrtf.exit51 + %340 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %270), !dbg !37 + br label %__nv_rsqrtf.exit54, !dbg !37 + +341: ; preds = %__nv_rsqrtf.exit51 + %342 = tail call float @llvm.nvvm.rsqrt.approx.f(float %270), !dbg !37 + br label %__nv_rsqrtf.exit54, !dbg !37 + +__nv_rsqrtf.exit54: ; preds = %339, %341 + %.0.i53 = phi float [ %340, %339 ], [ %342, %341 ], !dbg !37 + %343 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37 + %.not.i55 = icmp eq i32 %343, 0, !dbg !37 + br i1 %.not.i55, label %346, label %344, !dbg !37 + +344: ; preds = %__nv_rsqrtf.exit54 + %345 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %271), !dbg !37 + br label %__nv_rsqrtf.exit57, !dbg !37 + +346: ; preds = %__nv_rsqrtf.exit54 + %347 = tail call float @llvm.nvvm.rsqrt.approx.f(float %271), !dbg !37 + br label %__nv_rsqrtf.exit57, !dbg !37 + +__nv_rsqrtf.exit57: ; preds = %344, %346 + %.0.i56 = phi float [ %345, %344 ], [ %347, %346 ], !dbg !37 + %348 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37 + %.not.i58 = icmp eq i32 %348, 0, !dbg !37 + br i1 %.not.i58, label %351, label %349, !dbg !37 + +349: ; preds = %__nv_rsqrtf.exit57 + %350 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %272), !dbg !37 + br label %__nv_rsqrtf.exit60, !dbg !37 + +351: ; preds = %__nv_rsqrtf.exit57 + %352 = tail call float @llvm.nvvm.rsqrt.approx.f(float %272), !dbg !37 + br label %__nv_rsqrtf.exit60, !dbg !37 + +__nv_rsqrtf.exit60: ; preds = %349, %351 + %.0.i59 = phi float [ %350, %349 ], [ %352, %351 ], !dbg !37 + %353 = fmul float %.0.i, %183, !dbg !38 + %354 = fmul float %.0.i17, %184, !dbg !38 + %355 = fmul float %.0.i20, %185, !dbg !38 + %356 = fmul float %.0.i23, %186, !dbg !38 + %357 = fmul float %.0.i26, %187, !dbg !38 + %358 = fmul float %.0.i29, %188, !dbg !38 + %359 = fmul float %.0.i32, %189, !dbg !38 + %360 = fmul float %.0.i35, %190, !dbg !38 + %361 = fmul float %.0.i38, %191, !dbg !38 + %362 = fmul float %.0.i41, %192, !dbg !38 + %363 = fmul float %.0.i44, %193, !dbg !38 + %364 = fmul float %.0.i47, %194, !dbg !38 + %365 = fmul float %.0.i50, %195, !dbg !38 + %366 = fmul float %.0.i53, %196, !dbg !38 + %367 = fmul float %.0.i56, %197, !dbg !38 + %368 = fmul float %.0.i59, %198, !dbg !38 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !38 + %369 = shl nuw nsw i32 %125, 9, !dbg !38 + %370 = shl nuw nsw i32 %89, 2, !dbg !38 + %371 = lshr i32 %18, 1, !dbg !38 + %372 = and i32 %371, 76, !dbg !38 + %373 = or disjoint i32 %369, %87, !dbg !38 + %374 = or disjoint i32 %370, %372, !dbg !38 + %375 = xor i32 %373, %374, !dbg !38 + %376 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %375, !dbg !38 + store float %353, ptr addrspace(3) %376, align 4, !dbg !38 + %377 = xor i32 %375, 528, !dbg !38 + %378 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %377, !dbg !38 + store float %354, ptr addrspace(3) %378, align 4, !dbg !38 + %379 = xor i32 %375, 1056, !dbg !38 + %380 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %379, !dbg !38 + store float %355, ptr addrspace(3) %380, align 4, !dbg !38 + %381 = xor i32 %375, 1584, !dbg !38 + %382 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %381, !dbg !38 + store float %356, ptr addrspace(3) %382, align 4, !dbg !38 + %383 = xor i32 %375, 4, !dbg !38 + %384 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %383, !dbg !38 + store float %357, ptr addrspace(3) %384, align 4, !dbg !38 + %385 = xor i32 %375, 532, !dbg !38 + %386 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %385, !dbg !38 + store float %358, ptr addrspace(3) %386, align 4, !dbg !38 + %387 = xor i32 %375, 1060, !dbg !38 + %388 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %387, !dbg !38 + store float %359, ptr addrspace(3) %388, align 4, !dbg !38 + %389 = xor i32 %375, 1588, !dbg !38 + %390 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %389, !dbg !38 + store float %360, ptr addrspace(3) %390, align 4, !dbg !38 + %391 = xor i32 %375, 8, !dbg !38 + %392 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %391, !dbg !38 + store float %361, ptr addrspace(3) %392, align 4, !dbg !38 + %393 = xor i32 %375, 536, !dbg !38 + %394 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %393, !dbg !38 + store float %362, ptr addrspace(3) %394, align 4, !dbg !38 + %395 = xor i32 %375, 1064, !dbg !38 + %396 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %395, !dbg !38 + store float %363, ptr addrspace(3) %396, align 4, !dbg !38 + %397 = xor i32 %375, 1592, !dbg !38 + %398 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %397, !dbg !38 + store float %364, ptr addrspace(3) %398, align 4, !dbg !38 + %399 = xor i32 %375, 12, !dbg !38 + %400 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %399, !dbg !38 + store float %365, ptr addrspace(3) %400, align 4, !dbg !38 + %401 = xor i32 %375, 540, !dbg !38 + %402 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %401, !dbg !38 + store float %366, ptr addrspace(3) %402, align 4, !dbg !38 + %403 = xor i32 %375, 1068, !dbg !38 + %404 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %403, !dbg !38 + store float %367, ptr addrspace(3) %404, align 4, !dbg !38 + %405 = xor i32 %375, 1596, !dbg !38 + %406 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %405, !dbg !38 + store float %368, ptr addrspace(3) %406, align 4, !dbg !38 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !38 + %407 = shl nuw nsw i32 %18, 6, !dbg !38 + %408 = and i32 %407, 1600, !dbg !38 + %409 = and i32 %129, 60, !dbg !38 + %410 = lshr exact i32 %89, 1, !dbg !38 + %411 = select i1 %92, i32 0, i32 2112, !dbg !38 + %412 = or disjoint i32 %408, %409, !dbg !38 + %413 = or disjoint i32 %411, %410, !dbg !38 + %414 = xor i32 %413, %412, !dbg !38 + %415 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %414, !dbg !38 + %416 = load float, ptr addrspace(3) %415, align 4, !dbg !38 + %417 = getelementptr inbounds nuw i8, ptr addrspace(3) %415, i32 128, !dbg !38 + %418 = load float, ptr addrspace(3) %417, align 4, !dbg !38 + %419 = getelementptr inbounds nuw i8, ptr addrspace(3) %415, i32 256, !dbg !38 + %420 = load float, ptr addrspace(3) %419, align 4, !dbg !38 + %421 = getelementptr inbounds nuw i8, ptr addrspace(3) %415, i32 384, !dbg !38 + %422 = load float, ptr addrspace(3) %421, align 4, !dbg !38 + %423 = xor i32 %414, 8200, !dbg !38 + %424 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %423, !dbg !38 + %425 = load float, ptr addrspace(3) %424, align 4, !dbg !38 + %426 = getelementptr inbounds nuw i8, ptr addrspace(3) %424, i32 128, !dbg !38 + %427 = load float, ptr addrspace(3) %426, align 4, !dbg !38 + %428 = getelementptr inbounds nuw i8, ptr addrspace(3) %424, i32 256, !dbg !38 + %429 = load float, ptr addrspace(3) %428, align 4, !dbg !38 + %430 = getelementptr inbounds nuw i8, ptr addrspace(3) %424, i32 384, !dbg !38 + %431 = load float, ptr addrspace(3) %430, align 4, !dbg !38 + %432 = xor i32 %414, 4100, !dbg !38 + %433 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %432, !dbg !38 + %434 = load float, ptr addrspace(3) %433, align 4, !dbg !38 + %435 = getelementptr inbounds nuw i8, ptr addrspace(3) %433, i32 128, !dbg !38 + %436 = load float, ptr addrspace(3) %435, align 4, !dbg !38 + %437 = getelementptr inbounds nuw i8, ptr addrspace(3) %433, i32 256, !dbg !38 + %438 = load float, ptr addrspace(3) %437, align 4, !dbg !38 + %439 = getelementptr inbounds nuw i8, ptr addrspace(3) %433, i32 384, !dbg !38 + %440 = load float, ptr addrspace(3) %439, align 4, !dbg !38 + %441 = xor i32 %414, 12300, !dbg !38 + %442 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %441, !dbg !38 + %443 = load float, ptr addrspace(3) %442, align 4, !dbg !38 + %444 = getelementptr inbounds nuw i8, ptr addrspace(3) %442, i32 128, !dbg !38 + %445 = load float, ptr addrspace(3) %444, align 4, !dbg !38 + %446 = getelementptr inbounds nuw i8, ptr addrspace(3) %442, i32 256, !dbg !38 + %447 = load float, ptr addrspace(3) %446, align 4, !dbg !38 + %448 = getelementptr inbounds nuw i8, ptr addrspace(3) %442, i32 384, !dbg !38 + %449 = load float, ptr addrspace(3) %448, align 4, !dbg !38 + %450 = sext i32 %32 to i64, !dbg !39 + %451 = getelementptr bfloat, ptr addrspace(1) %2, i64 %450, !dbg !39 + %452 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !40 + %453 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %451, i64 %452, i1 %53) #5, !dbg !40 + %454 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !40 + %455 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %451, i64 %454, i1 %56) #5, !dbg !40 + %456 = add i32 %47, -3145728, !dbg !41 + %457 = add i32 %48, -3145728, !dbg !41 + %458 = sext i32 %456 to i64, !dbg !42 + %459 = getelementptr bfloat, ptr addrspace(1) %3, i64 %458, !dbg !42 + %460 = sext i32 %457 to i64, !dbg !42 + %461 = getelementptr bfloat, ptr addrspace(1) %3, i64 %460, !dbg !42 + %462 = add i32 %17, -8192, !dbg !43 + %463 = icmp ult i32 %462, 65536, !dbg !43 + %464 = and i1 %34, %463, !dbg !43 + %465 = add i32 %17, -8160, !dbg !43 + %466 = icmp ult i32 %465, 65568, !dbg !43 + %467 = and i1 %34, %466, !dbg !43 + %468 = and i1 %35, %463, !dbg !43 + %469 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !44 + %470 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %459, i64 %469, i1 %464) #5, !dbg !44 + %471 = extractvalue { i32, i32, i32, i32 } %470, 0, !dbg !44 + %472 = extractvalue { i32, i32, i32, i32 } %470, 1, !dbg !44 + %473 = extractvalue { i32, i32, i32, i32 } %470, 2, !dbg !44 + %474 = extractvalue { i32, i32, i32, i32 } %470, 3, !dbg !44 + %475 = insertelement <2 x i32> poison, i32 %471, i64 0, !dbg !44 + %476 = insertelement <2 x i32> %475, i32 %472, i64 1, !dbg !44 + %477 = lshr <2 x i32> %476, splat (i32 16), !dbg !44 + %478 = trunc nuw <2 x i32> %477 to <2 x i16>, !dbg !44 + %479 = insertelement <2 x i32> poison, i32 %473, i64 0, !dbg !44 + %480 = insertelement <2 x i32> %479, i32 %474, i64 1, !dbg !44 + %481 = lshr <2 x i32> %480, splat (i32 16), !dbg !44 + %482 = trunc nuw <2 x i32> %481 to <2 x i16>, !dbg !44 + %483 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !44 + %484 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %461, i64 %483, i1 %467) #5, !dbg !44 + %485 = extractvalue { i32, i32, i32, i32 } %484, 0, !dbg !44 + %486 = extractvalue { i32, i32, i32, i32 } %484, 1, !dbg !44 + %487 = extractvalue { i32, i32, i32, i32 } %484, 2, !dbg !44 + %488 = extractvalue { i32, i32, i32, i32 } %484, 3, !dbg !44 + %489 = insertelement <2 x i32> poison, i32 %485, i64 0, !dbg !44 + %490 = insertelement <2 x i32> %489, i32 %486, i64 1, !dbg !44 + %491 = lshr <2 x i32> %490, splat (i32 16), !dbg !44 + %492 = trunc nuw <2 x i32> %491 to <2 x i16>, !dbg !44 + %493 = insertelement <2 x i32> poison, i32 %487, i64 0, !dbg !44 + %494 = insertelement <2 x i32> %493, i32 %488, i64 1, !dbg !44 + %495 = lshr <2 x i32> %494, splat (i32 16), !dbg !44 + %496 = trunc nuw <2 x i32> %495 to <2 x i16>, !dbg !44 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45 + %497 = trunc i32 %471 to i16, !dbg !45 + %498 = trunc i32 %472 to i16, !dbg !45 + %499 = insertelement <2 x i16> poison, i16 %497, i64 0, !dbg !45 + %500 = insertelement <2 x i16> %499, i16 %498, i64 1, !dbg !45 + store <2 x i16> %500, ptr addrspace(3) %98, align 4, !dbg !45 + %501 = trunc i32 %473 to i16, !dbg !45 + %502 = trunc i32 %474 to i16, !dbg !45 + %503 = insertelement <2 x i16> poison, i16 %501, i64 0, !dbg !45 + %504 = insertelement <2 x i16> %503, i16 %502, i64 1, !dbg !45 + store <2 x i16> %504, ptr addrspace(3) %103, align 4, !dbg !45 + store <2 x i16> %478, ptr addrspace(3) %109, align 4, !dbg !45 + store <2 x i16> %482, ptr addrspace(3) %110, align 4, !dbg !45 + %505 = trunc i32 %485 to i16, !dbg !45 + %506 = trunc i32 %486 to i16, !dbg !45 + %507 = insertelement <2 x i16> poison, i16 %505, i64 0, !dbg !45 + %508 = insertelement <2 x i16> %507, i16 %506, i64 1, !dbg !45 + store <2 x i16> %508, ptr addrspace(3) %112, align 4, !dbg !45 + %509 = trunc i32 %487 to i16, !dbg !45 + %510 = trunc i32 %488 to i16, !dbg !45 + %511 = insertelement <2 x i16> poison, i16 %509, i64 0, !dbg !45 + %512 = insertelement <2 x i16> %511, i16 %510, i64 1, !dbg !45 + store <2 x i16> %512, ptr addrspace(3) %117, align 4, !dbg !45 + store <2 x i16> %492, ptr addrspace(3) %123, align 4, !dbg !45 + store <2 x i16> %496, ptr addrspace(3) %124, align 4, !dbg !45 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45 + %513 = load bfloat, ptr addrspace(3) %136, align 2, !dbg !45 + %514 = load bfloat, ptr addrspace(3) %139, align 2, !dbg !45 + %515 = load bfloat, ptr addrspace(3) %142, align 2, !dbg !45 + %516 = load bfloat, ptr addrspace(3) %145, align 2, !dbg !45 + %517 = load bfloat, ptr addrspace(3) %148, align 2, !dbg !45 + %518 = load bfloat, ptr addrspace(3) %151, align 2, !dbg !45 + %519 = load bfloat, ptr addrspace(3) %154, align 2, !dbg !45 + %520 = load bfloat, ptr addrspace(3) %157, align 2, !dbg !45 + %521 = load bfloat, ptr addrspace(3) %160, align 2, !dbg !45 + %522 = load bfloat, ptr addrspace(3) %163, align 2, !dbg !45 + %523 = load bfloat, ptr addrspace(3) %166, align 2, !dbg !45 + %524 = load bfloat, ptr addrspace(3) %169, align 2, !dbg !45 + %525 = load bfloat, ptr addrspace(3) %172, align 2, !dbg !45 + %526 = load bfloat, ptr addrspace(3) %175, align 2, !dbg !45 + %527 = load bfloat, ptr addrspace(3) %178, align 2, !dbg !45 + %528 = load bfloat, ptr addrspace(3) %181, align 2, !dbg !45 + %529 = shl nsw i32 %38, 5, !dbg !46 + %530 = add nsw i32 %.decomposed109, -8192, !dbg !46 + %531 = add i32 %530, %529, !dbg !47 + %532 = sext i32 %531 to i64, !dbg !48 + %533 = getelementptr float, ptr addrspace(1) %4, i64 %532, !dbg !48 + %534 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !49 + %535 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %533, i64 %534, i1 %468) #5, !dbg !49 + %536 = extractvalue { i32, i32, i32, i32 } %535, 0, !dbg !49 + %537 = extractvalue { i32, i32, i32, i32 } %535, 1, !dbg !49 + %538 = extractvalue { i32, i32, i32, i32 } %535, 2, !dbg !49 + %539 = extractvalue { i32, i32, i32, i32 } %535, 3, !dbg !49 + %540 = bitcast i32 %536 to float, !dbg !49 + %541 = bitcast i32 %537 to float, !dbg !49 + %542 = bitcast i32 %538 to float, !dbg !49 + %543 = bitcast i32 %539 to float, !dbg !49 + %544 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !49 + %545 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %533, i64 %544, i1 %468) #5, !dbg !49 + %546 = extractvalue { i32, i32, i32, i32 } %545, 0, !dbg !49 + %547 = extractvalue { i32, i32, i32, i32 } %545, 1, !dbg !49 + %548 = extractvalue { i32, i32, i32, i32 } %545, 2, !dbg !49 + %549 = extractvalue { i32, i32, i32, i32 } %545, 3, !dbg !49 + %550 = bitcast i32 %546 to float, !dbg !49 + %551 = bitcast i32 %547 to float, !dbg !49 + %552 = bitcast i32 %548 to float, !dbg !49 + %553 = bitcast i32 %549 to float, !dbg !49 + %554 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !49 + %555 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %533, i64 %554, i1 %468) #5, !dbg !49 + %556 = extractvalue { i32, i32, i32, i32 } %555, 0, !dbg !49 + %557 = extractvalue { i32, i32, i32, i32 } %555, 1, !dbg !49 + %558 = extractvalue { i32, i32, i32, i32 } %555, 2, !dbg !49 + %559 = extractvalue { i32, i32, i32, i32 } %555, 3, !dbg !49 + %560 = bitcast i32 %556 to float, !dbg !49 + %561 = bitcast i32 %557 to float, !dbg !49 + %562 = bitcast i32 %558 to float, !dbg !49 + %563 = bitcast i32 %559 to float, !dbg !49 + %564 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !49 + %565 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %533, i64 %564, i1 %468) #5, !dbg !49 + %566 = extractvalue { i32, i32, i32, i32 } %565, 0, !dbg !49 + %567 = extractvalue { i32, i32, i32, i32 } %565, 1, !dbg !49 + %568 = extractvalue { i32, i32, i32, i32 } %565, 2, !dbg !49 + %569 = extractvalue { i32, i32, i32, i32 } %565, 3, !dbg !49 + %570 = bitcast i32 %566 to float, !dbg !49 + %571 = bitcast i32 %567 to float, !dbg !49 + %572 = bitcast i32 %568 to float, !dbg !49 + %573 = bitcast i32 %569 to float, !dbg !49 + %574 = tail call float @llvm.nvvm.div.full(float %540, float 1.280000e+02), !dbg !50 + %575 = tail call float @llvm.nvvm.div.full(float %541, float 1.280000e+02), !dbg !50 + %576 = tail call float @llvm.nvvm.div.full(float %542, float 1.280000e+02), !dbg !50 + %577 = tail call float @llvm.nvvm.div.full(float %543, float 1.280000e+02), !dbg !50 + %578 = tail call float @llvm.nvvm.div.full(float %550, float 1.280000e+02), !dbg !50 + %579 = tail call float @llvm.nvvm.div.full(float %551, float 1.280000e+02), !dbg !50 + %580 = tail call float @llvm.nvvm.div.full(float %552, float 1.280000e+02), !dbg !50 + %581 = tail call float @llvm.nvvm.div.full(float %553, float 1.280000e+02), !dbg !50 + %582 = tail call float @llvm.nvvm.div.full(float %560, float 1.280000e+02), !dbg !50 + %583 = tail call float @llvm.nvvm.div.full(float %561, float 1.280000e+02), !dbg !50 + %584 = tail call float @llvm.nvvm.div.full(float %562, float 1.280000e+02), !dbg !50 + %585 = tail call float @llvm.nvvm.div.full(float %563, float 1.280000e+02), !dbg !50 + %586 = tail call float @llvm.nvvm.div.full(float %570, float 1.280000e+02), !dbg !50 + %587 = tail call float @llvm.nvvm.div.full(float %571, float 1.280000e+02), !dbg !50 + %588 = tail call float @llvm.nvvm.div.full(float %572, float 1.280000e+02), !dbg !50 + %589 = tail call float @llvm.nvvm.div.full(float %573, float 1.280000e+02), !dbg !50 + %590 = fadd float %574, 0x3EB0C6F7A0000000, !dbg !51 + %591 = fadd float %575, 0x3EB0C6F7A0000000, !dbg !51 + %592 = fadd float %576, 0x3EB0C6F7A0000000, !dbg !51 + %593 = fadd float %577, 0x3EB0C6F7A0000000, !dbg !51 + %594 = fadd float %578, 0x3EB0C6F7A0000000, !dbg !51 + %595 = fadd float %579, 0x3EB0C6F7A0000000, !dbg !51 + %596 = fadd float %580, 0x3EB0C6F7A0000000, !dbg !51 + %597 = fadd float %581, 0x3EB0C6F7A0000000, !dbg !51 + %598 = fadd float %582, 0x3EB0C6F7A0000000, !dbg !51 + %599 = fadd float %583, 0x3EB0C6F7A0000000, !dbg !51 + %600 = fadd float %584, 0x3EB0C6F7A0000000, !dbg !51 + %601 = fadd float %585, 0x3EB0C6F7A0000000, !dbg !51 + %602 = fadd float %586, 0x3EB0C6F7A0000000, !dbg !51 + %603 = fadd float %587, 0x3EB0C6F7A0000000, !dbg !51 + %604 = fadd float %588, 0x3EB0C6F7A0000000, !dbg !51 + %605 = fadd float %589, 0x3EB0C6F7A0000000, !dbg !51 + %606 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52 + %.not.i61 = icmp eq i32 %606, 0, !dbg !52 + br i1 %.not.i61, label %609, label %607, !dbg !52 + +607: ; preds = %__nv_rsqrtf.exit60 + %608 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %590), !dbg !52 + br label %__nv_rsqrtf.exit63, !dbg !52 + +609: ; preds = %__nv_rsqrtf.exit60 + %610 = tail call float @llvm.nvvm.rsqrt.approx.f(float %590), !dbg !52 + br label %__nv_rsqrtf.exit63, !dbg !52 + +__nv_rsqrtf.exit63: ; preds = %607, %609 + %.0.i62 = phi float [ %608, %607 ], [ %610, %609 ], !dbg !52 + %611 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52 + %.not.i64 = icmp eq i32 %611, 0, !dbg !52 + br i1 %.not.i64, label %614, label %612, !dbg !52 + +612: ; preds = %__nv_rsqrtf.exit63 + %613 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %591), !dbg !52 + br label %__nv_rsqrtf.exit66, !dbg !52 + +614: ; preds = %__nv_rsqrtf.exit63 + %615 = tail call float @llvm.nvvm.rsqrt.approx.f(float %591), !dbg !52 + br label %__nv_rsqrtf.exit66, !dbg !52 + +__nv_rsqrtf.exit66: ; preds = %612, %614 + %.0.i65 = phi float [ %613, %612 ], [ %615, %614 ], !dbg !52 + %616 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52 + %.not.i67 = icmp eq i32 %616, 0, !dbg !52 + br i1 %.not.i67, label %619, label %617, !dbg !52 + +617: ; preds = %__nv_rsqrtf.exit66 + %618 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %592), !dbg !52 + br label %__nv_rsqrtf.exit69, !dbg !52 + +619: ; preds = %__nv_rsqrtf.exit66 + %620 = tail call float @llvm.nvvm.rsqrt.approx.f(float %592), !dbg !52 + br label %__nv_rsqrtf.exit69, !dbg !52 + +__nv_rsqrtf.exit69: ; preds = %617, %619 + %.0.i68 = phi float [ %618, %617 ], [ %620, %619 ], !dbg !52 + %621 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52 + %.not.i70 = icmp eq i32 %621, 0, !dbg !52 + br i1 %.not.i70, label %624, label %622, !dbg !52 + +622: ; preds = %__nv_rsqrtf.exit69 + %623 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %593), !dbg !52 + br label %__nv_rsqrtf.exit72, !dbg !52 + +624: ; preds = %__nv_rsqrtf.exit69 + %625 = tail call float @llvm.nvvm.rsqrt.approx.f(float %593), !dbg !52 + br label %__nv_rsqrtf.exit72, !dbg !52 + +__nv_rsqrtf.exit72: ; preds = %622, %624 + %.0.i71 = phi float [ %623, %622 ], [ %625, %624 ], !dbg !52 + %626 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52 + %.not.i73 = icmp eq i32 %626, 0, !dbg !52 + br i1 %.not.i73, label %629, label %627, !dbg !52 + +627: ; preds = %__nv_rsqrtf.exit72 + %628 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %594), !dbg !52 + br label %__nv_rsqrtf.exit75, !dbg !52 + +629: ; preds = %__nv_rsqrtf.exit72 + %630 = tail call float @llvm.nvvm.rsqrt.approx.f(float %594), !dbg !52 + br label %__nv_rsqrtf.exit75, !dbg !52 + +__nv_rsqrtf.exit75: ; preds = %627, %629 + %.0.i74 = phi float [ %628, %627 ], [ %630, %629 ], !dbg !52 + %631 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52 + %.not.i76 = icmp eq i32 %631, 0, !dbg !52 + br i1 %.not.i76, label %634, label %632, !dbg !52 + +632: ; preds = %__nv_rsqrtf.exit75 + %633 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %595), !dbg !52 + br label %__nv_rsqrtf.exit78, !dbg !52 + +634: ; preds = %__nv_rsqrtf.exit75 + %635 = tail call float @llvm.nvvm.rsqrt.approx.f(float %595), !dbg !52 + br label %__nv_rsqrtf.exit78, !dbg !52 + +__nv_rsqrtf.exit78: ; preds = %632, %634 + %.0.i77 = phi float [ %633, %632 ], [ %635, %634 ], !dbg !52 + %636 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52 + %.not.i79 = icmp eq i32 %636, 0, !dbg !52 + br i1 %.not.i79, label %639, label %637, !dbg !52 + +637: ; preds = %__nv_rsqrtf.exit78 + %638 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %596), !dbg !52 + br label %__nv_rsqrtf.exit81, !dbg !52 + +639: ; preds = %__nv_rsqrtf.exit78 + %640 = tail call float @llvm.nvvm.rsqrt.approx.f(float %596), !dbg !52 + br label %__nv_rsqrtf.exit81, !dbg !52 + +__nv_rsqrtf.exit81: ; preds = %637, %639 + %.0.i80 = phi float [ %638, %637 ], [ %640, %639 ], !dbg !52 + %641 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52 + %.not.i82 = icmp eq i32 %641, 0, !dbg !52 + br i1 %.not.i82, label %644, label %642, !dbg !52 + +642: ; preds = %__nv_rsqrtf.exit81 + %643 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %597), !dbg !52 + br label %__nv_rsqrtf.exit84, !dbg !52 + +644: ; preds = %__nv_rsqrtf.exit81 + %645 = tail call float @llvm.nvvm.rsqrt.approx.f(float %597), !dbg !52 + br label %__nv_rsqrtf.exit84, !dbg !52 + +__nv_rsqrtf.exit84: ; preds = %642, %644 + %.0.i83 = phi float [ %643, %642 ], [ %645, %644 ], !dbg !52 + %646 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52 + %.not.i85 = icmp eq i32 %646, 0, !dbg !52 + br i1 %.not.i85, label %649, label %647, !dbg !52 + +647: ; preds = %__nv_rsqrtf.exit84 + %648 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %598), !dbg !52 + br label %__nv_rsqrtf.exit87, !dbg !52 + +649: ; preds = %__nv_rsqrtf.exit84 + %650 = tail call float @llvm.nvvm.rsqrt.approx.f(float %598), !dbg !52 + br label %__nv_rsqrtf.exit87, !dbg !52 + +__nv_rsqrtf.exit87: ; preds = %647, %649 + %.0.i86 = phi float [ %648, %647 ], [ %650, %649 ], !dbg !52 + %651 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52 + %.not.i88 = icmp eq i32 %651, 0, !dbg !52 + br i1 %.not.i88, label %654, label %652, !dbg !52 + +652: ; preds = %__nv_rsqrtf.exit87 + %653 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %599), !dbg !52 + br label %__nv_rsqrtf.exit90, !dbg !52 + +654: ; preds = %__nv_rsqrtf.exit87 + %655 = tail call float @llvm.nvvm.rsqrt.approx.f(float %599), !dbg !52 + br label %__nv_rsqrtf.exit90, !dbg !52 + +__nv_rsqrtf.exit90: ; preds = %652, %654 + %.0.i89 = phi float [ %653, %652 ], [ %655, %654 ], !dbg !52 + %656 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52 + %.not.i91 = icmp eq i32 %656, 0, !dbg !52 + br i1 %.not.i91, label %659, label %657, !dbg !52 + +657: ; preds = %__nv_rsqrtf.exit90 + %658 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %600), !dbg !52 + br label %__nv_rsqrtf.exit93, !dbg !52 + +659: ; preds = %__nv_rsqrtf.exit90 + %660 = tail call float @llvm.nvvm.rsqrt.approx.f(float %600), !dbg !52 + br label %__nv_rsqrtf.exit93, !dbg !52 + +__nv_rsqrtf.exit93: ; preds = %657, %659 + %.0.i92 = phi float [ %658, %657 ], [ %660, %659 ], !dbg !52 + %661 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52 + %.not.i94 = icmp eq i32 %661, 0, !dbg !52 + br i1 %.not.i94, label %664, label %662, !dbg !52 + +662: ; preds = %__nv_rsqrtf.exit93 + %663 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %601), !dbg !52 + br label %__nv_rsqrtf.exit96, !dbg !52 + +664: ; preds = %__nv_rsqrtf.exit93 + %665 = tail call float @llvm.nvvm.rsqrt.approx.f(float %601), !dbg !52 + br label %__nv_rsqrtf.exit96, !dbg !52 + +__nv_rsqrtf.exit96: ; preds = %662, %664 + %.0.i95 = phi float [ %663, %662 ], [ %665, %664 ], !dbg !52 + %666 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52 + %.not.i97 = icmp eq i32 %666, 0, !dbg !52 + br i1 %.not.i97, label %669, label %667, !dbg !52 + +667: ; preds = %__nv_rsqrtf.exit96 + %668 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %602), !dbg !52 + br label %__nv_rsqrtf.exit99, !dbg !52 + +669: ; preds = %__nv_rsqrtf.exit96 + %670 = tail call float @llvm.nvvm.rsqrt.approx.f(float %602), !dbg !52 + br label %__nv_rsqrtf.exit99, !dbg !52 + +__nv_rsqrtf.exit99: ; preds = %667, %669 + %.0.i98 = phi float [ %668, %667 ], [ %670, %669 ], !dbg !52 + %671 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52 + %.not.i100 = icmp eq i32 %671, 0, !dbg !52 + br i1 %.not.i100, label %674, label %672, !dbg !52 + +672: ; preds = %__nv_rsqrtf.exit99 + %673 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %603), !dbg !52 + br label %__nv_rsqrtf.exit102, !dbg !52 + +674: ; preds = %__nv_rsqrtf.exit99 + %675 = tail call float @llvm.nvvm.rsqrt.approx.f(float %603), !dbg !52 + br label %__nv_rsqrtf.exit102, !dbg !52 + +__nv_rsqrtf.exit102: ; preds = %672, %674 + %.0.i101 = phi float [ %673, %672 ], [ %675, %674 ], !dbg !52 + %676 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52 + %.not.i103 = icmp eq i32 %676, 0, !dbg !52 + br i1 %.not.i103, label %679, label %677, !dbg !52 + +677: ; preds = %__nv_rsqrtf.exit102 + %678 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %604), !dbg !52 + br label %__nv_rsqrtf.exit105, !dbg !52 + +679: ; preds = %__nv_rsqrtf.exit102 + %680 = tail call float @llvm.nvvm.rsqrt.approx.f(float %604), !dbg !52 + br label %__nv_rsqrtf.exit105, !dbg !52 + +__nv_rsqrtf.exit105: ; preds = %677, %679 + %.0.i104 = phi float [ %678, %677 ], [ %680, %679 ], !dbg !52 + %681 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52 + %.not.i106 = icmp eq i32 %681, 0, !dbg !52 + br i1 %.not.i106, label %684, label %682, !dbg !52 + +682: ; preds = %__nv_rsqrtf.exit105 + %683 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %605), !dbg !52 + br label %__nv_rsqrtf.exit108, !dbg !52 + +684: ; preds = %__nv_rsqrtf.exit105 + %685 = tail call float @llvm.nvvm.rsqrt.approx.f(float %605), !dbg !52 + br label %__nv_rsqrtf.exit108, !dbg !52 + +__nv_rsqrtf.exit108: ; preds = %682, %684 + %.0.i107 = phi float [ %683, %682 ], [ %685, %684 ], !dbg !52 + %686 = icmp slt i32 %23, 73728, !dbg !53 + %687 = icmp slt i32 %24, 8192, !dbg !23 + %688 = fpext bfloat %528 to float, !dbg !45 + %689 = fpext bfloat %527 to float, !dbg !45 + %690 = fpext bfloat %526 to float, !dbg !45 + %691 = fpext bfloat %525 to float, !dbg !45 + %692 = fpext bfloat %524 to float, !dbg !45 + %693 = fpext bfloat %523 to float, !dbg !45 + %694 = fpext bfloat %522 to float, !dbg !45 + %695 = fpext bfloat %521 to float, !dbg !45 + %696 = fpext bfloat %520 to float, !dbg !45 + %697 = fpext bfloat %519 to float, !dbg !45 + %698 = fpext bfloat %518 to float, !dbg !45 + %699 = fpext bfloat %517 to float, !dbg !45 + %700 = fpext bfloat %516 to float, !dbg !45 + %701 = fpext bfloat %515 to float, !dbg !45 + %702 = fpext bfloat %514 to float, !dbg !45 + %703 = fpext bfloat %513 to float, !dbg !45 + %704 = extractvalue { i32, i32, i32, i32 } %455, 3, !dbg !40 + %705 = bitcast i32 %704 to <2 x bfloat>, !dbg !40 + %706 = extractvalue { i32, i32, i32, i32 } %455, 2, !dbg !40 + %707 = bitcast i32 %706 to <2 x bfloat>, !dbg !40 + %708 = extractvalue { i32, i32, i32, i32 } %455, 1, !dbg !40 + %709 = bitcast i32 %708 to <2 x bfloat>, !dbg !40 + %710 = extractvalue { i32, i32, i32, i32 } %455, 0, !dbg !40 + %711 = bitcast i32 %710 to <2 x bfloat>, !dbg !40 + %712 = extractvalue { i32, i32, i32, i32 } %453, 3, !dbg !40 + %713 = bitcast i32 %712 to <2 x bfloat>, !dbg !40 + %714 = extractvalue { i32, i32, i32, i32 } %453, 2, !dbg !40 + %715 = bitcast i32 %714 to <2 x bfloat>, !dbg !40 + %716 = extractvalue { i32, i32, i32, i32 } %453, 1, !dbg !40 + %717 = bitcast i32 %716 to <2 x bfloat>, !dbg !40 + %718 = extractvalue { i32, i32, i32, i32 } %453, 0, !dbg !40 + %719 = bitcast i32 %718 to <2 x bfloat>, !dbg !40 + %720 = fmul float %.0.i62, %703, !dbg !54 + %721 = fmul float %.0.i65, %702, !dbg !54 + %722 = fmul float %.0.i68, %701, !dbg !54 + %723 = fmul float %.0.i71, %700, !dbg !54 + %724 = fmul float %.0.i74, %699, !dbg !54 + %725 = fmul float %.0.i77, %698, !dbg !54 + %726 = fmul float %.0.i80, %697, !dbg !54 + %727 = fmul float %.0.i83, %696, !dbg !54 + %728 = fmul float %.0.i86, %695, !dbg !54 + %729 = fmul float %.0.i89, %694, !dbg !54 + %730 = fmul float %.0.i92, %693, !dbg !54 + %731 = fmul float %.0.i95, %692, !dbg !54 + %732 = fmul float %.0.i98, %691, !dbg !54 + %733 = fmul float %.0.i101, %690, !dbg !54 + %734 = fmul float %.0.i104, %689, !dbg !54 + %735 = fmul float %.0.i107, %688, !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54 + store float %720, ptr addrspace(3) %376, align 4, !dbg !54 + store float %721, ptr addrspace(3) %378, align 4, !dbg !54 + store float %722, ptr addrspace(3) %380, align 4, !dbg !54 + store float %723, ptr addrspace(3) %382, align 4, !dbg !54 + store float %724, ptr addrspace(3) %384, align 4, !dbg !54 + store float %725, ptr addrspace(3) %386, align 4, !dbg !54 + store float %726, ptr addrspace(3) %388, align 4, !dbg !54 + store float %727, ptr addrspace(3) %390, align 4, !dbg !54 + store float %728, ptr addrspace(3) %392, align 4, !dbg !54 + store float %729, ptr addrspace(3) %394, align 4, !dbg !54 + store float %730, ptr addrspace(3) %396, align 4, !dbg !54 + store float %731, ptr addrspace(3) %398, align 4, !dbg !54 + store float %732, ptr addrspace(3) %400, align 4, !dbg !54 + store float %733, ptr addrspace(3) %402, align 4, !dbg !54 + store float %734, ptr addrspace(3) %404, align 4, !dbg !54 + store float %735, ptr addrspace(3) %406, align 4, !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54 + %736 = load float, ptr addrspace(3) %415, align 4, !dbg !54 + %737 = load float, ptr addrspace(3) %417, align 4, !dbg !54 + %738 = load float, ptr addrspace(3) %419, align 4, !dbg !54 + %739 = load float, ptr addrspace(3) %421, align 4, !dbg !54 + %740 = load float, ptr addrspace(3) %424, align 4, !dbg !54 + %741 = load float, ptr addrspace(3) %426, align 4, !dbg !54 + %742 = load float, ptr addrspace(3) %428, align 4, !dbg !54 + %743 = load float, ptr addrspace(3) %430, align 4, !dbg !54 + %744 = load float, ptr addrspace(3) %433, align 4, !dbg !54 + %745 = load float, ptr addrspace(3) %435, align 4, !dbg !54 + %746 = load float, ptr addrspace(3) %437, align 4, !dbg !54 + %747 = load float, ptr addrspace(3) %439, align 4, !dbg !54 + %748 = load float, ptr addrspace(3) %442, align 4, !dbg !54 + %749 = load float, ptr addrspace(3) %444, align 4, !dbg !54 + %750 = load float, ptr addrspace(3) %446, align 4, !dbg !54 + %751 = load float, ptr addrspace(3) %448, align 4, !dbg !54 + %752 = getelementptr bfloat, ptr addrspace(1) %5, i64 %450, !dbg !55 + %753 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !56 + %754 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %752, i64 %753, i1 %464) #5, !dbg !56 + %755 = extractvalue { i32, i32, i32, i32 } %754, 0, !dbg !56 + %756 = bitcast i32 %755 to <2 x bfloat>, !dbg !56 + %757 = extractvalue { i32, i32, i32, i32 } %754, 1, !dbg !56 + %758 = bitcast i32 %757 to <2 x bfloat>, !dbg !56 + %759 = extractvalue { i32, i32, i32, i32 } %754, 2, !dbg !56 + %760 = bitcast i32 %759 to <2 x bfloat>, !dbg !56 + %761 = extractvalue { i32, i32, i32, i32 } %754, 3, !dbg !56 + %762 = bitcast i32 %761 to <2 x bfloat>, !dbg !56 + %763 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !56 + %764 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %752, i64 %763, i1 %467) #5, !dbg !56 + %765 = extractvalue { i32, i32, i32, i32 } %764, 0, !dbg !56 + %766 = bitcast i32 %765 to <2 x bfloat>, !dbg !56 + %767 = extractvalue { i32, i32, i32, i32 } %764, 1, !dbg !56 + %768 = bitcast i32 %767 to <2 x bfloat>, !dbg !56 + %769 = extractvalue { i32, i32, i32, i32 } %764, 2, !dbg !56 + %770 = bitcast i32 %769 to <2 x bfloat>, !dbg !56 + %771 = extractvalue { i32, i32, i32, i32 } %764, 3, !dbg !56 + %772 = bitcast i32 %771 to <2 x bfloat>, !dbg !56 + %773 = shl i32 %23, 7, !dbg !57 + %774 = shl i32 %24, 7, !dbg !57 + %775 = add i32 %773, %32, !dbg !58 + %776 = add i32 %774, %32, !dbg !58 + %777 = sext i32 %775 to i64, !dbg !59 + %778 = getelementptr bfloat, ptr addrspace(1) %6, i64 %777, !dbg !59 + %779 = sext i32 %776 to i64, !dbg !59 + %780 = getelementptr bfloat, ptr addrspace(1) %6, i64 %779, !dbg !59 + %781 = and i1 %34, %686, !dbg !60 + %782 = fpext <2 x bfloat> %719 to <2 x float>, !dbg !61 + %783 = insertelement <2 x float> poison, float %416, i64 0, !dbg !62 + %784 = insertelement <2 x float> %783, float %425, i64 1, !dbg !62 + %785 = fmul <2 x float> %784, %782, !dbg !62 + %786 = fpext <2 x bfloat> %756 to <2 x float>, !dbg !63 + %787 = insertelement <2 x float> poison, float %736, i64 0, !dbg !64 + %788 = insertelement <2 x float> %787, float %740, i64 1, !dbg !64 + %789 = fmul <2 x float> %788, %786, !dbg !64 + %790 = insertelement <2 x i1> poison, i1 %41, i64 0, !dbg !65 + %791 = shufflevector <2 x i1> %790, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !65 + %792 = select <2 x i1> %791, <2 x float> %785, <2 x float> %789, !dbg !65 + %793 = fptrunc <2 x float> %792 to <2 x bfloat>, !dbg !66 + %794 = fpext <2 x bfloat> %717 to <2 x float>, !dbg !61 + %795 = insertelement <2 x float> poison, float %418, i64 0, !dbg !62 + %796 = insertelement <2 x float> %795, float %427, i64 1, !dbg !62 + %797 = fmul <2 x float> %796, %794, !dbg !62 + %798 = fpext <2 x bfloat> %758 to <2 x float>, !dbg !63 + %799 = insertelement <2 x float> poison, float %737, i64 0, !dbg !64 + %800 = insertelement <2 x float> %799, float %741, i64 1, !dbg !64 + %801 = fmul <2 x float> %800, %798, !dbg !64 + %802 = select <2 x i1> %791, <2 x float> %797, <2 x float> %801, !dbg !65 + %803 = fptrunc <2 x float> %802 to <2 x bfloat>, !dbg !66 + %804 = fpext <2 x bfloat> %715 to <2 x float>, !dbg !61 + %805 = insertelement <2 x float> poison, float %420, i64 0, !dbg !62 + %806 = insertelement <2 x float> %805, float %429, i64 1, !dbg !62 + %807 = fmul <2 x float> %806, %804, !dbg !62 + %808 = fpext <2 x bfloat> %760 to <2 x float>, !dbg !63 + %809 = insertelement <2 x float> poison, float %738, i64 0, !dbg !64 + %810 = insertelement <2 x float> %809, float %742, i64 1, !dbg !64 + %811 = fmul <2 x float> %810, %808, !dbg !64 + %812 = select <2 x i1> %791, <2 x float> %807, <2 x float> %811, !dbg !65 + %813 = fptrunc <2 x float> %812 to <2 x bfloat>, !dbg !66 + %814 = fpext <2 x bfloat> %713 to <2 x float>, !dbg !61 + %815 = insertelement <2 x float> poison, float %422, i64 0, !dbg !62 + %816 = insertelement <2 x float> %815, float %431, i64 1, !dbg !62 + %817 = fmul <2 x float> %816, %814, !dbg !62 + %818 = fpext <2 x bfloat> %762 to <2 x float>, !dbg !63 + %819 = insertelement <2 x float> poison, float %739, i64 0, !dbg !64 + %820 = insertelement <2 x float> %819, float %743, i64 1, !dbg !64 + %821 = fmul <2 x float> %820, %818, !dbg !64 + %822 = select <2 x i1> %791, <2 x float> %817, <2 x float> %821, !dbg !65 + %823 = fptrunc <2 x float> %822 to <2 x bfloat>, !dbg !66 + %824 = fpext <2 x bfloat> %711 to <2 x float>, !dbg !61 + %825 = insertelement <2 x float> poison, float %434, i64 0, !dbg !62 + %826 = insertelement <2 x float> %825, float %443, i64 1, !dbg !62 + %827 = fmul <2 x float> %826, %824, !dbg !62 + %828 = fpext <2 x bfloat> %766 to <2 x float>, !dbg !63 + %829 = insertelement <2 x float> poison, float %744, i64 0, !dbg !64 + %830 = insertelement <2 x float> %829, float %748, i64 1, !dbg !64 + %831 = fmul <2 x float> %830, %828, !dbg !64 + %832 = insertelement <2 x i1> poison, i1 %687, i64 0, !dbg !65 + %833 = shufflevector <2 x i1> %832, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !65 + %834 = select <2 x i1> %833, <2 x float> %827, <2 x float> %831, !dbg !65 + %835 = fptrunc <2 x float> %834 to <2 x bfloat>, !dbg !66 + %836 = fpext <2 x bfloat> %709 to <2 x float>, !dbg !61 + %837 = insertelement <2 x float> poison, float %436, i64 0, !dbg !62 + %838 = insertelement <2 x float> %837, float %445, i64 1, !dbg !62 + %839 = fmul <2 x float> %838, %836, !dbg !62 + %840 = fpext <2 x bfloat> %768 to <2 x float>, !dbg !63 + %841 = insertelement <2 x float> poison, float %745, i64 0, !dbg !64 + %842 = insertelement <2 x float> %841, float %749, i64 1, !dbg !64 + %843 = fmul <2 x float> %842, %840, !dbg !64 + %844 = select <2 x i1> %833, <2 x float> %839, <2 x float> %843, !dbg !65 + %845 = fptrunc <2 x float> %844 to <2 x bfloat>, !dbg !66 + %846 = fpext <2 x bfloat> %707 to <2 x float>, !dbg !61 + %847 = insertelement <2 x float> poison, float %438, i64 0, !dbg !62 + %848 = insertelement <2 x float> %847, float %447, i64 1, !dbg !62 + %849 = fmul <2 x float> %848, %846, !dbg !62 + %850 = fpext <2 x bfloat> %770 to <2 x float>, !dbg !63 + %851 = insertelement <2 x float> poison, float %746, i64 0, !dbg !64 + %852 = insertelement <2 x float> %851, float %750, i64 1, !dbg !64 + %853 = fmul <2 x float> %852, %850, !dbg !64 + %854 = select <2 x i1> %833, <2 x float> %849, <2 x float> %853, !dbg !65 + %855 = fptrunc <2 x float> %854 to <2 x bfloat>, !dbg !66 + %856 = fpext <2 x bfloat> %705 to <2 x float>, !dbg !61 + %857 = insertelement <2 x float> poison, float %440, i64 0, !dbg !62 + %858 = insertelement <2 x float> %857, float %449, i64 1, !dbg !62 + %859 = fmul <2 x float> %858, %856, !dbg !62 + %860 = fpext <2 x bfloat> %772 to <2 x float>, !dbg !63 + %861 = insertelement <2 x float> poison, float %747, i64 0, !dbg !64 + %862 = insertelement <2 x float> %861, float %751, i64 1, !dbg !64 + %863 = fmul <2 x float> %862, %860, !dbg !64 + %864 = select <2 x i1> %833, <2 x float> %859, <2 x float> %863, !dbg !65 + %865 = fptrunc <2 x float> %864 to <2 x bfloat>, !dbg !66 + %866 = bitcast <2 x bfloat> %793 to i32, !dbg !66 + %867 = bitcast <2 x bfloat> %803 to i32, !dbg !66 + %868 = bitcast <2 x bfloat> %813 to i32, !dbg !66 + %869 = bitcast <2 x bfloat> %823 to i32, !dbg !66 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %866, i32 %867, i32 %868, i32 %869, ptr addrspace(1) %778, i1 %781) #5, !dbg !66 + %870 = bitcast <2 x bfloat> %835 to i32, !dbg !66 + %871 = bitcast <2 x bfloat> %845 to i32, !dbg !66 + %872 = bitcast <2 x bfloat> %855 to i32, !dbg !66 + %873 = bitcast <2 x bfloat> %865 to i32, !dbg !66 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %870, i32 %871, i32 %872, i32 %873, ptr addrspace(1) %780, i1 %781) #5, !dbg !66 + ret void, !dbg !67 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #3 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_poi_fused__fused_rms_norm_cat_view_2", linkageName: "triton_poi_fused__fused_rms_norm_cat_view_2", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 21, column: 29, scope: !5) +!9 = !DILocation(line: 21, column: 48, scope: !5) +!10 = !DILocation(line: 21, column: 69, scope: !5) +!11 = !DILocation(line: 21, column: 53, scope: !5) +!12 = !DILocation(line: 21, column: 34, scope: !5) +!13 = !DILocation(line: 21, column: 75, scope: !5) +!14 = !DILocation(line: 22, column: 44, scope: !5) +!15 = !DILocation(line: 22, column: 23, scope: !5) +!16 = !DILocation(line: 24, column: 28, scope: !5) +!17 = !DILocation(line: 24, column: 33, scope: !5) +!18 = !DILocation(line: 25, column: 44, scope: !5) +!19 = !DILocation(line: 25, column: 23, scope: !5) +!20 = !DILocation(line: 26, column: 21, scope: !5) +!21 = !DILocation(line: 27, column: 19, scope: !5) +!22 = !DILocation(line: 29, column: 19, scope: !5) +!23 = !DILocation(line: 35, column: 18, scope: !5) +!24 = !DILocation(line: 36, column: 39, scope: !5) +!25 = !DILocation(line: 36, column: 35, scope: !5) +!26 = !DILocation(line: 36, column: 51, scope: !5) +!27 = !DILocation(line: 36, column: 44, scope: !5) +!28 = !DILocation(line: 36, column: 30, scope: !5) +!29 = !DILocation(line: 36, column: 64, scope: !5) +!30 = !DILocation(line: 36, column: 72, scope: !5) +!31 = !DILocation(line: 36, column: 57, scope: !5) +!32 = !DILocation(line: 36, column: 123, scope: !5) +!33 = !DILocation(line: 38, column: 30, scope: !5) +!34 = !DILocation(line: 38, column: 80, scope: !5) +!35 = !DILocation(line: 40, column: 19, scope: !5) +!36 = !DILocation(line: 42, column: 19, scope: !5) +!37 = !DILocation(line: 43, column: 28, scope: !5) +!38 = !DILocation(line: 44, column: 19, scope: !5) +!39 = !DILocation(line: 45, column: 31, scope: !5) +!40 = !DILocation(line: 45, column: 71, scope: !5) +!41 = !DILocation(line: 54, column: 45, scope: !5) +!42 = !DILocation(line: 54, column: 31, scope: !5) +!43 = !DILocation(line: 54, column: 83, scope: !5) +!44 = !DILocation(line: 54, column: 67, scope: !5) +!45 = !DILocation(line: 54, column: 134, scope: !5) +!46 = !DILocation(line: 56, column: 56, scope: !5) +!47 = !DILocation(line: 56, column: 52, scope: !5) +!48 = !DILocation(line: 56, column: 31, scope: !5) +!49 = !DILocation(line: 56, column: 90, scope: !5) +!50 = !DILocation(line: 58, column: 21, scope: !5) +!51 = !DILocation(line: 60, column: 20, scope: !5) +!52 = !DILocation(line: 61, column: 28, scope: !5) +!53 = !DILocation(line: 23, column: 21, scope: !5) +!54 = !DILocation(line: 62, column: 20, scope: !5) +!55 = !DILocation(line: 63, column: 31, scope: !5) +!56 = !DILocation(line: 63, column: 71, scope: !5) +!57 = !DILocation(line: 70, column: 34, scope: !5) +!58 = !DILocation(line: 70, column: 30, scope: !5) +!59 = !DILocation(line: 70, column: 25, scope: !5) +!60 = !DILocation(line: 70, column: 54, scope: !5) +!61 = !DILocation(line: 45, column: 137, scope: !5) +!62 = !DILocation(line: 47, column: 20, scope: !5) +!63 = !DILocation(line: 63, column: 138, scope: !5) +!64 = !DILocation(line: 65, column: 20, scope: !5) +!65 = !DILocation(line: 0, scope: !5) +!66 = !DILocation(line: 70, column: 46, scope: !5) +!67 = !DILocation(line: 70, column: 4, scope: !5) diff --git a/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.ptx b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..2f54e7f201a63fcb6f52e7a2e5d62b81556c6ed4 --- /dev/null +++ b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.ptx @@ -0,0 +1,1160 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused__fused_rms_norm_cat_view_2 // -- Begin function triton_poi_fused__fused_rms_norm_cat_view_2 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_poi_fused__fused_rms_norm_cat_view_2 +.visible .entry triton_poi_fused__fused_rms_norm_cat_view_2( + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_4, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_5, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_6, + .param .u32 triton_poi_fused__fused_rms_norm_cat_view_2_param_7, + .param .u32 triton_poi_fused__fused_rms_norm_cat_view_2_param_8, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_9, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_10 +) +.reqntid 256 +{ + .reg .pred %p<17>; + .reg .b16 %rs<65>; + .reg .b32 %r<520>; + .reg .b64 %rd<35>; + .loc 1 18 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:18:0 + +// %bb.0: // %__nv_rsqrtf.exit + ld.param.b64 %rd27, [triton_poi_fused__fused_rms_norm_cat_view_2_param_0]; + ld.param.b64 %rd28, [triton_poi_fused__fused_rms_norm_cat_view_2_param_1]; +$L__tmp0: + .loc 1 21 29 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:29 + mov.u32 %r74, %ctaid.y; + ld.param.b64 %rd29, [triton_poi_fused__fused_rms_norm_cat_view_2_param_2]; + .loc 1 21 48 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:48 + mov.u32 %r75, %ctaid.z; + ld.param.b64 %rd30, [triton_poi_fused__fused_rms_norm_cat_view_2_param_3]; + .loc 1 21 69 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:69 + mov.u32 %r76, %nctaid.y; + ld.param.b64 %rd31, [triton_poi_fused__fused_rms_norm_cat_view_2_param_4]; + .loc 1 21 34 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:34 + mad.lo.s32 %r77, %r75, %r76, %r74; + ld.param.b64 %rd32, [triton_poi_fused__fused_rms_norm_cat_view_2_param_5]; + .loc 1 21 75 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:75 + shl.b32 %r78, %r77, 6; + ld.param.b64 %rd33, [triton_poi_fused__fused_rms_norm_cat_view_2_param_6]; + .loc 1 22 44 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:44 + mov.u32 %r79, %tid.x; + bfe.u32 %r80, %r79, 3, 5; + shl.b32 %r81, %r79, 2; + and.b32 %r82, %r81, 60; + .loc 1 22 23 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:23 + or.b32 %r83, %r78, %r80; + or.b32 %r84, %r83, 32; + or.b32 %r85, %r78, %r82; + .loc 1 24 28 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:24:28 + mov.u32 %r86, %ctaid.x; + .loc 1 24 33 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:24:33 + shl.b32 %r87, %r86, 6; + .loc 1 25 44 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:25:44 + and.b32 %r88, %r79, 7; + shl.b32 %r89, %r88, 3; + shr.u32 %r90, %r79, 4; + bfe.u32 %r91, %r79, 4, 4; + .loc 1 25 23 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:25:23 + or.b32 %r92, %r89, %r87; + or.b32 %r93, %r91, %r87; + .loc 1 26 21 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:26:21 + setp.lt.s32 %p8, %r92, 128; + setp.lt.s32 %p9, %r93, 128; + .loc 1 27 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:27:19 + bfe.s32 %r94, %r77, 25, 1; + shr.u32 %r95, %r94, 27; + add.s32 %r96, %r83, %r95; + shr.u32 %r97, %r96, 5; + add.s32 %r98, %r84, %r95; + shr.u32 %r99, %r98, 5; + .loc 1 29 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:29:19 + and.b32 %r100, %r96, 33554400; + sub.s32 %r101, %r83, %r100; + .loc 1 35 18 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:35:18 + setp.lt.s32 %p10, %r83, 8192; + setp.lt.s32 %p11, %r85, 8192; + .loc 1 36 39 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:39 + shl.b32 %r102, %r101, 7; + .loc 1 36 35 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:35 + add.s32 %r103, %r102, %r92; + .loc 1 36 44 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:44 + mad.lo.s32 %r104, %r97, 12288, %r103; + mad.lo.s32 %r105, %r99, 12288, %r103; + .loc 1 36 30 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:30 + mad.wide.s32 %rd1, %r104, 2, %rd27; + mad.wide.s32 %rd3, %r105, 2, %rd27; + .loc 1 36 64 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:64 + and.pred %p1, %p8, %p10; + and.pred %p3, %p9, %p11; + .loc 1 36 72 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:72 + setp.lt.s32 %p12, %r83, 8160; + and.pred %p2, %p8, %p12; + .loc 1 36 57 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:57 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r5, 0; + // begin inline asm + mov.u32 %r1, %r5; + mov.u32 %r2, %r5; + mov.u32 %r3, %r5; + mov.u32 %r4, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2; + // end inline asm + prmt.b32 %r106, %r1, %r2, 0x7632U; + prmt.b32 %r107, %r3, %r4, 0x7632U; + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r6, %r5; + mov.u32 %r7, %r5; + mov.u32 %r8, %r5; + mov.u32 %r9, %r5; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd3 + 0 ], %rd4; + // end inline asm + prmt.b32 %r108, %r6, %r7, 0x7632U; + prmt.b32 %r109, %r8, %r9, 0x7632U; + .loc 1 36 123 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:123 + and.b32 %r110, %r79, 24; + shl.b32 %r111, %r110, 5; + shl.b32 %r112, %r88, 4; + shr.u32 %r113, %r110, 1; + and.b32 %r114, %r79, 96; + shr.u32 %r115, %r114, 3; + and.b32 %r116, %r79, 128; + bfe.s32 %r117, %r79, 7, 1; + and.b32 %r118, %r117, 1040; + xor.b32 %r119, %r113, %r115; + or.b32 %r120, %r119, %r111; + or.b32 %r121, %r120, %r112; + xor.b32 %r122, %r121, %r118; + mov.b32 %r123, global_smem; + add.s32 %r124, %r123, %r122; + prmt.b32 %r125, %r1, %r2, 0x5410U; + st.shared.b32 [%r124], %r125; + prmt.b32 %r126, %r3, %r4, 0x5410U; + st.shared.b32 [%r124+128], %r126; + xor.b32 %r127, %r122, 64; + add.s32 %r128, %r123, %r127; + st.shared.b32 [%r128+4096], %r106; + st.shared.b32 [%r128+4224], %r107; + xor.b32 %r129, %r122, 32; + add.s32 %r130, %r123, %r129; + prmt.b32 %r131, %r6, %r7, 0x5410U; + st.shared.b32 [%r130+2048], %r131; + prmt.b32 %r132, %r8, %r9, 0x5410U; + st.shared.b32 [%r130+2176], %r132; + xor.b32 %r133, %r122, 96; + add.s32 %r134, %r123, %r133; + st.shared.b32 [%r134+6144], %r108; + st.shared.b32 [%r134+6272], %r109; + bar.sync 0; + and.b32 %r135, %r79, 28; + shl.b32 %r136, %r135, 8; + and.b32 %r137, %r81, 124; + and.b32 %r138, %r90, 2; + shl.b32 %r139, %r79, 1; + and.b32 %r140, %r139, 128; + shr.u32 %r141, %r116, 3; + or.b32 %r142, %r138, %r140; + or.b32 %r143, %r136, %r137; + xor.b32 %r144, %r143, %r141; + or.b32 %r145, %r142, %r144; + add.s32 %r146, %r123, %r145; + ld.shared.b16 %rs1, [%r146]; + xor.b32 %r147, %r145, 4; + add.s32 %r148, %r123, %r147; + ld.shared.b16 %rs2, [%r148+256]; + xor.b32 %r149, %r145, 8; + add.s32 %r150, %r123, %r149; + ld.shared.b16 %rs3, [%r150+512]; + xor.b32 %r151, %r145, 12; + add.s32 %r152, %r123, %r151; + ld.shared.b16 %rs4, [%r152+768]; + xor.b32 %r153, %r145, 32; + add.s32 %r154, %r123, %r153; + ld.shared.b16 %rs5, [%r154]; + xor.b32 %r155, %r145, 36; + add.s32 %r156, %r123, %r155; + ld.shared.b16 %rs6, [%r156+256]; + xor.b32 %r157, %r145, 40; + add.s32 %r158, %r123, %r157; + ld.shared.b16 %rs7, [%r158+512]; + xor.b32 %r159, %r145, 44; + add.s32 %r160, %r123, %r159; + ld.shared.b16 %rs8, [%r160+768]; + xor.b32 %r161, %r145, 64; + add.s32 %r162, %r123, %r161; + ld.shared.b16 %rs9, [%r162]; + xor.b32 %r163, %r145, 68; + add.s32 %r164, %r123, %r163; + ld.shared.b16 %rs10, [%r164+256]; + xor.b32 %r165, %r145, 72; + add.s32 %r166, %r123, %r165; + ld.shared.b16 %rs11, [%r166+512]; + xor.b32 %r167, %r145, 76; + add.s32 %r168, %r123, %r167; + ld.shared.b16 %rs12, [%r168+768]; + xor.b32 %r169, %r145, 96; + add.s32 %r170, %r123, %r169; + ld.shared.b16 %rs13, [%r170]; + xor.b32 %r171, %r145, 100; + add.s32 %r172, %r123, %r171; + ld.shared.b16 %rs14, [%r172+256]; + xor.b32 %r173, %r145, 104; + add.s32 %r174, %r123, %r173; + ld.shared.b16 %rs15, [%r174+512]; + xor.b32 %r175, %r145, 108; + add.s32 %r176, %r123, %r175; + ld.shared.b16 %rs16, [%r176+768]; + cvt.f32.bf16 %r177, %rs1; + cvt.f32.bf16 %r178, %rs2; + cvt.f32.bf16 %r179, %rs3; + cvt.f32.bf16 %r180, %rs4; + cvt.f32.bf16 %r181, %rs5; + cvt.f32.bf16 %r182, %rs6; + cvt.f32.bf16 %r183, %rs7; + cvt.f32.bf16 %r184, %rs8; + cvt.f32.bf16 %r185, %rs9; + cvt.f32.bf16 %r186, %rs10; + cvt.f32.bf16 %r187, %rs11; + cvt.f32.bf16 %r188, %rs12; + cvt.f32.bf16 %r189, %rs13; + cvt.f32.bf16 %r190, %rs14; + cvt.f32.bf16 %r191, %rs15; + cvt.f32.bf16 %r192, %rs16; + .loc 1 38 30 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:38:30 + mad.wide.s32 %rd5, %r85, 4, %rd28; + .loc 1 38 80 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:38:80 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r10, %r5; + mov.u32 %r11, %r5; + mov.u32 %r12, %r5; + mov.u32 %r13, %r5; + @%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd5 + 0 ], %rd6; + // end inline asm + // begin inline asm + mov.u64 %rd7, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd7, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r14, %r5; + mov.u32 %r15, %r5; + mov.u32 %r16, %r5; + mov.u32 %r17, %r5; + @%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd5 + 0 ], %rd7; + // end inline asm + // begin inline asm + mov.u64 %rd8, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd8, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r18, %r5; + mov.u32 %r19, %r5; + mov.u32 %r20, %r5; + mov.u32 %r21, %r5; + @%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd5 + 0 ], %rd8; + // end inline asm + // begin inline asm + mov.u64 %rd9, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd9, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r22, %r5; + mov.u32 %r23, %r5; + mov.u32 %r24, %r5; + mov.u32 %r25, %r5; + @%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r22, %r23, %r24, %r25 }, [ %rd5 + 0 ], %rd9; + // end inline asm + mov.b32 %r193, 0f43000000; + .loc 1 40 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:40:19 + div.full.f32 %r194, %r10, %r193; + div.full.f32 %r195, %r11, %r193; + div.full.f32 %r196, %r12, %r193; + div.full.f32 %r197, %r13, %r193; + div.full.f32 %r198, %r14, %r193; + div.full.f32 %r199, %r15, %r193; + div.full.f32 %r200, %r16, %r193; + div.full.f32 %r201, %r17, %r193; + div.full.f32 %r202, %r18, %r193; + div.full.f32 %r203, %r19, %r193; + div.full.f32 %r204, %r20, %r193; + div.full.f32 %r205, %r21, %r193; + div.full.f32 %r206, %r22, %r193; + div.full.f32 %r207, %r23, %r193; + div.full.f32 %r208, %r24, %r193; + div.full.f32 %r209, %r25, %r193; + .loc 1 42 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:42:19 + add.f32 %r210, %r194, 0f358637BD; + add.f32 %r211, %r195, 0f358637BD; + add.f32 %r212, %r196, 0f358637BD; + add.f32 %r213, %r197, 0f358637BD; + add.f32 %r214, %r198, 0f358637BD; + add.f32 %r215, %r199, 0f358637BD; + add.f32 %r216, %r200, 0f358637BD; + add.f32 %r217, %r201, 0f358637BD; + add.f32 %r218, %r202, 0f358637BD; + add.f32 %r219, %r203, 0f358637BD; + add.f32 %r220, %r204, 0f358637BD; + add.f32 %r221, %r205, 0f358637BD; + add.f32 %r222, %r206, 0f358637BD; + add.f32 %r223, %r207, 0f358637BD; + add.f32 %r224, %r208, 0f358637BD; + add.f32 %r225, %r209, 0f358637BD; + .loc 1 43 28 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:43:28 + rsqrt.approx.ftz.f32 %r226, %r210; + rsqrt.approx.ftz.f32 %r227, %r211; + rsqrt.approx.ftz.f32 %r228, %r212; + rsqrt.approx.ftz.f32 %r229, %r213; + rsqrt.approx.ftz.f32 %r230, %r214; + rsqrt.approx.ftz.f32 %r231, %r215; + rsqrt.approx.ftz.f32 %r232, %r216; + rsqrt.approx.ftz.f32 %r233, %r217; + rsqrt.approx.ftz.f32 %r234, %r218; + rsqrt.approx.ftz.f32 %r235, %r219; + rsqrt.approx.ftz.f32 %r236, %r220; + rsqrt.approx.ftz.f32 %r237, %r221; + rsqrt.approx.ftz.f32 %r238, %r222; + rsqrt.approx.ftz.f32 %r239, %r223; + rsqrt.approx.ftz.f32 %r240, %r224; + rsqrt.approx.ftz.f32 %r241, %r225; + .loc 1 44 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:44:19 + mul.f32 %r242, %r226, %r177; + mul.f32 %r243, %r227, %r178; + mul.f32 %r244, %r228, %r179; + mul.f32 %r245, %r229, %r180; + mul.f32 %r246, %r230, %r181; + mul.f32 %r247, %r231, %r182; + mul.f32 %r248, %r232, %r183; + mul.f32 %r249, %r233, %r184; + mul.f32 %r250, %r234, %r185; + mul.f32 %r251, %r235, %r186; + mul.f32 %r252, %r236, %r187; + mul.f32 %r253, %r237, %r188; + mul.f32 %r254, %r238, %r189; + mul.f32 %r255, %r239, %r190; + mul.f32 %r256, %r240, %r191; + mul.f32 %r257, %r241, %r192; + bar.sync 0; + shl.b32 %r258, %r135, 9; + shl.b32 %r259, %r114, 2; + shr.u32 %r260, %r79, 1; + and.b32 %r261, %r260, 76; + or.b32 %r262, %r258, %r112; + or.b32 %r263, %r259, %r261; + xor.b32 %r264, %r262, %r263; + add.s32 %r265, %r123, %r264; + st.shared.b32 [%r265], %r242; + xor.b32 %r266, %r264, 16; + add.s32 %r267, %r123, %r266; + st.shared.b32 [%r267+512], %r243; + xor.b32 %r268, %r264, 32; + add.s32 %r269, %r123, %r268; + st.shared.b32 [%r269+1024], %r244; + xor.b32 %r270, %r264, 48; + add.s32 %r271, %r123, %r270; + st.shared.b32 [%r271+1536], %r245; + xor.b32 %r272, %r264, 4; + add.s32 %r273, %r123, %r272; + st.shared.b32 [%r273], %r246; + xor.b32 %r274, %r264, 20; + add.s32 %r275, %r123, %r274; + st.shared.b32 [%r275+512], %r247; + xor.b32 %r276, %r264, 36; + add.s32 %r277, %r123, %r276; + st.shared.b32 [%r277+1024], %r248; + xor.b32 %r278, %r264, 52; + add.s32 %r279, %r123, %r278; + st.shared.b32 [%r279+1536], %r249; + xor.b32 %r280, %r264, 8; + add.s32 %r281, %r123, %r280; + st.shared.b32 [%r281], %r250; + xor.b32 %r282, %r264, 24; + add.s32 %r283, %r123, %r282; + st.shared.b32 [%r283+512], %r251; + xor.b32 %r284, %r264, 40; + add.s32 %r285, %r123, %r284; + st.shared.b32 [%r285+1024], %r252; + xor.b32 %r286, %r264, 56; + add.s32 %r287, %r123, %r286; + st.shared.b32 [%r287+1536], %r253; + xor.b32 %r288, %r264, 12; + add.s32 %r289, %r123, %r288; + st.shared.b32 [%r289], %r254; + xor.b32 %r290, %r264, 28; + add.s32 %r291, %r123, %r290; + st.shared.b32 [%r291+512], %r255; + xor.b32 %r292, %r264, 44; + add.s32 %r293, %r123, %r292; + st.shared.b32 [%r293+1024], %r256; + xor.b32 %r294, %r264, 60; + add.s32 %r295, %r123, %r294; + st.shared.b32 [%r295+1536], %r257; + bar.sync 0; + shl.b32 %r296, %r79, 6; + and.b32 %r297, %r296, 1600; + and.b32 %r298, %r139, 60; + shr.u32 %r299, %r114, 1; + and.b32 %r300, %r117, 2112; + or.b32 %r301, %r297, %r298; + or.b32 %r302, %r300, %r299; + xor.b32 %r303, %r302, %r301; + add.s32 %r304, %r123, %r303; + ld.shared.b32 %r305, [%r304]; + ld.shared.b32 %r306, [%r304+128]; + ld.shared.b32 %r307, [%r304+256]; + ld.shared.b32 %r308, [%r304+384]; + xor.b32 %r309, %r303, 8; + add.s32 %r310, %r123, %r309; + ld.shared.b32 %r311, [%r310+8192]; + ld.shared.b32 %r312, [%r310+8320]; + ld.shared.b32 %r313, [%r310+8448]; + ld.shared.b32 %r314, [%r310+8576]; + xor.b32 %r315, %r303, 4; + add.s32 %r316, %r123, %r315; + ld.shared.b32 %r317, [%r316+4096]; + ld.shared.b32 %r318, [%r316+4224]; + ld.shared.b32 %r319, [%r316+4352]; + ld.shared.b32 %r320, [%r316+4480]; + xor.b32 %r321, %r303, 12; + add.s32 %r322, %r123, %r321; + ld.shared.b32 %r323, [%r322+12288]; + ld.shared.b32 %r324, [%r322+12416]; + ld.shared.b32 %r325, [%r322+12544]; + ld.shared.b32 %r326, [%r322+12672]; + .loc 1 45 31 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:31 + mul.wide.s32 %rd34, %r92, 2; + add.s64 %rd10, %rd29, %rd34; + .loc 1 45 71 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:71 + // begin inline asm + mov.u64 %rd11, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd11, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r26, %r5; + mov.u32 %r27, %r5; + mov.u32 %r28, %r5; + mov.u32 %r29, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd10 + 0 ], %rd11; + // end inline asm + // begin inline asm + mov.u64 %rd12, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd12, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r30, %r5; + mov.u32 %r31, %r5; + mov.u32 %r32, %r5; + mov.u32 %r33, %r5; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r30, %r31, %r32, %r33 }, [ %rd10 + 0 ], %rd12; + // end inline asm + .loc 1 54 45 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:45 + add.s32 %r327, %r104, -3145728; + add.s32 %r328, %r105, -3145728; + .loc 1 54 31 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:31 + mad.wide.s32 %rd13, %r327, 2, %rd30; + mad.wide.s32 %rd15, %r328, 2, %rd30; + .loc 1 54 83 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:83 + add.s32 %r329, %r78, -8192; + setp.lt.u32 %p13, %r329, 65536; + and.pred %p4, %p8, %p13; + add.s32 %r330, %r78, -8160; + setp.lt.u32 %p14, %r330, 65568; + and.pred %p5, %p8, %p14; + and.pred %p6, %p9, %p13; + .loc 1 54 67 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:67 + // begin inline asm + mov.u64 %rd14, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd14, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r34, %r5; + mov.u32 %r35, %r5; + mov.u32 %r36, %r5; + mov.u32 %r37, %r5; + @%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd13 + 0 ], %rd14; + // end inline asm + prmt.b32 %r331, %r34, %r35, 0x7632U; + prmt.b32 %r332, %r36, %r37, 0x7632U; + // begin inline asm + mov.u64 %rd16, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd16, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r38, %r5; + mov.u32 %r39, %r5; + mov.u32 %r40, %r5; + mov.u32 %r41, %r5; + @%p5 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r38, %r39, %r40, %r41 }, [ %rd15 + 0 ], %rd16; + // end inline asm + prmt.b32 %r333, %r38, %r39, 0x7632U; + prmt.b32 %r334, %r40, %r41, 0x7632U; + .loc 1 54 134 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134 + bar.sync 0; + prmt.b32 %r335, %r34, %r35, 0x5410U; + st.shared.b32 [%r124], %r335; + prmt.b32 %r336, %r36, %r37, 0x5410U; + st.shared.b32 [%r124+128], %r336; + st.shared.b32 [%r128+4096], %r331; + st.shared.b32 [%r128+4224], %r332; + prmt.b32 %r337, %r38, %r39, 0x5410U; + st.shared.b32 [%r130+2048], %r337; + prmt.b32 %r338, %r40, %r41, 0x5410U; + st.shared.b32 [%r130+2176], %r338; + st.shared.b32 [%r134+6144], %r333; + st.shared.b32 [%r134+6272], %r334; + bar.sync 0; + ld.shared.b16 %rs17, [%r146]; + ld.shared.b16 %rs18, [%r148+256]; + ld.shared.b16 %rs19, [%r150+512]; + ld.shared.b16 %rs20, [%r152+768]; + ld.shared.b16 %rs21, [%r154]; + ld.shared.b16 %rs22, [%r156+256]; + ld.shared.b16 %rs23, [%r158+512]; + ld.shared.b16 %rs24, [%r160+768]; + ld.shared.b16 %rs25, [%r162]; + ld.shared.b16 %rs26, [%r164+256]; + ld.shared.b16 %rs27, [%r166+512]; + ld.shared.b16 %rs28, [%r168+768]; + ld.shared.b16 %rs29, [%r170]; + ld.shared.b16 %rs30, [%r172+256]; + ld.shared.b16 %rs31, [%r174+512]; + ld.shared.b16 %rs32, [%r176+768]; + .loc 1 56 52 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:52 + add.s32 %r339, %r85, -8192; + .loc 1 56 31 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:31 + mad.wide.s32 %rd17, %r339, 4, %rd31; + .loc 1 56 90 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:90 + // begin inline asm + mov.u64 %rd18, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd18, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r42, %r5; + mov.u32 %r43, %r5; + mov.u32 %r44, %r5; + mov.u32 %r45, %r5; + @%p6 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd17 + 0 ], %rd18; + // end inline asm + // begin inline asm + mov.u64 %rd19, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd19, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r46, %r5; + mov.u32 %r47, %r5; + mov.u32 %r48, %r5; + mov.u32 %r49, %r5; + @%p6 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r46, %r47, %r48, %r49 }, [ %rd17 + 0 ], %rd19; + // end inline asm + // begin inline asm + mov.u64 %rd20, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd20, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r50, %r5; + mov.u32 %r51, %r5; + mov.u32 %r52, %r5; + mov.u32 %r53, %r5; + @%p6 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r50, %r51, %r52, %r53 }, [ %rd17 + 0 ], %rd20; + // end inline asm + // begin inline asm + mov.u64 %rd21, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd21, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r54, %r5; + mov.u32 %r55, %r5; + mov.u32 %r56, %r5; + mov.u32 %r57, %r5; + @%p6 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r54, %r55, %r56, %r57 }, [ %rd17 + 0 ], %rd21; + // end inline asm + .loc 1 58 21 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:58:21 + div.full.f32 %r340, %r42, %r193; + div.full.f32 %r341, %r43, %r193; + div.full.f32 %r342, %r44, %r193; + div.full.f32 %r343, %r45, %r193; + div.full.f32 %r344, %r46, %r193; + div.full.f32 %r345, %r47, %r193; + div.full.f32 %r346, %r48, %r193; + div.full.f32 %r347, %r49, %r193; + div.full.f32 %r348, %r50, %r193; + div.full.f32 %r349, %r51, %r193; + div.full.f32 %r350, %r52, %r193; + div.full.f32 %r351, %r53, %r193; + div.full.f32 %r352, %r54, %r193; + div.full.f32 %r353, %r55, %r193; + div.full.f32 %r354, %r56, %r193; + div.full.f32 %r355, %r57, %r193; + .loc 1 60 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:60:20 + add.f32 %r356, %r340, 0f358637BD; + add.f32 %r357, %r341, 0f358637BD; + add.f32 %r358, %r342, 0f358637BD; + add.f32 %r359, %r343, 0f358637BD; + add.f32 %r360, %r344, 0f358637BD; + add.f32 %r361, %r345, 0f358637BD; + add.f32 %r362, %r346, 0f358637BD; + add.f32 %r363, %r347, 0f358637BD; + add.f32 %r364, %r348, 0f358637BD; + add.f32 %r365, %r349, 0f358637BD; + add.f32 %r366, %r350, 0f358637BD; + add.f32 %r367, %r351, 0f358637BD; + add.f32 %r368, %r352, 0f358637BD; + add.f32 %r369, %r353, 0f358637BD; + add.f32 %r370, %r354, 0f358637BD; + add.f32 %r371, %r355, 0f358637BD; + .loc 1 61 28 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:61:28 + rsqrt.approx.ftz.f32 %r372, %r356; + rsqrt.approx.ftz.f32 %r373, %r357; + rsqrt.approx.ftz.f32 %r374, %r358; + rsqrt.approx.ftz.f32 %r375, %r359; + rsqrt.approx.ftz.f32 %r376, %r360; + rsqrt.approx.ftz.f32 %r377, %r361; + rsqrt.approx.ftz.f32 %r378, %r362; + rsqrt.approx.ftz.f32 %r379, %r363; + rsqrt.approx.ftz.f32 %r380, %r364; + rsqrt.approx.ftz.f32 %r381, %r365; + rsqrt.approx.ftz.f32 %r382, %r366; + rsqrt.approx.ftz.f32 %r383, %r367; + rsqrt.approx.ftz.f32 %r384, %r368; + rsqrt.approx.ftz.f32 %r385, %r369; + rsqrt.approx.ftz.f32 %r386, %r370; + rsqrt.approx.ftz.f32 %r387, %r371; + .loc 1 23 21 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:23:21 + setp.lt.s32 %p15, %r83, 73728; + .loc 1 35 18 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:35:18 + setp.lt.s32 %p16, %r84, 8192; + .loc 1 54 134 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134 + cvt.f32.bf16 %r388, %rs32; + cvt.f32.bf16 %r389, %rs31; + cvt.f32.bf16 %r390, %rs30; + cvt.f32.bf16 %r391, %rs29; + cvt.f32.bf16 %r392, %rs28; + cvt.f32.bf16 %r393, %rs27; + cvt.f32.bf16 %r394, %rs26; + cvt.f32.bf16 %r395, %rs25; + cvt.f32.bf16 %r396, %rs24; + cvt.f32.bf16 %r397, %rs23; + cvt.f32.bf16 %r398, %rs22; + cvt.f32.bf16 %r399, %rs21; + cvt.f32.bf16 %r400, %rs20; + cvt.f32.bf16 %r401, %rs19; + cvt.f32.bf16 %r402, %rs18; + cvt.f32.bf16 %r403, %rs17; + .loc 1 62 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:62:20 + mul.f32 %r404, %r372, %r403; + mul.f32 %r405, %r373, %r402; + mul.f32 %r406, %r374, %r401; + mul.f32 %r407, %r375, %r400; + mul.f32 %r408, %r376, %r399; + mul.f32 %r409, %r377, %r398; + mul.f32 %r410, %r378, %r397; + mul.f32 %r411, %r379, %r396; + mul.f32 %r412, %r380, %r395; + mul.f32 %r413, %r381, %r394; + mul.f32 %r414, %r382, %r393; + mul.f32 %r415, %r383, %r392; + mul.f32 %r416, %r384, %r391; + mul.f32 %r417, %r385, %r390; + mul.f32 %r418, %r386, %r389; + mul.f32 %r419, %r387, %r388; + bar.sync 0; + st.shared.b32 [%r265], %r404; + st.shared.b32 [%r267+512], %r405; + st.shared.b32 [%r269+1024], %r406; + st.shared.b32 [%r271+1536], %r407; + st.shared.b32 [%r273], %r408; + st.shared.b32 [%r275+512], %r409; + st.shared.b32 [%r277+1024], %r410; + st.shared.b32 [%r279+1536], %r411; + st.shared.b32 [%r281], %r412; + st.shared.b32 [%r283+512], %r413; + st.shared.b32 [%r285+1024], %r414; + st.shared.b32 [%r287+1536], %r415; + st.shared.b32 [%r289], %r416; + st.shared.b32 [%r291+512], %r417; + st.shared.b32 [%r293+1024], %r418; + st.shared.b32 [%r295+1536], %r419; + bar.sync 0; + ld.shared.b32 %r420, [%r304]; + ld.shared.b32 %r421, [%r304+128]; + ld.shared.b32 %r422, [%r304+256]; + ld.shared.b32 %r423, [%r304+384]; + ld.shared.b32 %r424, [%r310+8192]; + ld.shared.b32 %r425, [%r310+8320]; + ld.shared.b32 %r426, [%r310+8448]; + ld.shared.b32 %r427, [%r310+8576]; + ld.shared.b32 %r428, [%r316+4096]; + ld.shared.b32 %r429, [%r316+4224]; + ld.shared.b32 %r430, [%r316+4352]; + ld.shared.b32 %r431, [%r316+4480]; + ld.shared.b32 %r432, [%r322+12288]; + ld.shared.b32 %r433, [%r322+12416]; + ld.shared.b32 %r434, [%r322+12544]; + ld.shared.b32 %r435, [%r322+12672]; + .loc 1 63 31 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:31 + add.s64 %rd22, %rd32, %rd34; + .loc 1 63 71 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:71 + // begin inline asm + mov.u64 %rd23, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd23, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r58, %r5; + mov.u32 %r59, %r5; + mov.u32 %r60, %r5; + mov.u32 %r61, %r5; + @%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r58, %r59, %r60, %r61 }, [ %rd22 + 0 ], %rd23; + // end inline asm + // begin inline asm + mov.u64 %rd24, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd24, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r62, %r5; + mov.u32 %r63, %r5; + mov.u32 %r64, %r5; + mov.u32 %r65, %r5; + @%p5 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r62, %r63, %r64, %r65 }, [ %rd22 + 0 ], %rd24; + // end inline asm + .loc 1 70 34 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:34 + shl.b32 %r436, %r83, 7; + shl.b32 %r437, %r84, 7; + .loc 1 70 30 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:30 + add.s32 %r438, %r436, %r92; + add.s32 %r439, %r437, %r92; + .loc 1 70 25 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:25 + mad.wide.s32 %rd25, %r438, 2, %rd33; + mad.wide.s32 %rd26, %r439, 2, %rd33; + .loc 1 70 54 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:54 + and.pred %p7, %p8, %p15; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs33, %rs34}, %r26; + cvt.f32.bf16 %r440, %rs33; + cvt.f32.bf16 %r441, %rs34; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r442, %r311, %r441; + mul.f32 %r443, %r305, %r440; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs35, %rs36}, %r58; + cvt.f32.bf16 %r444, %rs35; + cvt.f32.bf16 %r445, %rs36; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r446, %r424, %r445; + mul.f32 %r447, %r420, %r444; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r448, %r443, %r447, %p10; + selp.f32 %r449, %r442, %r446, %p10; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r66, %r449, %r448; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs37, %rs38}, %r27; + cvt.f32.bf16 %r450, %rs37; + cvt.f32.bf16 %r451, %rs38; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r452, %r312, %r451; + mul.f32 %r453, %r306, %r450; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs39, %rs40}, %r59; + cvt.f32.bf16 %r454, %rs39; + cvt.f32.bf16 %r455, %rs40; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r456, %r425, %r455; + mul.f32 %r457, %r421, %r454; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r458, %r453, %r457, %p10; + selp.f32 %r459, %r452, %r456, %p10; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r67, %r459, %r458; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs41, %rs42}, %r28; + cvt.f32.bf16 %r460, %rs41; + cvt.f32.bf16 %r461, %rs42; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r462, %r313, %r461; + mul.f32 %r463, %r307, %r460; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs43, %rs44}, %r60; + cvt.f32.bf16 %r464, %rs43; + cvt.f32.bf16 %r465, %rs44; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r466, %r426, %r465; + mul.f32 %r467, %r422, %r464; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r468, %r463, %r467, %p10; + selp.f32 %r469, %r462, %r466, %p10; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r68, %r469, %r468; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs45, %rs46}, %r29; + cvt.f32.bf16 %r470, %rs45; + cvt.f32.bf16 %r471, %rs46; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r472, %r314, %r471; + mul.f32 %r473, %r308, %r470; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs47, %rs48}, %r61; + cvt.f32.bf16 %r474, %rs47; + cvt.f32.bf16 %r475, %rs48; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r476, %r427, %r475; + mul.f32 %r477, %r423, %r474; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r478, %r473, %r477, %p10; + selp.f32 %r479, %r472, %r476, %p10; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r69, %r479, %r478; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs49, %rs50}, %r30; + cvt.f32.bf16 %r480, %rs49; + cvt.f32.bf16 %r481, %rs50; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r482, %r323, %r481; + mul.f32 %r483, %r317, %r480; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs51, %rs52}, %r62; + cvt.f32.bf16 %r484, %rs51; + cvt.f32.bf16 %r485, %rs52; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r486, %r432, %r485; + mul.f32 %r487, %r428, %r484; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r488, %r483, %r487, %p16; + selp.f32 %r489, %r482, %r486, %p16; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r70, %r489, %r488; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs53, %rs54}, %r31; + cvt.f32.bf16 %r490, %rs53; + cvt.f32.bf16 %r491, %rs54; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r492, %r324, %r491; + mul.f32 %r493, %r318, %r490; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs55, %rs56}, %r63; + cvt.f32.bf16 %r494, %rs55; + cvt.f32.bf16 %r495, %rs56; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r496, %r433, %r495; + mul.f32 %r497, %r429, %r494; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r498, %r493, %r497, %p16; + selp.f32 %r499, %r492, %r496, %p16; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r71, %r499, %r498; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs57, %rs58}, %r32; + cvt.f32.bf16 %r500, %rs57; + cvt.f32.bf16 %r501, %rs58; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r502, %r325, %r501; + mul.f32 %r503, %r319, %r500; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs59, %rs60}, %r64; + cvt.f32.bf16 %r504, %rs59; + cvt.f32.bf16 %r505, %rs60; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r506, %r434, %r505; + mul.f32 %r507, %r430, %r504; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r508, %r503, %r507, %p16; + selp.f32 %r509, %r502, %r506, %p16; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r72, %r509, %r508; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs61, %rs62}, %r33; + cvt.f32.bf16 %r510, %rs61; + cvt.f32.bf16 %r511, %rs62; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r512, %r326, %r511; + mul.f32 %r513, %r320, %r510; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs63, %rs64}, %r65; + cvt.f32.bf16 %r514, %rs63; + cvt.f32.bf16 %r515, %rs64; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r516, %r435, %r515; + mul.f32 %r517, %r431, %r514; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r518, %r513, %r517, %p16; + selp.f32 %r519, %r512, %r516, %p16; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r73, %r519, %r518; + // begin inline asm + @%p7 st.global.v4.b32 [ %rd25 + 0 ], { %r66, %r67, %r68, %r69 }; + // end inline asm + // begin inline asm + @%p7 st.global.v4.b32 [ %rd26 + 0 ], { %r70, %r71, %r72, %r73 }; + // end inline asm + .loc 1 70 4 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 50 +.b8 104 +.b8 105 +.b8 106 +.b8 51 +.b8 104 +.b8 109 +.b8 108 +.b8 111 +.b8 117 +.b8 109 +.b8 120 +.b8 100 +.b8 109 +.b8 104 +.b8 117 +.b8 101 +.b8 122 +.b8 115 +.b8 121 +.b8 104 +.b8 107 +.b8 109 +.b8 110 +.b8 113 +.b8 103 +.b8 110 +.b8 102 +.b8 97 +.b8 53 +.b8 105 +.b8 118 +.b8 114 +.b8 101 +.b8 50 +.b8 55 +.b8 117 +.b8 111 +.b8 115 +.b8 121 +.b8 109 +.b8 97 +.b8 109 +.b8 51 +.b8 100 +.b8 114 +.b8 55 +.b8 97 +.b8 53 +.b8 120 +.b8 98 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 50 +.b8 104 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.source b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.source new file mode 100644 index 0000000000000000000000000000000000000000..b94eb6d636bfed0194580b375716177991cbaafa --- /dev/null +++ b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.source @@ -0,0 +1,415 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0) +#loc99 = loc("in_ptr0"(#loc)) +#loc100 = loc("in_ptr1"(#loc)) +#loc101 = loc("in_ptr2"(#loc)) +#loc102 = loc("in_ptr3"(#loc)) +#loc103 = loc("in_ptr4"(#loc)) +#loc104 = loc("in_ptr5"(#loc)) +#loc105 = loc("out_ptr0"(#loc)) +#loc106 = loc("ynumel"(#loc)) +#loc107 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %ynumel_0 = arith.constant 73728 : i32 loc(#loc108) + %xnumel_1 = arith.constant 128 : i32 loc(#loc109) + %yoffset = tt.get_program_id y : i32 loc(#loc110) + %yoffset_2 = tt.get_program_id z : i32 loc(#loc111) + %yoffset_3 = tt.get_num_programs y : i32 loc(#loc112) + %yoffset_4 = arith.muli %yoffset_2, %yoffset_3 : i32 loc(#loc113) + %yoffset_5 = arith.addi %yoffset, %yoffset_4 : i32 loc(#loc114) + %yoffset_6 = arith.constant 64 : i32 loc(#loc115) + %yoffset_7 = arith.constant 64 : i32 loc(#loc115) + %yoffset_8 = arith.muli %yoffset_5, %yoffset_7 : i32 loc(#loc115) + %yindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc116) + %yindex_9 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc117) + %yindex_10 = tt.splat %yoffset_8 : i32 -> tensor<64x1xi32> loc(#loc118) + %yindex_11 = arith.addi %yindex_10, %yindex_9 : tensor<64x1xi32> loc(#loc118) + %ymask = arith.constant dense<73728> : tensor<64x1xi32> loc(#loc119) + %ymask_12 = arith.cmpi slt, %yindex_11, %ymask : tensor<64x1xi32> loc(#loc119) + %xoffset = tt.get_program_id x : i32 loc(#loc120) + %xoffset_13 = arith.constant 64 : i32 loc(#loc121) + %xoffset_14 = arith.constant 64 : i32 loc(#loc121) + %xoffset_15 = arith.muli %xoffset, %xoffset_14 : i32 loc(#loc121) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc122) + %xindex_16 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc123) + %xindex_17 = tt.splat %xoffset_15 : i32 -> tensor<1x64xi32> loc(#loc124) + %xindex_18 = arith.addi %xindex_17, %xindex_16 : tensor<1x64xi32> loc(#loc124) + %xmask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc125) + %xmask_19 = arith.cmpi slt, %xindex_18, %xmask : tensor<1x64xi32> loc(#loc125) + %y1 = arith.constant 32 : i32 loc(#loc126) + %y1_20 = arith.constant 32 : i32 loc(#loc126) + %y1_21 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc126) + %y1_22 = arith.divsi %yindex_11, %y1_21 : tensor<64x1xi32> loc(#loc126) + %y0 = arith.constant 32 : i32 loc(#loc127) + %y0_23 = arith.constant 32 : i32 loc(#loc127) + %y0_24 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc127) + %y0_25 = arith.remsi %yindex_11, %y0_24 : tensor<64x1xi32> loc(#loc127) + %tmp1 = arith.constant 0 : i64 loc(#loc128) + %tmp1_26 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc128) + %tmp2 = arith.extsi %y1_22 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc129) + %tmp2_27 = arith.constant dense<0> : tensor<64x1xi64> loc(#loc129) + %tmp2_28 = arith.cmpi sge, %tmp2, %tmp2_27 : tensor<64x1xi64> loc(#loc129) + %tmp3 = arith.constant 256 : i64 loc(#loc130) + %tmp3_29 = arith.constant dense<256> : tensor<1x1xi64> loc(#loc130) + %tmp4 = arith.extsi %y1_22 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc131) + %tmp4_30 = arith.constant dense<256> : tensor<64x1xi64> loc(#loc131) + %tmp4_31 = arith.cmpi slt, %tmp4, %tmp4_30 : tensor<64x1xi64> loc(#loc131) + %tmp5 = arith.constant 128 : i32 loc(#loc132) + %tmp5_32 = arith.constant 128 : i32 loc(#loc132) + %tmp5_33 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc132) + %tmp5_34 = arith.muli %tmp5_33, %y0_25 : tensor<64x1xi32> loc(#loc132) + %tmp5_35 = tt.broadcast %xindex_18 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc133) + %tmp5_36 = tt.broadcast %tmp5_34 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc133) + %tmp5_37 = arith.addi %tmp5_35, %tmp5_36 : tensor<64x64xi32> loc(#loc133) + %tmp5_38 = arith.constant 12288 : i32 loc(#loc134) + %tmp5_39 = arith.constant 12288 : i32 loc(#loc134) + %tmp5_40 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc134) + %tmp5_41 = arith.muli %tmp5_40, %y1_22 : tensor<64x1xi32> loc(#loc134) + %tmp5_42 = tt.broadcast %tmp5_41 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc135) + %tmp5_43 = arith.addi %tmp5_37, %tmp5_42 : tensor<64x64xi32> loc(#loc135) + %tmp5_44 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc136) + %tmp5_45 = tt.addptr %tmp5_44, %tmp5_43 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc136) + %tmp5_46 = tt.broadcast %tmp4_31 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc137) + %tmp5_47 = tt.broadcast %xmask_19 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc137) + %tmp5_48 = arith.andi %tmp5_46, %tmp5_47 : tensor<64x64xi1> loc(#loc137) + %tmp5_49 = tt.broadcast %ymask_12 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc138) + %tmp5_50 = arith.andi %tmp5_48, %tmp5_49 : tensor<64x64xi1> loc(#loc138) + %tmp5_51 = arith.constant 0.000000e+00 : f32 loc(#loc139) + %tmp5_52 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc139) + %tmp5_53 = arith.truncf %tmp5_52 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc139) + %tmp5_54 = tt.load %tmp5_45, %tmp5_50, %tmp5_53 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc139) + %tmp5_55 = arith.extf %tmp5_54 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc140) + %tmp7 = arith.constant 32 : i32 loc(#loc141) + %tmp7_56 = arith.constant 32 : i32 loc(#loc141) + %tmp7_57 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc141) + %tmp7_58 = arith.muli %tmp7_57, %y1_22 : tensor<64x1xi32> loc(#loc141) + %tmp7_59 = arith.addi %y0_25, %tmp7_58 : tensor<64x1xi32> loc(#loc142) + %tmp7_60 = tt.broadcast %tmp7_59 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc143) + %tmp7_61 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc144) + %tmp7_62 = tt.addptr %tmp7_61, %tmp7_60 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc144) + %tmp7_63 = tt.broadcast %tmp4_31 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc145) + %tmp7_64 = tt.broadcast %xmask_19 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc145) + %tmp7_65 = arith.andi %tmp7_63, %tmp7_64 : tensor<64x64xi1> loc(#loc145) + %tmp7_66 = tt.broadcast %ymask_12 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc146) + %tmp7_67 = arith.andi %tmp7_65, %tmp7_66 : tensor<64x64xi1> loc(#loc146) + %tmp7_68 = arith.constant 0.000000e+00 : f32 loc(#loc147) + %tmp7_69 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc147) + %tmp7_70 = tt.load %tmp7_62, %tmp7_67, %tmp7_69 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc147) + %tmp8 = arith.constant 1.280000e+02 : f32 loc(#loc148) + %tmp9 = arith.constant dense<1.280000e+02> : tensor<64x64xf32> loc(#loc149) + %tmp9_71 = arith.divf %tmp7_70, %tmp9 : tensor<64x64xf32> loc(#loc149) + %tmp10 = arith.constant 9.99999997E-7 : f32 loc(#loc150) + %tmp11 = arith.constant dense<9.99999997E-7> : tensor<64x64xf32> loc(#loc151) + %tmp11_72 = arith.addf %tmp9_71, %tmp11 : tensor<64x64xf32> loc(#loc151) + %tmp12 = tt.extern_elementwise %tmp11_72 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x64xf32>) -> tensor<64x64xf32> loc(#loc152) + %tmp13 = arith.mulf %tmp5_55, %tmp12 : tensor<64x64xf32> loc(#loc153) + %tmp14 = tt.broadcast %xindex_18 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc154) + %tmp14_73 = tt.splat %in_ptr2 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc155) + %tmp14_74 = tt.addptr %tmp14_73, %tmp14 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc155) + %tmp14_75 = tt.broadcast %tmp4_31 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc156) + %tmp14_76 = tt.broadcast %xmask_19 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc156) + %tmp14_77 = arith.andi %tmp14_75, %tmp14_76 : tensor<64x64xi1> loc(#loc156) + %tmp14_78 = tt.broadcast %ymask_12 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc157) + %tmp14_79 = arith.andi %tmp14_77, %tmp14_78 : tensor<64x64xi1> loc(#loc157) + %tmp14_80 = arith.constant 0.000000e+00 : f32 loc(#loc158) + %tmp14_81 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc158) + %tmp14_82 = arith.truncf %tmp14_81 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc158) + %tmp14_83 = tt.load %tmp14_74, %tmp14_79, %tmp14_82 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc158) + %tmp14_84 = arith.extf %tmp14_83 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc159) + %tmp16 = arith.mulf %tmp13, %tmp14_84 : tensor<64x64xf32> loc(#loc160) + %tmp18 = arith.constant 0.000000e+00 : f32 loc(#loc161) + %tmp18_85 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc161) + %tmp19 = tt.broadcast %tmp4_31 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc162) + %tmp19_86 = arith.select %tmp19, %tmp16, %tmp18_85 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc162) + %tmp20 = arith.extsi %y1_22 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc163) + %tmp20_87 = arith.constant dense<256> : tensor<64x1xi64> loc(#loc163) + %tmp20_88 = arith.cmpi sge, %tmp20, %tmp20_87 : tensor<64x1xi64> loc(#loc163) + %tmp21 = arith.constant 2304 : i64 loc(#loc164) + %tmp21_89 = arith.constant dense<2304> : tensor<1x1xi64> loc(#loc164) + %tmp22 = arith.extsi %y1_22 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc165) + %tmp22_90 = arith.constant dense<2304> : tensor<64x1xi64> loc(#loc165) + %tmp22_91 = arith.cmpi slt, %tmp22, %tmp22_90 : tensor<64x1xi64> loc(#loc165) + %tmp23 = arith.constant 128 : i32 loc(#loc166) + %tmp23_92 = arith.constant 128 : i32 loc(#loc166) + %tmp23_93 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc166) + %tmp23_94 = arith.muli %tmp23_93, %y0_25 : tensor<64x1xi32> loc(#loc166) + %tmp23_95 = tt.broadcast %xindex_18 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc167) + %tmp23_96 = tt.broadcast %tmp23_94 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc167) + %tmp23_97 = arith.addi %tmp23_95, %tmp23_96 : tensor<64x64xi32> loc(#loc167) + %tmp23_98 = arith.constant -256 : i32 loc(#loc168) + %tmp23_99 = arith.constant -256 : i32 loc(#loc168) + %tmp23_100 = arith.constant dense<-256> : tensor<64x1xi32> loc(#loc168) + %tmp23_101 = arith.addi %tmp23_100, %y1_22 : tensor<64x1xi32> loc(#loc168) + %tmp23_102 = arith.constant 12288 : i32 loc(#loc169) + %tmp23_103 = arith.constant 12288 : i32 loc(#loc169) + %tmp23_104 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc169) + %tmp23_105 = arith.muli %tmp23_104, %tmp23_101 : tensor<64x1xi32> loc(#loc169) + %tmp23_106 = tt.broadcast %tmp23_105 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc170) + %tmp23_107 = arith.addi %tmp23_97, %tmp23_106 : tensor<64x64xi32> loc(#loc170) + %tmp23_108 = tt.splat %in_ptr3 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc171) + %tmp23_109 = tt.addptr %tmp23_108, %tmp23_107 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc171) + %tmp23_110 = tt.broadcast %tmp20_88 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc172) + %tmp23_111 = tt.broadcast %xmask_19 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc172) + %tmp23_112 = arith.andi %tmp23_110, %tmp23_111 : tensor<64x64xi1> loc(#loc172) + %tmp23_113 = tt.broadcast %ymask_12 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc173) + %tmp23_114 = arith.andi %tmp23_112, %tmp23_113 : tensor<64x64xi1> loc(#loc173) + %tmp23_115 = arith.constant 0.000000e+00 : f32 loc(#loc174) + %tmp23_116 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc174) + %tmp23_117 = arith.truncf %tmp23_116 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc174) + %tmp23_118 = tt.load %tmp23_109, %tmp23_114, %tmp23_117 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc174) + %tmp23_119 = arith.extf %tmp23_118 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc175) + %tmp25 = arith.constant -256 : i32 loc(#loc176) + %tmp25_120 = arith.constant -256 : i32 loc(#loc176) + %tmp25_121 = arith.constant dense<-256> : tensor<64x1xi32> loc(#loc176) + %tmp25_122 = arith.addi %tmp25_121, %y1_22 : tensor<64x1xi32> loc(#loc176) + %tmp25_123 = arith.constant 32 : i32 loc(#loc177) + %tmp25_124 = arith.constant 32 : i32 loc(#loc177) + %tmp25_125 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc177) + %tmp25_126 = arith.muli %tmp25_125, %tmp25_122 : tensor<64x1xi32> loc(#loc177) + %tmp25_127 = arith.addi %y0_25, %tmp25_126 : tensor<64x1xi32> loc(#loc178) + %tmp25_128 = tt.broadcast %tmp25_127 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc179) + %tmp25_129 = tt.splat %in_ptr4 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc180) + %tmp25_130 = tt.addptr %tmp25_129, %tmp25_128 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc180) + %tmp25_131 = tt.broadcast %tmp20_88 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc181) + %tmp25_132 = tt.broadcast %xmask_19 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc181) + %tmp25_133 = arith.andi %tmp25_131, %tmp25_132 : tensor<64x64xi1> loc(#loc181) + %tmp25_134 = tt.broadcast %ymask_12 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc182) + %tmp25_135 = arith.andi %tmp25_133, %tmp25_134 : tensor<64x64xi1> loc(#loc182) + %tmp25_136 = arith.constant 0.000000e+00 : f32 loc(#loc183) + %tmp25_137 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc183) + %tmp25_138 = tt.load %tmp25_130, %tmp25_135, %tmp25_137 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc183) + %tmp26 = arith.constant 1.280000e+02 : f32 loc(#loc184) + %tmp27 = arith.constant dense<1.280000e+02> : tensor<64x64xf32> loc(#loc185) + %tmp27_139 = arith.divf %tmp25_138, %tmp27 : tensor<64x64xf32> loc(#loc185) + %tmp28 = arith.constant 9.99999997E-7 : f32 loc(#loc186) + %tmp29 = arith.constant dense<9.99999997E-7> : tensor<64x64xf32> loc(#loc187) + %tmp29_140 = arith.addf %tmp27_139, %tmp29 : tensor<64x64xf32> loc(#loc187) + %tmp30 = tt.extern_elementwise %tmp29_140 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x64xf32>) -> tensor<64x64xf32> loc(#loc188) + %tmp31 = arith.mulf %tmp23_119, %tmp30 : tensor<64x64xf32> loc(#loc189) + %tmp32 = tt.broadcast %xindex_18 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc190) + %tmp32_141 = tt.splat %in_ptr5 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc191) + %tmp32_142 = tt.addptr %tmp32_141, %tmp32 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc191) + %tmp32_143 = tt.broadcast %tmp20_88 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc192) + %tmp32_144 = tt.broadcast %xmask_19 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc192) + %tmp32_145 = arith.andi %tmp32_143, %tmp32_144 : tensor<64x64xi1> loc(#loc192) + %tmp32_146 = tt.broadcast %ymask_12 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc193) + %tmp32_147 = arith.andi %tmp32_145, %tmp32_146 : tensor<64x64xi1> loc(#loc193) + %tmp32_148 = arith.constant 0.000000e+00 : f32 loc(#loc194) + %tmp32_149 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc194) + %tmp32_150 = arith.truncf %tmp32_149 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc194) + %tmp32_151 = tt.load %tmp32_142, %tmp32_147, %tmp32_150 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc194) + %tmp32_152 = arith.extf %tmp32_151 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc195) + %tmp34 = arith.mulf %tmp31, %tmp32_152 : tensor<64x64xf32> loc(#loc196) + %tmp36 = arith.constant 0.000000e+00 : f32 loc(#loc197) + %tmp36_153 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc197) + %tmp37 = tt.broadcast %tmp20_88 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc198) + %tmp37_154 = arith.select %tmp37, %tmp34, %tmp36_153 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc198) + %tmp38 = tt.broadcast %tmp4_31 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc199) + %tmp38_155 = arith.select %tmp38, %tmp19_86, %tmp37_154 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc199) + %c128_i32 = arith.constant 128 : i32 loc(#loc93) + %c128_i32_156 = arith.constant 128 : i32 loc(#loc93) + %cst = arith.constant dense<128> : tensor<64x1xi32> loc(#loc93) + %0 = arith.muli %cst, %yindex_11 : tensor<64x1xi32> loc(#loc93) + %1 = tt.broadcast %xindex_18 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc94) + %2 = tt.broadcast %0 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc94) + %3 = arith.addi %1, %2 : tensor<64x64xi32> loc(#loc94) + %4 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc95) + %5 = tt.addptr %4, %3 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc95) + %6 = tt.broadcast %xmask_19 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc96) + %7 = tt.broadcast %ymask_12 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc96) + %8 = arith.andi %6, %7 : tensor<64x64xi1> loc(#loc96) + %9 = arith.truncf %tmp38_155 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc97) + tt.store %5, %9, %8 : tensor<64x64x!tt.ptr> loc(#loc97) + tt.return loc(#loc98) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":20:13) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:36) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:36) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":32:30) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":33:19) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":34:32) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:60) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:87) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:95) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":39:11) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":41:12) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:51) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:78) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:86) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":49:38) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":52:34) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":53:19) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:40) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:36) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:65) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:70) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:98) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:106) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":57:12) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":59:12) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:51) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:79) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:87) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":67:38) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4) +#loc108 = loc("ynumel"(#loc1)) +#loc109 = loc("xnumel"(#loc2)) +#loc110 = loc("yoffset"(#loc3)) +#loc111 = loc("yoffset"(#loc4)) +#loc112 = loc("yoffset"(#loc5)) +#loc113 = loc("yoffset"(#loc6)) +#loc114 = loc("yoffset"(#loc7)) +#loc115 = loc("yoffset"(#loc8)) +#loc116 = loc("yindex"(#loc9)) +#loc117 = loc("yindex"(#loc10)) +#loc118 = loc("yindex"(#loc11)) +#loc119 = loc("ymask"(#loc12)) +#loc120 = loc("xoffset"(#loc13)) +#loc121 = loc("xoffset"(#loc14)) +#loc122 = loc("xindex"(#loc15)) +#loc123 = loc("xindex"(#loc16)) +#loc124 = loc("xindex"(#loc17)) +#loc125 = loc("xmask"(#loc18)) +#loc126 = loc("y1"(#loc19)) +#loc127 = loc("y0"(#loc20)) +#loc128 = loc("tmp1"(#loc21)) +#loc129 = loc("tmp2"(#loc22)) +#loc130 = loc("tmp3"(#loc23)) +#loc131 = loc("tmp4"(#loc24)) +#loc132 = loc("tmp5"(#loc25)) +#loc133 = loc("tmp5"(#loc26)) +#loc134 = loc("tmp5"(#loc27)) +#loc135 = loc("tmp5"(#loc28)) +#loc136 = loc("tmp5"(#loc29)) +#loc137 = loc("tmp5"(#loc30)) +#loc138 = loc("tmp5"(#loc31)) +#loc139 = loc("tmp5"(#loc32)) +#loc140 = loc("tmp5"(#loc33)) +#loc141 = loc("tmp7"(#loc34)) +#loc142 = loc("tmp7"(#loc35)) +#loc143 = loc("tmp7"(#loc36)) +#loc144 = loc("tmp7"(#loc37)) +#loc145 = loc("tmp7"(#loc38)) +#loc146 = loc("tmp7"(#loc39)) +#loc147 = loc("tmp7"(#loc40)) +#loc148 = loc("tmp8"(#loc41)) +#loc149 = loc("tmp9"(#loc42)) +#loc150 = loc("tmp10"(#loc43)) +#loc151 = loc("tmp11"(#loc44)) +#loc152 = loc("tmp12"(#loc45)) +#loc153 = loc("tmp13"(#loc46)) +#loc154 = loc("tmp14"(#loc47)) +#loc155 = loc("tmp14"(#loc48)) +#loc156 = loc("tmp14"(#loc49)) +#loc157 = loc("tmp14"(#loc50)) +#loc158 = loc("tmp14"(#loc51)) +#loc159 = loc("tmp14"(#loc52)) +#loc160 = loc("tmp16"(#loc53)) +#loc161 = loc("tmp18"(#loc54)) +#loc162 = loc("tmp19"(#loc55)) +#loc163 = loc("tmp20"(#loc56)) +#loc164 = loc("tmp21"(#loc57)) +#loc165 = loc("tmp22"(#loc58)) +#loc166 = loc("tmp23"(#loc59)) +#loc167 = loc("tmp23"(#loc60)) +#loc168 = loc("tmp23"(#loc61)) +#loc169 = loc("tmp23"(#loc62)) +#loc170 = loc("tmp23"(#loc63)) +#loc171 = loc("tmp23"(#loc64)) +#loc172 = loc("tmp23"(#loc65)) +#loc173 = loc("tmp23"(#loc66)) +#loc174 = loc("tmp23"(#loc67)) +#loc175 = loc("tmp23"(#loc68)) +#loc176 = loc("tmp25"(#loc69)) +#loc177 = loc("tmp25"(#loc70)) +#loc178 = loc("tmp25"(#loc71)) +#loc179 = loc("tmp25"(#loc72)) +#loc180 = loc("tmp25"(#loc73)) +#loc181 = loc("tmp25"(#loc74)) +#loc182 = loc("tmp25"(#loc75)) +#loc183 = loc("tmp25"(#loc76)) +#loc184 = loc("tmp26"(#loc77)) +#loc185 = loc("tmp27"(#loc78)) +#loc186 = loc("tmp28"(#loc79)) +#loc187 = loc("tmp29"(#loc80)) +#loc188 = loc("tmp30"(#loc81)) +#loc189 = loc("tmp31"(#loc82)) +#loc190 = loc("tmp32"(#loc83)) +#loc191 = loc("tmp32"(#loc84)) +#loc192 = loc("tmp32"(#loc85)) +#loc193 = loc("tmp32"(#loc86)) +#loc194 = loc("tmp32"(#loc87)) +#loc195 = loc("tmp32"(#loc88)) +#loc196 = loc("tmp34"(#loc89)) +#loc197 = loc("tmp36"(#loc90)) +#loc198 = loc("tmp37"(#loc91)) +#loc199 = loc("tmp38"(#loc92)) diff --git a/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..77ad0ab5b1f794f85b02769d464f0854ffadfa5a --- /dev/null +++ b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir @@ -0,0 +1,287 @@ +#blocked = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 8], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [8, 1], order = [1, 0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("in_ptr1"(#loc)) +#loc72 = loc("in_ptr2"(#loc)) +#loc73 = loc("in_ptr3"(#loc)) +#loc74 = loc("in_ptr4"(#loc)) +#loc75 = loc("in_ptr5"(#loc)) +#loc76 = loc("out_ptr0"(#loc)) +#loc77 = loc("ynumel"(#loc)) +#loc78 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<-256> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<-256> : tensor<64x1xi32, #blocked1> loc(#loc1) + %cst_1 = arith.constant dense<12288> : tensor<64x1xi32, #blocked1> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<64x1xi32, #blocked1> loc(#loc1) + %cst_3 = arith.constant dense<256> : tensor<64x1xi64, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<256> : tensor<64x1xi64, #blocked1> loc(#loc1) + %cst_5 = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<32> : tensor<64x1xi32, #blocked1> loc(#loc1) + %cst_7 = arith.constant dense<128> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_8 = arith.constant dense<128> : tensor<1x64xi32, #blocked1> loc(#loc1) + %cst_9 = arith.constant dense<73728> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_10 = arith.constant dense<73728> : tensor<64x1xi32, #blocked1> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<64x64xbf16, #blocked1> loc(#loc1) + %cst_12 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked> loc(#loc1) + %cst_13 = arith.constant dense<9.99999997E-7> : tensor<64x64xf32, #blocked> loc(#loc1) + %cst_14 = arith.constant dense<1.280000e+02> : tensor<64x64xf32, #blocked> loc(#loc1) + %cst_15 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked1> loc(#loc1) + %yoffset = tt.get_program_id y : i32 loc(#loc79) + %yoffset_16 = tt.get_program_id z : i32 loc(#loc80) + %yoffset_17 = tt.get_num_programs y : i32 loc(#loc81) + %yoffset_18 = arith.muli %yoffset_16, %yoffset_17 : i32 loc(#loc82) + %yoffset_19 = arith.addi %yoffset, %yoffset_18 : i32 loc(#loc83) + %yoffset_20 = arith.muli %yoffset_19, %c64_i32 : i32 loc(#loc84) + %yindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc85) + %yindex_21 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc85) + %yindex_22 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc85) + %yindex_23 = tt.expand_dims %yindex_21 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc85) + %yindex_24 = tt.splat %yoffset_20 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc86) + %yindex_25 = tt.splat %yoffset_20 : i32 -> tensor<64x1xi32, #blocked> loc(#loc86) + %yindex_26 = arith.addi %yindex_24, %yindex_22 : tensor<64x1xi32, #blocked1> loc(#loc86) + %yindex_27 = arith.addi %yindex_25, %yindex_23 : tensor<64x1xi32, #blocked> loc(#loc86) + %ymask = arith.cmpi slt, %yindex_26, %cst_10 : tensor<64x1xi32, #blocked1> loc(#loc87) + %ymask_28 = arith.cmpi slt, %yindex_27, %cst_9 : tensor<64x1xi32, #blocked> loc(#loc87) + %xoffset = tt.get_program_id x : i32 loc(#loc88) + %xoffset_29 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc89) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc90) + %xindex_30 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc90) + %xindex_31 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc90) + %xindex_32 = tt.expand_dims %xindex_30 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> loc(#loc90) + %xindex_33 = tt.splat %xoffset_29 : i32 -> tensor<1x64xi32, #blocked1> loc(#loc91) + %xindex_34 = tt.splat %xoffset_29 : i32 -> tensor<1x64xi32, #blocked> loc(#loc91) + %xindex_35 = arith.addi %xindex_33, %xindex_31 : tensor<1x64xi32, #blocked1> loc(#loc91) + %xindex_36 = arith.addi %xindex_34, %xindex_32 : tensor<1x64xi32, #blocked> loc(#loc91) + %xmask = arith.cmpi slt, %xindex_35, %cst_8 : tensor<1x64xi32, #blocked1> loc(#loc92) + %xmask_37 = arith.cmpi slt, %xindex_36, %cst_7 : tensor<1x64xi32, #blocked> loc(#loc92) + %y1 = arith.divsi %yindex_26, %cst_6 : tensor<64x1xi32, #blocked1> loc(#loc93) + %y1_38 = arith.divsi %yindex_27, %cst_5 : tensor<64x1xi32, #blocked> loc(#loc93) + %y0 = arith.remsi %yindex_26, %cst_6 : tensor<64x1xi32, #blocked1> loc(#loc94) + %y0_39 = arith.remsi %yindex_27, %cst_5 : tensor<64x1xi32, #blocked> loc(#loc94) + %tmp4 = arith.extsi %y1 : tensor<64x1xi32, #blocked1> to tensor<64x1xi64, #blocked1> loc(#loc95) + %tmp4_40 = arith.extsi %y1_38 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked> loc(#loc95) + %tmp4_41 = arith.cmpi slt, %tmp4, %cst_4 : tensor<64x1xi64, #blocked1> loc(#loc95) + %tmp4_42 = arith.cmpi slt, %tmp4_40, %cst_3 : tensor<64x1xi64, #blocked> loc(#loc95) + %tmp5 = arith.muli %y0, %cst_2 : tensor<64x1xi32, #blocked1> loc(#loc96) + %tmp5_43 = tt.broadcast %xindex_35 : tensor<1x64xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc97) + %tmp5_44 = tt.broadcast %tmp5 : tensor<64x1xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc97) + %tmp5_45 = arith.addi %tmp5_43, %tmp5_44 : tensor<64x64xi32, #blocked1> loc(#loc97) + %tmp5_46 = arith.muli %y1, %cst_1 : tensor<64x1xi32, #blocked1> loc(#loc98) + %tmp5_47 = tt.broadcast %tmp5_46 : tensor<64x1xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc99) + %tmp5_48 = arith.addi %tmp5_45, %tmp5_47 : tensor<64x64xi32, #blocked1> loc(#loc99) + %tmp5_49 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked1> loc(#loc100) + %tmp5_50 = tt.addptr %tmp5_49, %tmp5_48 : tensor<64x64x!tt.ptr, #blocked1>, tensor<64x64xi32, #blocked1> loc(#loc100) + %tmp5_51 = tt.broadcast %tmp4_41 : tensor<64x1xi1, #blocked1> -> tensor<64x64xi1, #blocked1> loc(#loc101) + %tmp5_52 = tt.broadcast %tmp4_42 : tensor<64x1xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc101) + %tmp5_53 = tt.broadcast %xmask : tensor<1x64xi1, #blocked1> -> tensor<64x64xi1, #blocked1> loc(#loc101) + %tmp5_54 = tt.broadcast %xmask_37 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc101) + %tmp5_55 = arith.andi %tmp5_51, %tmp5_53 : tensor<64x64xi1, #blocked1> loc(#loc101) + %tmp5_56 = arith.andi %tmp5_52, %tmp5_54 : tensor<64x64xi1, #blocked> loc(#loc101) + %tmp5_57 = tt.broadcast %ymask : tensor<64x1xi1, #blocked1> -> tensor<64x64xi1, #blocked1> loc(#loc102) + %tmp5_58 = tt.broadcast %ymask_28 : tensor<64x1xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc102) + %tmp5_59 = arith.andi %tmp5_55, %tmp5_57 : tensor<64x64xi1, #blocked1> loc(#loc102) + %tmp5_60 = arith.andi %tmp5_56, %tmp5_58 : tensor<64x64xi1, #blocked> loc(#loc102) + %tmp5_61 = tt.load %tmp5_50, %tmp5_59, %cst_11 evictionPolicy = evict_last : tensor<64x64x!tt.ptr, #blocked1> loc(#loc103) + %tmp5_62 = ttg.convert_layout %tmp5_61 : tensor<64x64xbf16, #blocked1> -> tensor<64x64xbf16, #blocked> loc(#loc104) + %tmp5_63 = arith.extf %tmp5_62 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc104) + %tmp7 = arith.muli %y1_38, %cst_5 : tensor<64x1xi32, #blocked> loc(#loc105) + %tmp7_64 = arith.addi %y0_39, %tmp7 : tensor<64x1xi32, #blocked> loc(#loc106) + %tmp7_65 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked> loc(#loc107) + %tmp7_66 = tt.addptr %tmp7_65, %tmp7_64 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc107) + %tmp7_67 = tt.broadcast %tmp7_66 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x64x!tt.ptr, #blocked> loc(#loc107) + %tmp7_68 = tt.load %tmp7_67, %tmp5_60, %cst_12 evictionPolicy = evict_last : tensor<64x64x!tt.ptr, #blocked> loc(#loc108) + %tmp9 = arith.divf %tmp7_68, %cst_14 : tensor<64x64xf32, #blocked> loc(#loc109) + %tmp11 = arith.addf %tmp9, %cst_13 : tensor<64x64xf32, #blocked> loc(#loc110) + %tmp12 = tt.extern_elementwise %tmp11 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x64xf32, #blocked>) -> tensor<64x64xf32, #blocked> loc(#loc111) + %tmp13 = arith.mulf %tmp5_63, %tmp12 : tensor<64x64xf32, #blocked> loc(#loc112) + %tmp13_69 = ttg.convert_layout %tmp13 : tensor<64x64xf32, #blocked> -> tensor<64x64xf32, #blocked1> loc(#loc112) + %tmp14 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked1> loc(#loc113) + %tmp14_70 = tt.addptr %tmp14, %xindex_35 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc113) + %tmp14_71 = tt.broadcast %tmp14_70 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<64x64x!tt.ptr, #blocked1> loc(#loc113) + %tmp14_72 = tt.load %tmp14_71, %tmp5_59, %cst_11 evictionPolicy = evict_last : tensor<64x64x!tt.ptr, #blocked1> loc(#loc114) + %tmp14_73 = arith.extf %tmp14_72 : tensor<64x64xbf16, #blocked1> to tensor<64x64xf32, #blocked1> loc(#loc115) + %tmp16 = arith.mulf %tmp13_69, %tmp14_73 : tensor<64x64xf32, #blocked1> loc(#loc116) + %tmp20 = arith.cmpi sge, %tmp4, %cst_4 : tensor<64x1xi64, #blocked1> loc(#loc117) + %tmp20_74 = arith.cmpi sge, %tmp4_40, %cst_3 : tensor<64x1xi64, #blocked> loc(#loc117) + %tmp23 = arith.addi %y1, %cst_0 : tensor<64x1xi32, #blocked1> loc(#loc118) + %tmp23_75 = arith.addi %y1_38, %cst : tensor<64x1xi32, #blocked> loc(#loc118) + %tmp23_76 = arith.muli %tmp23, %cst_1 : tensor<64x1xi32, #blocked1> loc(#loc119) + %tmp23_77 = tt.broadcast %tmp23_76 : tensor<64x1xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc120) + %tmp23_78 = arith.addi %tmp5_45, %tmp23_77 : tensor<64x64xi32, #blocked1> loc(#loc120) + %tmp23_79 = tt.splat %in_ptr3 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked1> loc(#loc121) + %tmp23_80 = tt.addptr %tmp23_79, %tmp23_78 : tensor<64x64x!tt.ptr, #blocked1>, tensor<64x64xi32, #blocked1> loc(#loc121) + %tmp23_81 = tt.broadcast %tmp20 : tensor<64x1xi1, #blocked1> -> tensor<64x64xi1, #blocked1> loc(#loc122) + %tmp23_82 = tt.broadcast %tmp20_74 : tensor<64x1xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc122) + %tmp23_83 = arith.andi %tmp23_81, %tmp5_53 : tensor<64x64xi1, #blocked1> loc(#loc122) + %tmp23_84 = arith.andi %tmp23_82, %tmp5_54 : tensor<64x64xi1, #blocked> loc(#loc122) + %tmp23_85 = arith.andi %tmp23_83, %tmp5_57 : tensor<64x64xi1, #blocked1> loc(#loc123) + %tmp23_86 = arith.andi %tmp23_84, %tmp5_58 : tensor<64x64xi1, #blocked> loc(#loc123) + %tmp23_87 = tt.load %tmp23_80, %tmp23_85, %cst_11 evictionPolicy = evict_last : tensor<64x64x!tt.ptr, #blocked1> loc(#loc124) + %tmp23_88 = ttg.convert_layout %tmp23_87 : tensor<64x64xbf16, #blocked1> -> tensor<64x64xbf16, #blocked> loc(#loc125) + %tmp23_89 = arith.extf %tmp23_88 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc125) + %tmp25 = arith.muli %tmp23_75, %cst_5 : tensor<64x1xi32, #blocked> loc(#loc126) + %tmp25_90 = arith.addi %y0_39, %tmp25 : tensor<64x1xi32, #blocked> loc(#loc127) + %tmp25_91 = tt.splat %in_ptr4 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked> loc(#loc128) + %tmp25_92 = tt.addptr %tmp25_91, %tmp25_90 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc128) + %tmp25_93 = tt.broadcast %tmp25_92 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x64x!tt.ptr, #blocked> loc(#loc128) + %tmp25_94 = tt.load %tmp25_93, %tmp23_86, %cst_12 evictionPolicy = evict_last : tensor<64x64x!tt.ptr, #blocked> loc(#loc129) + %tmp27 = arith.divf %tmp25_94, %cst_14 : tensor<64x64xf32, #blocked> loc(#loc130) + %tmp29 = arith.addf %tmp27, %cst_13 : tensor<64x64xf32, #blocked> loc(#loc131) + %tmp30 = tt.extern_elementwise %tmp29 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x64xf32, #blocked>) -> tensor<64x64xf32, #blocked> loc(#loc132) + %tmp31 = arith.mulf %tmp23_89, %tmp30 : tensor<64x64xf32, #blocked> loc(#loc133) + %tmp31_95 = ttg.convert_layout %tmp31 : tensor<64x64xf32, #blocked> -> tensor<64x64xf32, #blocked1> loc(#loc133) + %tmp32 = tt.splat %in_ptr5 : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked1> loc(#loc134) + %tmp32_96 = tt.addptr %tmp32, %xindex_35 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc134) + %tmp32_97 = tt.broadcast %tmp32_96 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<64x64x!tt.ptr, #blocked1> loc(#loc134) + %tmp32_98 = tt.load %tmp32_97, %tmp23_85, %cst_11 evictionPolicy = evict_last : tensor<64x64x!tt.ptr, #blocked1> loc(#loc135) + %tmp32_99 = arith.extf %tmp32_98 : tensor<64x64xbf16, #blocked1> to tensor<64x64xf32, #blocked1> loc(#loc136) + %tmp34 = arith.mulf %tmp31_95, %tmp32_99 : tensor<64x64xf32, #blocked1> loc(#loc137) + %tmp37 = arith.select %tmp23_81, %tmp34, %cst_15 : tensor<64x64xi1, #blocked1>, tensor<64x64xf32, #blocked1> loc(#loc138) + %tmp38 = arith.select %tmp5_51, %tmp16, %tmp37 : tensor<64x64xi1, #blocked1>, tensor<64x64xf32, #blocked1> loc(#loc141) + %0 = arith.muli %yindex_26, %cst_2 : tensor<64x1xi32, #blocked1> loc(#loc64) + %1 = tt.broadcast %0 : tensor<64x1xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc65) + %2 = arith.addi %tmp5_43, %1 : tensor<64x64xi32, #blocked1> loc(#loc65) + %3 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked1> loc(#loc66) + %4 = tt.addptr %3, %2 : tensor<64x64x!tt.ptr, #blocked1>, tensor<64x64xi32, #blocked1> loc(#loc66) + %5 = arith.andi %tmp5_53, %tmp5_57 : tensor<64x64xi1, #blocked1> loc(#loc67) + %6 = arith.truncf %tmp38 : tensor<64x64xf32, #blocked1> to tensor<64x64xbf16, #blocked1> loc(#loc68) + tt.store %4, %6, %5 : tensor<64x64x!tt.ptr, #blocked1> loc(#loc68) + tt.return loc(#loc69) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4) +#loc79 = loc("yoffset"(#loc2)) +#loc80 = loc("yoffset"(#loc3)) +#loc81 = loc("yoffset"(#loc4)) +#loc82 = loc("yoffset"(#loc5)) +#loc83 = loc("yoffset"(#loc6)) +#loc84 = loc("yoffset"(#loc7)) +#loc85 = loc("yindex"(#loc8)) +#loc86 = loc("yindex"(#loc9)) +#loc87 = loc("ymask"(#loc10)) +#loc88 = loc("xoffset"(#loc11)) +#loc89 = loc("xoffset"(#loc12)) +#loc90 = loc("xindex"(#loc13)) +#loc91 = loc("xindex"(#loc14)) +#loc92 = loc("xmask"(#loc15)) +#loc93 = loc("y1"(#loc16)) +#loc94 = loc("y0"(#loc17)) +#loc95 = loc("tmp4"(#loc18)) +#loc96 = loc("tmp5"(#loc19)) +#loc97 = loc("tmp5"(#loc20)) +#loc98 = loc("tmp5"(#loc21)) +#loc99 = loc("tmp5"(#loc22)) +#loc100 = loc("tmp5"(#loc23)) +#loc101 = loc("tmp5"(#loc24)) +#loc102 = loc("tmp5"(#loc25)) +#loc103 = loc("tmp5"(#loc26)) +#loc104 = loc("tmp5"(#loc27)) +#loc105 = loc("tmp7"(#loc28)) +#loc106 = loc("tmp7"(#loc29)) +#loc107 = loc("tmp7"(#loc30)) +#loc108 = loc("tmp7"(#loc31)) +#loc109 = loc("tmp9"(#loc32)) +#loc110 = loc("tmp11"(#loc33)) +#loc111 = loc("tmp12"(#loc34)) +#loc112 = loc("tmp13"(#loc35)) +#loc113 = loc("tmp14"(#loc36)) +#loc114 = loc("tmp14"(#loc37)) +#loc115 = loc("tmp14"(#loc38)) +#loc116 = loc("tmp16"(#loc39)) +#loc117 = loc("tmp20"(#loc40)) +#loc118 = loc("tmp23"(#loc41)) +#loc119 = loc("tmp23"(#loc42)) +#loc120 = loc("tmp23"(#loc43)) +#loc121 = loc("tmp23"(#loc44)) +#loc122 = loc("tmp23"(#loc45)) +#loc123 = loc("tmp23"(#loc46)) +#loc124 = loc("tmp23"(#loc47)) +#loc125 = loc("tmp23"(#loc48)) +#loc126 = loc("tmp25"(#loc49)) +#loc127 = loc("tmp25"(#loc50)) +#loc128 = loc("tmp25"(#loc51)) +#loc129 = loc("tmp25"(#loc52)) +#loc130 = loc("tmp27"(#loc53)) +#loc131 = loc("tmp29"(#loc54)) +#loc132 = loc("tmp30"(#loc55)) +#loc133 = loc("tmp31"(#loc56)) +#loc134 = loc("tmp32"(#loc57)) +#loc135 = loc("tmp32"(#loc58)) +#loc136 = loc("tmp32"(#loc59)) +#loc137 = loc("tmp34"(#loc60)) +#loc138 = loc("tmp37"(#loc61)) +#loc139 = loc("tmp38"(#loc62)) +#loc140 = loc("tmp19"(#loc63)) +#loc141 = loc(fused[#loc139, #loc140]) diff --git a/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.ttir b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..cac973030c96375fbf414a39a695eb455cb30e8c --- /dev/null +++ b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.ttir @@ -0,0 +1,252 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0) +#loc71 = loc("in_ptr0"(#loc)) +#loc72 = loc("in_ptr1"(#loc)) +#loc73 = loc("in_ptr2"(#loc)) +#loc74 = loc("in_ptr3"(#loc)) +#loc75 = loc("in_ptr4"(#loc)) +#loc76 = loc("in_ptr5"(#loc)) +#loc77 = loc("out_ptr0"(#loc)) +#loc78 = loc("ynumel"(#loc)) +#loc79 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<64x64xbf16> loc(#loc1) + %cst_0 = arith.constant dense<-256> : tensor<64x1xi32> loc(#loc1) + %cst_1 = arith.constant dense<9.99999997E-7> : tensor<64x64xf32> loc(#loc1) + %cst_2 = arith.constant dense<1.280000e+02> : tensor<64x64xf32> loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc1) + %cst_4 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc1) + %cst_5 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1) + %cst_6 = arith.constant dense<256> : tensor<64x1xi64> loc(#loc1) + %cst_7 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc1) + %xmask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc80) + %ymask = arith.constant dense<73728> : tensor<64x1xi32> loc(#loc81) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %yoffset = tt.get_program_id y : i32 loc(#loc82) + %yoffset_8 = tt.get_program_id z : i32 loc(#loc83) + %yoffset_9 = tt.get_num_programs y : i32 loc(#loc84) + %yoffset_10 = arith.muli %yoffset_8, %yoffset_9 : i32 loc(#loc85) + %yoffset_11 = arith.addi %yoffset, %yoffset_10 : i32 loc(#loc86) + %yoffset_12 = arith.muli %yoffset_11, %c64_i32 : i32 loc(#loc87) + %yindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc88) + %yindex_13 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc89) + %yindex_14 = tt.splat %yoffset_12 : i32 -> tensor<64x1xi32> loc(#loc90) + %yindex_15 = arith.addi %yindex_14, %yindex_13 : tensor<64x1xi32> loc(#loc90) + %ymask_16 = arith.cmpi slt, %yindex_15, %ymask : tensor<64x1xi32> loc(#loc81) + %xoffset = tt.get_program_id x : i32 loc(#loc91) + %xoffset_17 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc92) + %xindex = tt.expand_dims %yindex {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc93) + %xindex_18 = tt.splat %xoffset_17 : i32 -> tensor<1x64xi32> loc(#loc94) + %xindex_19 = arith.addi %xindex_18, %xindex : tensor<1x64xi32> loc(#loc94) + %xmask_20 = arith.cmpi slt, %xindex_19, %xmask : tensor<1x64xi32> loc(#loc80) + %y1 = arith.divsi %yindex_15, %cst_7 : tensor<64x1xi32> loc(#loc95) + %y0 = arith.remsi %yindex_15, %cst_7 : tensor<64x1xi32> loc(#loc96) + %tmp4 = arith.extsi %y1 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc97) + %tmp4_21 = arith.cmpi slt, %tmp4, %cst_6 : tensor<64x1xi64> loc(#loc97) + %tmp5 = arith.muli %y0, %cst_5 : tensor<64x1xi32> loc(#loc98) + %tmp5_22 = tt.broadcast %xindex_19 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc99) + %tmp5_23 = tt.broadcast %tmp5 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc99) + %tmp5_24 = arith.addi %tmp5_22, %tmp5_23 : tensor<64x64xi32> loc(#loc99) + %tmp5_25 = arith.muli %y1, %cst_4 : tensor<64x1xi32> loc(#loc100) + %tmp5_26 = tt.broadcast %tmp5_25 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc101) + %tmp5_27 = arith.addi %tmp5_24, %tmp5_26 : tensor<64x64xi32> loc(#loc101) + %tmp5_28 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc102) + %tmp5_29 = tt.addptr %tmp5_28, %tmp5_27 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc102) + %tmp5_30 = tt.broadcast %tmp4_21 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc103) + %tmp5_31 = tt.broadcast %xmask_20 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc103) + %tmp5_32 = arith.andi %tmp5_30, %tmp5_31 : tensor<64x64xi1> loc(#loc103) + %tmp5_33 = tt.broadcast %ymask_16 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc104) + %tmp5_34 = arith.andi %tmp5_32, %tmp5_33 : tensor<64x64xi1> loc(#loc104) + %tmp5_35 = tt.load %tmp5_29, %tmp5_34, %cst evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc105) + %tmp5_36 = arith.extf %tmp5_35 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc106) + %tmp7 = arith.muli %y1, %cst_7 : tensor<64x1xi32> loc(#loc107) + %tmp7_37 = arith.addi %y0, %tmp7 : tensor<64x1xi32> loc(#loc108) + %tmp7_38 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc109) + %tmp7_39 = tt.addptr %tmp7_38, %tmp7_37 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc109) + %tmp7_40 = tt.broadcast %tmp7_39 : tensor<64x1x!tt.ptr> -> tensor<64x64x!tt.ptr> loc(#loc109) + %tmp7_41 = tt.load %tmp7_40, %tmp5_34, %cst_3 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc110) + %tmp9 = arith.divf %tmp7_41, %cst_2 : tensor<64x64xf32> loc(#loc111) + %tmp11 = arith.addf %tmp9, %cst_1 : tensor<64x64xf32> loc(#loc112) + %tmp12 = tt.extern_elementwise %tmp11 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x64xf32>) -> tensor<64x64xf32> loc(#loc113) + %tmp13 = arith.mulf %tmp5_36, %tmp12 : tensor<64x64xf32> loc(#loc114) + %tmp14 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc115) + %tmp14_42 = tt.addptr %tmp14, %xindex_19 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc115) + %tmp14_43 = tt.broadcast %tmp14_42 : tensor<1x64x!tt.ptr> -> tensor<64x64x!tt.ptr> loc(#loc115) + %tmp14_44 = tt.load %tmp14_43, %tmp5_34, %cst evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc116) + %tmp14_45 = arith.extf %tmp14_44 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc117) + %tmp16 = arith.mulf %tmp13, %tmp14_45 : tensor<64x64xf32> loc(#loc118) + %tmp19 = arith.select %tmp5_30, %tmp16, %cst_3 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc119) + %tmp20 = arith.cmpi sge, %tmp4, %cst_6 : tensor<64x1xi64> loc(#loc120) + %tmp23 = arith.addi %y1, %cst_0 : tensor<64x1xi32> loc(#loc121) + %tmp23_46 = arith.muli %tmp23, %cst_4 : tensor<64x1xi32> loc(#loc122) + %tmp23_47 = tt.broadcast %tmp23_46 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc123) + %tmp23_48 = arith.addi %tmp5_24, %tmp23_47 : tensor<64x64xi32> loc(#loc123) + %tmp23_49 = tt.splat %in_ptr3 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc124) + %tmp23_50 = tt.addptr %tmp23_49, %tmp23_48 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc124) + %tmp23_51 = tt.broadcast %tmp20 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc125) + %tmp23_52 = arith.andi %tmp23_51, %tmp5_31 : tensor<64x64xi1> loc(#loc125) + %tmp23_53 = arith.andi %tmp23_52, %tmp5_33 : tensor<64x64xi1> loc(#loc126) + %tmp23_54 = tt.load %tmp23_50, %tmp23_53, %cst evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc127) + %tmp23_55 = arith.extf %tmp23_54 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc128) + %tmp25 = arith.muli %tmp23, %cst_7 : tensor<64x1xi32> loc(#loc129) + %tmp25_56 = arith.addi %y0, %tmp25 : tensor<64x1xi32> loc(#loc130) + %tmp25_57 = tt.splat %in_ptr4 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc131) + %tmp25_58 = tt.addptr %tmp25_57, %tmp25_56 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc131) + %tmp25_59 = tt.broadcast %tmp25_58 : tensor<64x1x!tt.ptr> -> tensor<64x64x!tt.ptr> loc(#loc131) + %tmp25_60 = tt.load %tmp25_59, %tmp23_53, %cst_3 evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc132) + %tmp27 = arith.divf %tmp25_60, %cst_2 : tensor<64x64xf32> loc(#loc133) + %tmp29 = arith.addf %tmp27, %cst_1 : tensor<64x64xf32> loc(#loc134) + %tmp30 = tt.extern_elementwise %tmp29 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x64xf32>) -> tensor<64x64xf32> loc(#loc135) + %tmp31 = arith.mulf %tmp23_55, %tmp30 : tensor<64x64xf32> loc(#loc136) + %tmp32 = tt.splat %in_ptr5 : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc137) + %tmp32_61 = tt.addptr %tmp32, %xindex_19 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc137) + %tmp32_62 = tt.broadcast %tmp32_61 : tensor<1x64x!tt.ptr> -> tensor<64x64x!tt.ptr> loc(#loc137) + %tmp32_63 = tt.load %tmp32_62, %tmp23_53, %cst evictionPolicy = evict_last : tensor<64x64x!tt.ptr> loc(#loc138) + %tmp32_64 = arith.extf %tmp32_63 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc139) + %tmp34 = arith.mulf %tmp31, %tmp32_64 : tensor<64x64xf32> loc(#loc140) + %tmp37 = arith.select %tmp23_51, %tmp34, %cst_3 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc141) + %tmp38 = arith.select %tmp5_30, %tmp19, %tmp37 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc142) + %0 = arith.muli %yindex_15, %cst_5 : tensor<64x1xi32> loc(#loc65) + %1 = tt.broadcast %0 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc66) + %2 = arith.addi %tmp5_22, %1 : tensor<64x64xi32> loc(#loc66) + %3 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc67) + %4 = tt.addptr %3, %2 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc67) + %5 = arith.andi %tmp5_31, %tmp5_33 : tensor<64x64xi1> loc(#loc68) + %6 = arith.truncf %tmp38 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc69) + tt.store %4, %6, %5 : tensor<64x64x!tt.ptr> loc(#loc69) + tt.return loc(#loc70) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:36) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4) +#loc80 = loc("xmask"(#loc2)) +#loc81 = loc("ymask"(#loc3)) +#loc82 = loc("yoffset"(#loc4)) +#loc83 = loc("yoffset"(#loc5)) +#loc84 = loc("yoffset"(#loc6)) +#loc85 = loc("yoffset"(#loc7)) +#loc86 = loc("yoffset"(#loc8)) +#loc87 = loc("yoffset"(#loc9)) +#loc88 = loc("yindex"(#loc10)) +#loc89 = loc("yindex"(#loc11)) +#loc90 = loc("yindex"(#loc12)) +#loc91 = loc("xoffset"(#loc13)) +#loc92 = loc("xoffset"(#loc14)) +#loc93 = loc("xindex"(#loc15)) +#loc94 = loc("xindex"(#loc16)) +#loc95 = loc("y1"(#loc17)) +#loc96 = loc("y0"(#loc18)) +#loc97 = loc("tmp4"(#loc19)) +#loc98 = loc("tmp5"(#loc20)) +#loc99 = loc("tmp5"(#loc21)) +#loc100 = loc("tmp5"(#loc22)) +#loc101 = loc("tmp5"(#loc23)) +#loc102 = loc("tmp5"(#loc24)) +#loc103 = loc("tmp5"(#loc25)) +#loc104 = loc("tmp5"(#loc26)) +#loc105 = loc("tmp5"(#loc27)) +#loc106 = loc("tmp5"(#loc28)) +#loc107 = loc("tmp7"(#loc29)) +#loc108 = loc("tmp7"(#loc30)) +#loc109 = loc("tmp7"(#loc31)) +#loc110 = loc("tmp7"(#loc32)) +#loc111 = loc("tmp9"(#loc33)) +#loc112 = loc("tmp11"(#loc34)) +#loc113 = loc("tmp12"(#loc35)) +#loc114 = loc("tmp13"(#loc36)) +#loc115 = loc("tmp14"(#loc37)) +#loc116 = loc("tmp14"(#loc38)) +#loc117 = loc("tmp14"(#loc39)) +#loc118 = loc("tmp16"(#loc40)) +#loc119 = loc("tmp19"(#loc41)) +#loc120 = loc("tmp20"(#loc42)) +#loc121 = loc("tmp23"(#loc43)) +#loc122 = loc("tmp23"(#loc44)) +#loc123 = loc("tmp23"(#loc45)) +#loc124 = loc("tmp23"(#loc46)) +#loc125 = loc("tmp23"(#loc47)) +#loc126 = loc("tmp23"(#loc48)) +#loc127 = loc("tmp23"(#loc49)) +#loc128 = loc("tmp23"(#loc50)) +#loc129 = loc("tmp25"(#loc51)) +#loc130 = loc("tmp25"(#loc52)) +#loc131 = loc("tmp25"(#loc53)) +#loc132 = loc("tmp25"(#loc54)) +#loc133 = loc("tmp27"(#loc55)) +#loc134 = loc("tmp29"(#loc56)) +#loc135 = loc("tmp30"(#loc57)) +#loc136 = loc("tmp31"(#loc58)) +#loc137 = loc("tmp32"(#loc59)) +#loc138 = loc("tmp32"(#loc60)) +#loc139 = loc("tmp32"(#loc61)) +#loc140 = loc("tmp34"(#loc62)) +#loc141 = loc("tmp37"(#loc63)) +#loc142 = loc("tmp38"(#loc64)) diff --git a/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json new file mode 100644 index 0000000000000000000000000000000000000000..edd2db01880eb9998e9b112829861fe28f144dbf --- /dev/null +++ b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_add_mul_native_layer_norm_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.source", "triton_red_fused_add_mul_native_layer_norm_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.ttir", "triton_red_fused_add_mul_native_layer_norm_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir", "triton_red_fused_add_mul_native_layer_norm_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.llir", "triton_red_fused_add_mul_native_layer_norm_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.ptx", "triton_red_fused_add_mul_native_layer_norm_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.cubin", "triton_red_fused_add_mul_native_layer_norm_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.json"}} \ No newline at end of file diff --git a/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.cubin b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..961c6a7bdbc02e331d11ca186da91efe70a1fc3d Binary files /dev/null and b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.cubin differ diff --git a/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.json b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e6e70c01ff7d9e4072991fcd6ad9e922ea68a17f --- /dev/null +++ b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.json @@ -0,0 +1 @@ +{"hash": "9773cffdbdd0092f04505c242815e8708ad1ef44dbf578a3ce90af91077a0ba5", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 192, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_add_mul_native_layer_norm_0"} \ No newline at end of file diff --git a/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.llir b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..bf1cf2cba787c5bc98d995d084f32746a623f725 --- /dev/null +++ b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.llir @@ -0,0 +1,565 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_add_mul_native_layer_norm_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !5 { +__nv_rsqrtf.exit: + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %9 = icmp samesign ult i32 %8, 2304, !dbg !9 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %11 = shl nuw nsw i32 %10, 2, !dbg !10 + %12 = and i32 %11, 2044, !dbg !10 + %13 = shl i32 %8, 12, !dbg !11 + %14 = or disjoint i32 %12, %13 + %15 = sext i32 %14 to i64, !dbg !12 + %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !13 + %17 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %18 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %16, i64 %17, i1 %9) #6, !dbg !14 + %19 = extractvalue { i32, i32 } %18, 1, !dbg !14 + %20 = bitcast i32 %19 to <2 x bfloat>, !dbg !14 + %21 = extractelement <2 x bfloat> %20, i64 1, !dbg !14 + %22 = fpext bfloat %21 to float, !dbg !15 + %23 = extractelement <2 x bfloat> %20, i64 0, !dbg !14 + %24 = fpext bfloat %23 to float, !dbg !15 + %25 = extractvalue { i32, i32 } %18, 0, !dbg !14 + %26 = bitcast i32 %25 to <2 x bfloat>, !dbg !14 + %27 = extractelement <2 x bfloat> %26, i64 1, !dbg !14 + %28 = fpext bfloat %27 to float, !dbg !15 + %29 = extractelement <2 x bfloat> %26, i64 0, !dbg !14 + %30 = fpext bfloat %29 to float, !dbg !15 + %31 = select i1 %9, float %30, float 0.000000e+00, !dbg !16 + %32 = select i1 %9, float %28, float 0.000000e+00, !dbg !16 + %33 = select i1 %9, float %24, float 0.000000e+00, !dbg !16 + %34 = select i1 %9, float %22, float 0.000000e+00, !dbg !16 + %35 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !13 + %36 = getelementptr i8, ptr addrspace(1) %35, i64 4096, !dbg !13 + %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %38 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %36, i64 %37, i1 %9) #6, !dbg !14 + %39 = extractvalue { i32, i32 } %38, 0, !dbg !14 + %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !14 + %41 = extractelement <2 x bfloat> %40, i64 0, !dbg !14 + %42 = fpext bfloat %41 to float, !dbg !15 + %43 = fsub float %42, %31, !dbg !17 + %44 = select i1 %9, float 2.000000e+00, float 1.000000e+00, !dbg !22 + %45 = tail call float @llvm.nvvm.div.full(float %43, float %44), !dbg !23 + %46 = fadd float %31, %45, !dbg !24 + %47 = fsub float %42, %46, !dbg !25 + %48 = fmul float %43, %47, !dbg !26 + %49 = fadd float %48, 0.000000e+00, !dbg !27 + %50 = extractelement <2 x bfloat> %40, i64 1, !dbg !14 + %51 = fpext bfloat %50 to float, !dbg !15 + %52 = fsub float %51, %32, !dbg !17 + %53 = tail call float @llvm.nvvm.div.full(float %52, float %44), !dbg !23 + %54 = fadd float %32, %53, !dbg !24 + %55 = fsub float %51, %54, !dbg !25 + %56 = fmul float %52, %55, !dbg !26 + %57 = fadd float %56, 0.000000e+00, !dbg !27 + %58 = extractvalue { i32, i32 } %38, 1, !dbg !14 + %59 = bitcast i32 %58 to <2 x bfloat>, !dbg !14 + %60 = extractelement <2 x bfloat> %59, i64 0, !dbg !14 + %61 = fpext bfloat %60 to float, !dbg !15 + %62 = fsub float %61, %33, !dbg !17 + %63 = tail call float @llvm.nvvm.div.full(float %62, float %44), !dbg !23 + %64 = fadd float %33, %63, !dbg !24 + %65 = fsub float %61, %64, !dbg !25 + %66 = fmul float %62, %65, !dbg !26 + %67 = fadd float %66, 0.000000e+00, !dbg !27 + %68 = extractelement <2 x bfloat> %59, i64 1, !dbg !14 + %69 = fpext bfloat %68 to float, !dbg !15 + %70 = fsub float %69, %34, !dbg !17 + %71 = tail call float @llvm.nvvm.div.full(float %70, float %44), !dbg !23 + %72 = fadd float %34, %71, !dbg !24 + %73 = fsub float %69, %72, !dbg !25 + %74 = fmul float %70, %73, !dbg !26 + %75 = fadd float %74, 0.000000e+00, !dbg !27 + %76 = select i1 %9, float %46, float 0.000000e+00, !dbg !16 + %77 = select i1 %9, float %54, float 0.000000e+00, !dbg !16 + %78 = select i1 %9, float %64, float 0.000000e+00, !dbg !16 + %79 = select i1 %9, float %72, float 0.000000e+00, !dbg !16 + %80 = select i1 %9, float %67, float 0.000000e+00, !dbg !28 + %81 = select i1 %9, float %75, float 0.000000e+00, !dbg !28 + %82 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !22 + %83 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !22 + %84 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !22 + %85 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !22 + %86 = and i32 %10, 511, !dbg !10 + %87 = and i32 %10, 31, !dbg !10 + %88 = lshr i32 %86, 5, !dbg !10 + %89 = fsub float %77, %76, !dbg !29 + %90 = select i1 %9, float 4.000000e+00, float 0.000000e+00, !dbg !32 + %91 = fcmp oeq float %90, 0.000000e+00, !dbg !33 + %92 = tail call float @llvm.nvvm.div.full(float %83, float %90), !dbg !34 + %93 = select i1 %91, float 0.000000e+00, float %92, !dbg !35 + %94 = fmul float %89, %93, !dbg !36 + %95 = fadd float %76, %94, !dbg !37 + %96 = fadd float %49, %57, !dbg !38 + %97 = select i1 %9, float %96, float 0.000000e+00, !dbg !38 + %98 = fmul float %89, %89, !dbg !39 + %99 = fmul float %98, %82, !dbg !40 + %100 = fmul float %99, %93, !dbg !41 + %101 = fadd float %97, %100, !dbg !42 + %102 = fsub float %78, %95, !dbg !29 + %103 = select i1 %9, float 6.000000e+00, float 0.000000e+00, !dbg !32 + %104 = fcmp oeq float %103, 0.000000e+00, !dbg !33 + %105 = tail call float @llvm.nvvm.div.full(float %84, float %103), !dbg !34 + %106 = select i1 %104, float 0.000000e+00, float %105, !dbg !35 + %107 = fmul float %106, %102, !dbg !36 + %108 = fadd float %95, %107, !dbg !37 + %109 = fadd float %80, %101, !dbg !38 + %110 = fmul float %102, %102, !dbg !39 + %111 = fmul float %90, %110, !dbg !40 + %112 = fmul float %106, %111, !dbg !41 + %113 = fadd float %109, %112, !dbg !42 + %114 = fsub float %79, %108, !dbg !29 + %115 = select i1 %9, float 8.000000e+00, float 0.000000e+00, !dbg !32 + %116 = fcmp oeq float %115, 0.000000e+00, !dbg !33 + %117 = tail call float @llvm.nvvm.div.full(float %85, float %115), !dbg !34 + %118 = select i1 %116, float 0.000000e+00, float %117, !dbg !35 + %119 = fmul float %118, %114, !dbg !36 + %120 = fadd float %108, %119, !dbg !37 + %121 = fadd float %81, %113, !dbg !38 + %122 = fmul float %114, %114, !dbg !39 + %123 = fmul float %103, %122, !dbg !40 + %124 = fmul float %118, %123, !dbg !41 + %125 = fadd float %121, %124, !dbg !42 + %126 = bitcast float %120 to i32, !dbg !30 + %127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 16, i32 31), !dbg !30 + %128 = bitcast i32 %127 to float, !dbg !30 + %129 = bitcast float %125 to i32, !dbg !30 + %130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 16, i32 31), !dbg !30 + %131 = bitcast i32 %130 to float, !dbg !30 + %132 = bitcast float %115 to i32, !dbg !30 + %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 16, i32 31), !dbg !30 + %134 = bitcast i32 %133 to float, !dbg !30 + %135 = fsub float %128, %120, !dbg !29 + %136 = fadd float %115, %134, !dbg !32 + %137 = fcmp oeq float %136, 0.000000e+00, !dbg !33 + %138 = tail call float @llvm.nvvm.div.full(float %134, float %136), !dbg !34 + %139 = select i1 %137, float 0.000000e+00, float %138, !dbg !35 + %140 = fmul float %139, %135, !dbg !36 + %141 = fadd float %120, %140, !dbg !37 + %142 = fadd float %125, %131, !dbg !38 + %143 = fmul float %135, %135, !dbg !39 + %144 = fmul float %115, %143, !dbg !40 + %145 = fmul float %139, %144, !dbg !41 + %146 = fadd float %142, %145, !dbg !42 + %147 = bitcast float %141 to i32, !dbg !30 + %148 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %147, i32 8, i32 31), !dbg !30 + %149 = bitcast i32 %148 to float, !dbg !30 + %150 = bitcast float %146 to i32, !dbg !30 + %151 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %150, i32 8, i32 31), !dbg !30 + %152 = bitcast i32 %151 to float, !dbg !30 + %153 = bitcast float %136 to i32, !dbg !30 + %154 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 8, i32 31), !dbg !30 + %155 = bitcast i32 %154 to float, !dbg !30 + %156 = fsub float %149, %141, !dbg !29 + %157 = fadd float %136, %155, !dbg !32 + %158 = fcmp oeq float %157, 0.000000e+00, !dbg !33 + %159 = tail call float @llvm.nvvm.div.full(float %155, float %157), !dbg !34 + %160 = select i1 %158, float 0.000000e+00, float %159, !dbg !35 + %161 = fmul float %156, %160, !dbg !36 + %162 = fadd float %141, %161, !dbg !37 + %163 = fadd float %146, %152, !dbg !38 + %164 = fmul float %156, %156, !dbg !39 + %165 = fmul float %136, %164, !dbg !40 + %166 = fmul float %160, %165, !dbg !41 + %167 = fadd float %163, %166, !dbg !42 + %168 = bitcast float %162 to i32, !dbg !30 + %169 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %168, i32 4, i32 31), !dbg !30 + %170 = bitcast i32 %169 to float, !dbg !30 + %171 = bitcast float %167 to i32, !dbg !30 + %172 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %171, i32 4, i32 31), !dbg !30 + %173 = bitcast i32 %172 to float, !dbg !30 + %174 = bitcast float %157 to i32, !dbg !30 + %175 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %174, i32 4, i32 31), !dbg !30 + %176 = bitcast i32 %175 to float, !dbg !30 + %177 = fsub float %170, %162, !dbg !29 + %178 = fadd float %157, %176, !dbg !32 + %179 = fcmp oeq float %178, 0.000000e+00, !dbg !33 + %180 = tail call float @llvm.nvvm.div.full(float %176, float %178), !dbg !34 + %181 = select i1 %179, float 0.000000e+00, float %180, !dbg !35 + %182 = fmul float %177, %181, !dbg !36 + %183 = fadd float %162, %182, !dbg !37 + %184 = fadd float %167, %173, !dbg !38 + %185 = fmul float %177, %177, !dbg !39 + %186 = fmul float %157, %185, !dbg !40 + %187 = fmul float %181, %186, !dbg !41 + %188 = fadd float %184, %187, !dbg !42 + %189 = bitcast float %183 to i32, !dbg !30 + %190 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %189, i32 2, i32 31), !dbg !30 + %191 = bitcast i32 %190 to float, !dbg !30 + %192 = bitcast float %188 to i32, !dbg !30 + %193 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %192, i32 2, i32 31), !dbg !30 + %194 = bitcast i32 %193 to float, !dbg !30 + %195 = bitcast float %178 to i32, !dbg !30 + %196 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %195, i32 2, i32 31), !dbg !30 + %197 = bitcast i32 %196 to float, !dbg !30 + %198 = fsub float %191, %183, !dbg !29 + %199 = fadd float %178, %197, !dbg !32 + %200 = fcmp oeq float %199, 0.000000e+00, !dbg !33 + %201 = tail call float @llvm.nvvm.div.full(float %197, float %199), !dbg !34 + %202 = select i1 %200, float 0.000000e+00, float %201, !dbg !35 + %203 = fmul float %198, %202, !dbg !36 + %204 = fadd float %183, %203, !dbg !37 + %205 = fadd float %188, %194, !dbg !38 + %206 = fmul float %198, %198, !dbg !39 + %207 = fmul float %178, %206, !dbg !40 + %208 = fmul float %202, %207, !dbg !41 + %209 = fadd float %205, %208, !dbg !42 + %210 = bitcast float %204 to i32, !dbg !30 + %211 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %210, i32 1, i32 31), !dbg !30 + %212 = bitcast i32 %211 to float, !dbg !30 + %213 = bitcast float %209 to i32, !dbg !30 + %214 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %213, i32 1, i32 31), !dbg !30 + %215 = bitcast i32 %214 to float, !dbg !30 + %216 = bitcast float %199 to i32, !dbg !30 + %217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 1, i32 31), !dbg !30 + %218 = bitcast i32 %217 to float, !dbg !30 + %219 = fsub float %212, %204, !dbg !29 + %220 = fadd float %199, %218, !dbg !32 + %221 = fcmp oeq float %220, 0.000000e+00, !dbg !33 + %222 = tail call float @llvm.nvvm.div.full(float %218, float %220), !dbg !34 + %223 = select i1 %221, float 0.000000e+00, float %222, !dbg !35 + %224 = fmul float %219, %223, !dbg !36 + %225 = fadd float %204, %224, !dbg !37 + %226 = fadd float %209, %215, !dbg !38 + %227 = fmul float %219, %219, !dbg !39 + %228 = fmul float %199, %227, !dbg !40 + %229 = fmul float %223, %228, !dbg !41 + %230 = fadd float %226, %229, !dbg !42 + %231 = icmp eq i32 %87, 0, !dbg !30 + %232 = getelementptr float, ptr addrspace(3) @global_smem, i32 %88, !dbg !30 + %233 = bitcast float %225 to <1 x i32>, !dbg !30 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %232, <1 x i32> %233, i1 %231) #6, !dbg !30 + %234 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %88, !dbg !30 + %235 = bitcast float %230 to <1 x i32>, !dbg !30 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %234, <1 x i32> %235, i1 %231) #6, !dbg !30 + %236 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %88, !dbg !30 + %237 = bitcast float %220 to <1 x i32>, !dbg !30 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %236, <1 x i32> %237, i1 %231) #6, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !30 + %238 = icmp samesign ult i32 %86, 16, !dbg !30 + %239 = getelementptr float, ptr addrspace(3) @global_smem, i32 %86, !dbg !30 + %240 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %239, i1 %238) #6, !dbg !30 + %241 = bitcast i32 %240 to float, !dbg !30 + %242 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %86, !dbg !30 + %243 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %242, i1 %238) #6, !dbg !30 + %244 = bitcast i32 %243 to float, !dbg !30 + %245 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %86, !dbg !30 + %246 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %245, i1 %238) #6, !dbg !30 + %247 = bitcast i32 %246 to float, !dbg !30 + %248 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %240, i32 8, i32 31), !dbg !30 + %249 = bitcast i32 %248 to float, !dbg !30 + %250 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %243, i32 8, i32 31), !dbg !30 + %251 = bitcast i32 %250 to float, !dbg !30 + %252 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %246, i32 8, i32 31), !dbg !30 + %253 = bitcast i32 %252 to float, !dbg !30 + %254 = fsub float %249, %241, !dbg !29 + %255 = fadd float %247, %253, !dbg !32 + %256 = fcmp oeq float %255, 0.000000e+00, !dbg !33 + %257 = tail call float @llvm.nvvm.div.full(float %253, float %255), !dbg !34 + %258 = select i1 %256, float 0.000000e+00, float %257, !dbg !35 + %259 = fmul float %254, %258, !dbg !36 + %260 = fadd float %259, %241, !dbg !37 + %261 = fadd float %244, %251, !dbg !38 + %262 = fmul float %254, %254, !dbg !39 + %263 = fmul float %262, %247, !dbg !40 + %264 = fmul float %263, %258, !dbg !41 + %265 = fadd float %261, %264, !dbg !42 + %266 = bitcast float %260 to i32, !dbg !30 + %267 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %266, i32 4, i32 31), !dbg !30 + %268 = bitcast i32 %267 to float, !dbg !30 + %269 = bitcast float %265 to i32, !dbg !30 + %270 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %269, i32 4, i32 31), !dbg !30 + %271 = bitcast i32 %270 to float, !dbg !30 + %272 = bitcast float %255 to i32, !dbg !30 + %273 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 4, i32 31), !dbg !30 + %274 = bitcast i32 %273 to float, !dbg !30 + %275 = fsub float %268, %260, !dbg !29 + %276 = fadd float %255, %274, !dbg !32 + %277 = fcmp oeq float %276, 0.000000e+00, !dbg !33 + %278 = tail call float @llvm.nvvm.div.full(float %274, float %276), !dbg !34 + %279 = select i1 %277, float 0.000000e+00, float %278, !dbg !35 + %280 = fmul float %275, %279, !dbg !36 + %281 = fadd float %260, %280, !dbg !37 + %282 = fadd float %265, %271, !dbg !38 + %283 = fmul float %275, %275, !dbg !39 + %284 = fmul float %255, %283, !dbg !40 + %285 = fmul float %279, %284, !dbg !41 + %286 = fadd float %282, %285, !dbg !42 + %287 = bitcast float %281 to i32, !dbg !30 + %288 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %287, i32 2, i32 31), !dbg !30 + %289 = bitcast i32 %288 to float, !dbg !30 + %290 = bitcast float %286 to i32, !dbg !30 + %291 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %290, i32 2, i32 31), !dbg !30 + %292 = bitcast i32 %291 to float, !dbg !30 + %293 = bitcast float %276 to i32, !dbg !30 + %294 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %293, i32 2, i32 31), !dbg !30 + %295 = bitcast i32 %294 to float, !dbg !30 + %296 = fsub float %289, %281, !dbg !29 + %297 = fadd float %276, %295, !dbg !32 + %298 = fcmp oeq float %297, 0.000000e+00, !dbg !33 + %299 = tail call float @llvm.nvvm.div.full(float %295, float %297), !dbg !34 + %300 = select i1 %298, float 0.000000e+00, float %299, !dbg !35 + %301 = fmul float %296, %300, !dbg !36 + %302 = fadd float %281, %301, !dbg !37 + %303 = fadd float %286, %292, !dbg !38 + %304 = fmul float %296, %296, !dbg !39 + %305 = fmul float %276, %304, !dbg !40 + %306 = fmul float %300, %305, !dbg !41 + %307 = fadd float %303, %306, !dbg !42 + %308 = bitcast float %302 to i32, !dbg !30 + %309 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %308, i32 1, i32 31), !dbg !30 + %310 = bitcast i32 %309 to float, !dbg !30 + %311 = bitcast float %307 to i32, !dbg !30 + %312 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %311, i32 1, i32 31), !dbg !30 + %313 = bitcast i32 %312 to float, !dbg !30 + %314 = bitcast float %297 to i32, !dbg !30 + %315 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %314, i32 1, i32 31), !dbg !30 + %316 = bitcast i32 %315 to float, !dbg !30 + %317 = fsub float %310, %302, !dbg !29 + %318 = fadd float %297, %316, !dbg !32 + %319 = fcmp oeq float %318, 0.000000e+00, !dbg !33 + %320 = tail call float @llvm.nvvm.div.full(float %316, float %318), !dbg !34 + %321 = select i1 %319, float 0.000000e+00, float %320, !dbg !35 + %322 = fmul float %317, %321, !dbg !36 + %323 = fadd float %302, %322, !dbg !37 + %324 = fadd float %307, %313, !dbg !38 + %325 = fmul float %317, %317, !dbg !39 + %326 = fmul float %297, %325, !dbg !40 + %327 = fmul float %321, %326, !dbg !41 + %328 = fadd float %324, %327, !dbg !42 + %329 = and i32 %10, 15, !dbg !30 + %330 = icmp eq i32 %329, 0, !dbg !30 + %331 = and i1 %238, %330, !dbg !30 + %332 = bitcast float %323 to <1 x i32>, !dbg !30 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %239, <1 x i32> %332, i1 %331) #6, !dbg !30 + %333 = bitcast float %328 to <1 x i32>, !dbg !30 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %242, <1 x i32> %333, i1 %331) #6, !dbg !30 + %334 = bitcast float %318 to <1 x i32>, !dbg !30 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %245, <1 x i32> %334, i1 %331) #6, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !30 + %335 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !30 + %336 = load float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !30 + %337 = tail call float @llvm.nvvm.div.full(float %336, float 4.096000e+03), !dbg !43 + %338 = fadd float %337, 0x3EB0C6F7A0000000, !dbg !44 + %339 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %340 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %341 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %342 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i15 = icmp eq i32 %342, 0, !dbg !45 + br i1 %.not.i15, label %345, label %343, !dbg !45 + +343: ; preds = %__nv_rsqrtf.exit + %344 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %338), !dbg !45 + br label %__nv_rsqrtf.exit17, !dbg !45 + +345: ; preds = %__nv_rsqrtf.exit + %346 = tail call float @llvm.nvvm.rsqrt.approx.f(float %338), !dbg !45 + br label %__nv_rsqrtf.exit17, !dbg !45 + +__nv_rsqrtf.exit17: ; preds = %343, %345 + %.0.i16 = phi float [ %344, %343 ], [ %346, %345 ], !dbg !45 + %347 = zext nneg i32 %12 to i64, !dbg !46 + %348 = sext i32 %13 to i64, !dbg !46 + %349 = getelementptr bfloat, ptr addrspace(1) %1, i64 %347, !dbg !47 + %350 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !48 + %351 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %349, i64 %350, i1 true) #6, !dbg !48 + %352 = extractvalue { i32, i32 } %351, 0, !dbg !48 + %353 = bitcast i32 %352 to <2 x bfloat>, !dbg !48 + %354 = extractvalue { i32, i32 } %351, 1, !dbg !48 + %355 = bitcast i32 %354 to <2 x bfloat>, !dbg !48 + %356 = or disjoint i64 %347, %348, !dbg !49 + %357 = getelementptr bfloat, ptr addrspace(1) %0, i64 %356, !dbg !50 + %358 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !51 + %359 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %357, i64 %358, i1 %9) #6, !dbg !51 + %360 = extractvalue { i32, i32 } %359, 0, !dbg !51 + %361 = bitcast i32 %360 to <2 x bfloat>, !dbg !51 + %362 = extractvalue { i32, i32 } %359, 1, !dbg !51 + %363 = bitcast i32 %362 to <2 x bfloat>, !dbg !51 + %364 = getelementptr bfloat, ptr addrspace(1) %2, i64 %347, !dbg !52 + %365 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !53 + %366 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %364, i64 %365, i1 true) #6, !dbg !53 + %367 = extractvalue { i32, i32 } %366, 0, !dbg !53 + %368 = bitcast i32 %367 to <2 x bfloat>, !dbg !53 + %369 = extractvalue { i32, i32 } %366, 1, !dbg !53 + %370 = bitcast i32 %369 to <2 x bfloat>, !dbg !53 + %371 = getelementptr bfloat, ptr addrspace(1) %3, i64 %356, !dbg !54 + %372 = fpext <2 x bfloat> %353 to <2 x float>, !dbg !55 + %373 = fpext <2 x bfloat> %361 to <2 x float>, !dbg !56 + %374 = fpext <2 x bfloat> %368 to <2 x float>, !dbg !57 + %375 = fadd <2 x float> %372, splat (float 1.000000e+00), !dbg !58 + %376 = insertelement <2 x float> poison, float %335, i64 0, !dbg !59 + %377 = shufflevector <2 x float> %376, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !59 + %378 = fsub <2 x float> %373, %377, !dbg !59 + %379 = insertelement <2 x float> poison, float %.0.i16, i64 0, !dbg !60 + %380 = shufflevector <2 x float> %379, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !60 + %381 = fmul <2 x float> %380, %378, !dbg !60 + %382 = fmul <2 x float> %375, %381, !dbg !61 + %383 = fadd <2 x float> %382, %374, !dbg !62 + %384 = fptrunc <2 x float> %383 to <2 x bfloat>, !dbg !63 + %385 = fpext <2 x bfloat> %355 to <2 x float>, !dbg !55 + %386 = fpext <2 x bfloat> %363 to <2 x float>, !dbg !56 + %387 = fpext <2 x bfloat> %370 to <2 x float>, !dbg !57 + %388 = fadd <2 x float> %385, splat (float 1.000000e+00), !dbg !58 + %389 = fsub <2 x float> %386, %377, !dbg !59 + %390 = fmul <2 x float> %380, %389, !dbg !60 + %391 = fmul <2 x float> %388, %390, !dbg !61 + %392 = fadd <2 x float> %391, %387, !dbg !62 + %393 = fptrunc <2 x float> %392 to <2 x bfloat>, !dbg !63 + %394 = bitcast <2 x bfloat> %384 to i32, !dbg !63 + %395 = bitcast <2 x bfloat> %393 to i32, !dbg !63 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %394, i32 %395, ptr addrspace(1) %371, i1 %9) #6, !dbg !63 + %396 = or disjoint i64 %347, 2048, !dbg !64 + %397 = getelementptr bfloat, ptr addrspace(1) %1, i64 %396, !dbg !47 + %398 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !48 + %399 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %397, i64 %398, i1 true) #6, !dbg !48 + %400 = extractvalue { i32, i32 } %399, 0, !dbg !48 + %401 = bitcast i32 %400 to <2 x bfloat>, !dbg !48 + %402 = extractvalue { i32, i32 } %399, 1, !dbg !48 + %403 = bitcast i32 %402 to <2 x bfloat>, !dbg !48 + %404 = or disjoint i64 %396, %348, !dbg !49 + %405 = getelementptr bfloat, ptr addrspace(1) %0, i64 %404, !dbg !50 + %406 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !51 + %407 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %405, i64 %406, i1 %9) #6, !dbg !51 + %408 = extractvalue { i32, i32 } %407, 0, !dbg !51 + %409 = bitcast i32 %408 to <2 x bfloat>, !dbg !51 + %410 = extractvalue { i32, i32 } %407, 1, !dbg !51 + %411 = bitcast i32 %410 to <2 x bfloat>, !dbg !51 + %412 = getelementptr bfloat, ptr addrspace(1) %2, i64 %396, !dbg !52 + %413 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !53 + %414 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %412, i64 %413, i1 true) #6, !dbg !53 + %415 = extractvalue { i32, i32 } %414, 0, !dbg !53 + %416 = bitcast i32 %415 to <2 x bfloat>, !dbg !53 + %417 = extractvalue { i32, i32 } %414, 1, !dbg !53 + %418 = bitcast i32 %417 to <2 x bfloat>, !dbg !53 + %419 = getelementptr bfloat, ptr addrspace(1) %3, i64 %404, !dbg !54 + %420 = fpext <2 x bfloat> %401 to <2 x float>, !dbg !55 + %421 = fpext <2 x bfloat> %409 to <2 x float>, !dbg !56 + %422 = fpext <2 x bfloat> %416 to <2 x float>, !dbg !57 + %423 = fadd <2 x float> %420, splat (float 1.000000e+00), !dbg !58 + %424 = fsub <2 x float> %421, %377, !dbg !59 + %425 = fmul <2 x float> %380, %424, !dbg !60 + %426 = fmul <2 x float> %423, %425, !dbg !61 + %427 = fadd <2 x float> %426, %422, !dbg !62 + %428 = fptrunc <2 x float> %427 to <2 x bfloat>, !dbg !63 + %429 = fpext <2 x bfloat> %403 to <2 x float>, !dbg !55 + %430 = fpext <2 x bfloat> %411 to <2 x float>, !dbg !56 + %431 = fpext <2 x bfloat> %418 to <2 x float>, !dbg !57 + %432 = fadd <2 x float> %429, splat (float 1.000000e+00), !dbg !58 + %433 = fsub <2 x float> %430, %377, !dbg !59 + %434 = fmul <2 x float> %380, %433, !dbg !60 + %435 = fmul <2 x float> %432, %434, !dbg !61 + %436 = fadd <2 x float> %435, %431, !dbg !62 + %437 = fptrunc <2 x float> %436 to <2 x bfloat>, !dbg !63 + %438 = bitcast <2 x bfloat> %428 to i32, !dbg !63 + %439 = bitcast <2 x bfloat> %437 to i32, !dbg !63 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %438, i32 %439, ptr addrspace(1) %419, i1 %9) #6, !dbg !63 + ret void, !dbg !65 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused_add_mul_native_layer_norm_0", linkageName: "triton_red_fused_add_mul_native_layer_norm_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 25, column: 21, scope: !5) +!10 = !DILocation(line: 26, column: 37, scope: !5) +!11 = !DILocation(line: 38, column: 46, scope: !5) +!12 = !DILocation(line: 32, column: 43, scope: !5) +!13 = !DILocation(line: 38, column: 34, scope: !5) +!14 = !DILocation(line: 38, column: 51, scope: !5) +!15 = !DILocation(line: 38, column: 112, scope: !5) +!16 = !DILocation(line: 44, column: 62, scope: !5) +!17 = !DILocation(line: 222, column: 24, scope: !18, inlinedAt: !20) +!18 = distinct !DILexicalBlockFile(scope: !5, file: !19, discriminator: 0) +!19 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime") +!20 = !DILocation(line: 42, column: 51, scope: !21) +!21 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!22 = !DILocation(line: 46, column: 66, scope: !5) +!23 = !DILocation(line: 224, column: 34, scope: !18, inlinedAt: !20) +!24 = !DILocation(line: 224, column: 26, scope: !18, inlinedAt: !20) +!25 = !DILocation(line: 225, column: 39, scope: !18, inlinedAt: !20) +!26 = !DILocation(line: 225, column: 31, scope: !18, inlinedAt: !20) +!27 = !DILocation(line: 225, column: 22, scope: !18, inlinedAt: !20) +!28 = !DILocation(line: 45, column: 58, scope: !5) +!29 = !DILocation(line: 231, column: 21, scope: !18, inlinedAt: !30) +!30 = !DILocation(line: 243, column: 46, scope: !18, inlinedAt: !31) +!31 = !DILocation(line: 47, column: 79, scope: !21) +!32 = !DILocation(line: 232, column: 28, scope: !18, inlinedAt: !30) +!33 = !DILocation(line: 233, column: 39, scope: !18, inlinedAt: !30) +!34 = !DILocation(line: 233, column: 60, scope: !18, inlinedAt: !30) +!35 = !DILocation(line: 233, column: 49, scope: !18, inlinedAt: !30) +!36 = !DILocation(line: 235, column: 25, scope: !18, inlinedAt: !30) +!37 = !DILocation(line: 235, column: 17, scope: !18, inlinedAt: !30) +!38 = !DILocation(line: 236, column: 15, scope: !18, inlinedAt: !30) +!39 = !DILocation(line: 236, column: 30, scope: !18, inlinedAt: !30) +!40 = !DILocation(line: 236, column: 38, scope: !18, inlinedAt: !30) +!41 = !DILocation(line: 236, column: 49, scope: !18, inlinedAt: !30) +!42 = !DILocation(line: 236, column: 22, scope: !18, inlinedAt: !30) +!43 = !DILocation(line: 65, column: 24, scope: !5) +!44 = !DILocation(line: 67, column: 24, scope: !5) +!45 = !DILocation(line: 68, column: 32, scope: !5) +!46 = !DILocation(line: 51, column: 43, scope: !5) +!47 = !DILocation(line: 57, column: 34, scope: !5) +!48 = !DILocation(line: 57, column: 41, scope: !5) +!49 = !DILocation(line: 58, column: 42, scope: !5) +!50 = !DILocation(line: 58, column: 35, scope: !5) +!51 = !DILocation(line: 58, column: 52, scope: !5) +!52 = !DILocation(line: 59, column: 35, scope: !5) +!53 = !DILocation(line: 59, column: 42, scope: !5) +!54 = !DILocation(line: 73, column: 29, scope: !5) +!55 = !DILocation(line: 57, column: 94, scope: !5) +!56 = !DILocation(line: 58, column: 114, scope: !5) +!57 = !DILocation(line: 59, column: 95, scope: !5) +!58 = !DILocation(line: 61, column: 23, scope: !5) +!59 = !DILocation(line: 63, column: 24, scope: !5) +!60 = !DILocation(line: 69, column: 24, scope: !5) +!61 = !DILocation(line: 71, column: 24, scope: !5) +!62 = !DILocation(line: 72, column: 24, scope: !5) +!63 = !DILocation(line: 73, column: 53, scope: !5) +!64 = !DILocation(line: 52, column: 31, scope: !5) +!65 = !DILocation(line: 51, column: 4, scope: !5) diff --git a/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.ptx b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..0814d80a5b656399575e5e4401221d643f43d572 --- /dev/null +++ b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.ptx @@ -0,0 +1,1089 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused_add_mul_native_layer_norm_0 // -- Begin function triton_red_fused_add_mul_native_layer_norm_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused_add_mul_native_layer_norm_0 +.visible .entry triton_red_fused_add_mul_native_layer_norm_0( + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_3, + .param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_4, + .param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_7 +) +.reqntid 512 +{ + .reg .pred %p<19>; + .reg .b16 %rs<33>; + .reg .b32 %r<282>; + .reg .b64 %rd<28>; + .loc 1 18 0 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:18:0 + +// %bb.0: // %__nv_rsqrtf.exit + ld.param.b64 %rd19, [triton_red_fused_add_mul_native_layer_norm_0_param_0]; + ld.param.b64 %rd20, [triton_red_fused_add_mul_native_layer_norm_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:23:28 + mov.u32 %r37, %ctaid.x; + .loc 1 25 21 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:25:21 + setp.lt.u32 %p1, %r37, 2304; + ld.param.b64 %rd21, [triton_red_fused_add_mul_native_layer_norm_0_param_2]; + ld.param.b64 %rd22, [triton_red_fused_add_mul_native_layer_norm_0_param_3]; + .loc 1 26 37 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:26:37 + mov.u32 %r38, %tid.x; + shl.b32 %r39, %r38, 2; + and.b32 %r40, %r39, 2044; + .loc 1 38 46 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:46 + shl.b32 %r41, %r37, 12; + or.b32 %r42, %r40, %r41; + .loc 1 38 34 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:34 + mad.wide.s32 %rd1, %r42, 2, %rd19; + .loc 1 38 51 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:51 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r3, 0; + // begin inline asm + mov.u32 %r1, %r3; + mov.u32 %r2, %r3; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2; + // end inline asm + mov.b32 {%rs1, %rs2}, %r2; + .loc 1 38 112 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:112 + cvt.f32.bf16 %r43, %rs2; + cvt.f32.bf16 %r44, %rs1; + .loc 1 38 51 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:51 + mov.b32 {%rs3, %rs4}, %r1; + .loc 1 38 112 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:112 + cvt.f32.bf16 %r45, %rs4; + cvt.f32.bf16 %r46, %rs3; + .loc 1 44 62 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:44:62 + selp.f32 %r47, %r46, 0f00000000, %p1; + selp.f32 %r48, %r45, 0f00000000, %p1; + selp.f32 %r49, %r44, 0f00000000, %p1; + selp.f32 %r50, %r43, 0f00000000, %p1; + .loc 1 38 34 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:34 + add.s64 %rd3, %rd1, 4096; + .loc 1 38 51 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:51 + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r4, %r3; + mov.u32 %r5, %r3; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r4, %r5 }, [ %rd3 + 0 ], %rd4; + // end inline asm + mov.b32 {%rs5, %rs6}, %r4; + .loc 1 38 112 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:112 + cvt.f32.bf16 %r51, %rs5; +$L__tmp1: + .loc 2 222 24 // triton_helpers.py:222:24 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ] + sub.f32 %r52, %r51, %r47; +$L__tmp2: + .loc 1 46 66 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:46:66 + selp.f32 %r53, 0f40000000, 0f3F800000, %p1; +$L__tmp3: + .loc 2 224 34 // triton_helpers.py:224:34 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ] + div.full.f32 %r54, %r52, %r53; + .loc 2 224 26 // triton_helpers.py:224:26 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ] + add.f32 %r55, %r47, %r54; + .loc 2 225 39 // triton_helpers.py:225:39 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ] + sub.f32 %r56, %r51, %r55; + .loc 2 225 22 // triton_helpers.py:225:22 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ] + fma.rn.f32 %r57, %r52, %r56, 0f00000000; +$L__tmp4: + .loc 1 38 112 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:112 + cvt.f32.bf16 %r58, %rs6; +$L__tmp5: + .loc 2 222 24 // triton_helpers.py:222:24 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ] + sub.f32 %r59, %r58, %r48; + .loc 2 224 34 // triton_helpers.py:224:34 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ] + div.full.f32 %r60, %r59, %r53; + .loc 2 224 26 // triton_helpers.py:224:26 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ] + add.f32 %r61, %r48, %r60; + .loc 2 225 39 // triton_helpers.py:225:39 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ] + sub.f32 %r62, %r58, %r61; + .loc 2 225 22 // triton_helpers.py:225:22 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ] + fma.rn.f32 %r63, %r59, %r62, 0f00000000; +$L__tmp6: + .loc 1 38 51 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:51 + mov.b32 {%rs7, %rs8}, %r5; + .loc 1 38 112 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:112 + cvt.f32.bf16 %r64, %rs7; +$L__tmp7: + .loc 2 222 24 // triton_helpers.py:222:24 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ] + sub.f32 %r65, %r64, %r49; + .loc 2 224 34 // triton_helpers.py:224:34 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ] + div.full.f32 %r66, %r65, %r53; + .loc 2 224 26 // triton_helpers.py:224:26 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ] + add.f32 %r67, %r49, %r66; + .loc 2 225 39 // triton_helpers.py:225:39 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ] + sub.f32 %r68, %r64, %r67; + .loc 2 225 22 // triton_helpers.py:225:22 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ] + fma.rn.f32 %r69, %r65, %r68, 0f00000000; +$L__tmp8: + .loc 1 38 112 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:112 + cvt.f32.bf16 %r70, %rs8; +$L__tmp9: + .loc 2 222 24 // triton_helpers.py:222:24 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ] + sub.f32 %r71, %r70, %r50; + .loc 2 224 34 // triton_helpers.py:224:34 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ] + div.full.f32 %r72, %r71, %r53; + .loc 2 224 26 // triton_helpers.py:224:26 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ] + add.f32 %r73, %r50, %r72; + .loc 2 225 39 // triton_helpers.py:225:39 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ] + sub.f32 %r74, %r70, %r73; + .loc 2 225 22 // triton_helpers.py:225:22 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ] + fma.rn.f32 %r75, %r71, %r74, 0f00000000; +$L__tmp10: + .loc 1 44 62 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:44:62 + selp.f32 %r76, %r55, 0f00000000, %p1; + selp.f32 %r77, %r61, 0f00000000, %p1; + selp.f32 %r78, %r67, 0f00000000, %p1; + selp.f32 %r79, %r73, 0f00000000, %p1; + .loc 1 45 58 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:45:58 + selp.f32 %r80, %r69, 0f00000000, %p1; + selp.f32 %r81, %r75, 0f00000000, %p1; + .loc 1 46 66 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:46:66 + selp.f32 %r82, 0f40000000, 0f00000000, %p1; + .loc 1 26 37 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:26:37 + and.b32 %r83, %r38, 511; + and.b32 %r84, %r38, 31; +$L__tmp11: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r85, %r77, %r76; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r86, 0f40800000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p6, %r86, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r87, %r82, %r86; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r88, 0f00000000, %r87, %p6; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r89, %r85, %r88, %r76; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r90, %r57, %r63; + selp.f32 %r91, %r90, 0f00000000, %p1; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r92, %r85, %r85; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r93, %r92, %r82; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r94, %r93, %r88, %r91; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r95, %r78, %r89; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r96, 0f40C00000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p7, %r96, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r97, %r82, %r96; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r98, 0f00000000, %r97, %p7; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r99, %r98, %r95, %r89; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r100, %r80, %r94; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r101, %r95, %r95; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r102, %r86, %r101; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r103, %r98, %r102, %r100; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r104, %r79, %r99; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r105, 0f41000000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p8, %r105, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r106, %r82, %r105; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r107, 0f00000000, %r106, %p8; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r108, %r107, %r104, %r99; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r109, %r81, %r103; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r110, %r104, %r104; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r111, %r96, %r110; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r112, %r107, %r111, %r109; +$L__tmp12: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] + shfl.sync.bfly.b32 %r113, %r108, 16, 31, -1; + shfl.sync.bfly.b32 %r114, %r112, 16, 31, -1; + shfl.sync.bfly.b32 %r115, %r105, 16, 31, -1; +$L__tmp13: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r116, %r113, %r108; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r117, %r105, %r115; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p9, %r117, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r118, %r115, %r117; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r119, 0f00000000, %r118, %p9; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r120, %r119, %r116, %r108; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r121, %r112, %r114; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r122, %r116, %r116; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r123, %r105, %r122; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r124, %r119, %r123, %r121; +$L__tmp14: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] + shfl.sync.bfly.b32 %r125, %r120, 8, 31, -1; + shfl.sync.bfly.b32 %r126, %r124, 8, 31, -1; + shfl.sync.bfly.b32 %r127, %r117, 8, 31, -1; +$L__tmp15: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r128, %r125, %r120; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r129, %r117, %r127; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p10, %r129, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r130, %r127, %r129; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r131, 0f00000000, %r130, %p10; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r132, %r128, %r131, %r120; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r133, %r124, %r126; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r134, %r128, %r128; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r135, %r117, %r134; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r136, %r131, %r135, %r133; +$L__tmp16: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] + shfl.sync.bfly.b32 %r137, %r132, 4, 31, -1; + shfl.sync.bfly.b32 %r138, %r136, 4, 31, -1; + shfl.sync.bfly.b32 %r139, %r129, 4, 31, -1; +$L__tmp17: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r140, %r137, %r132; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r141, %r129, %r139; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p11, %r141, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r142, %r139, %r141; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r143, 0f00000000, %r142, %p11; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r144, %r140, %r143, %r132; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r145, %r136, %r138; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r146, %r140, %r140; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r147, %r129, %r146; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r148, %r143, %r147, %r145; +$L__tmp18: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] + shfl.sync.bfly.b32 %r149, %r144, 2, 31, -1; + shfl.sync.bfly.b32 %r150, %r148, 2, 31, -1; + shfl.sync.bfly.b32 %r151, %r141, 2, 31, -1; +$L__tmp19: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r152, %r149, %r144; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r153, %r141, %r151; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p12, %r153, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r154, %r151, %r153; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r155, 0f00000000, %r154, %p12; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r156, %r152, %r155, %r144; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r157, %r148, %r150; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r158, %r152, %r152; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r159, %r141, %r158; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r160, %r155, %r159, %r157; +$L__tmp20: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] + shfl.sync.bfly.b32 %r161, %r156, 1, 31, -1; + shfl.sync.bfly.b32 %r162, %r160, 1, 31, -1; + shfl.sync.bfly.b32 %r163, %r153, 1, 31, -1; +$L__tmp21: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r164, %r161, %r156; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r11, %r153, %r163; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p13, %r11, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r165, %r163, %r11; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r166, 0f00000000, %r165, %p13; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r7, %r164, %r166, %r156; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r167, %r160, %r162; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r168, %r164, %r164; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r169, %r153, %r168; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r9, %r166, %r169, %r167; +$L__tmp22: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] + setp.eq.b32 %p2, %r84, 0; + shr.u32 %r170, %r38, 3; + and.b32 %r171, %r170, 60; + mov.b32 %r172, global_smem; + add.s32 %r6, %r172, %r171; + // begin inline asm + @%p2 st.shared.b32 [ %r6 + 0 ], %r7; + // end inline asm + add.s32 %r8, %r6, 64; + // begin inline asm + @%p2 st.shared.b32 [ %r8 + 0 ], %r9; + // end inline asm + add.s32 %r10, %r6, 128; + // begin inline asm + @%p2 st.shared.b32 [ %r10 + 0 ], %r11; + // end inline asm + bar.sync 0; + setp.lt.u32 %p3, %r83, 16; + shl.b32 %r173, %r83, 2; + add.s32 %r13, %r172, %r173; + // begin inline asm + @%p3 ld.shared.b32 %r12, [ %r13 + 0 ]; + // end inline asm + add.s32 %r15, %r13, 64; + // begin inline asm + @%p3 ld.shared.b32 %r14, [ %r15 + 0 ]; + // end inline asm + add.s32 %r17, %r13, 128; + // begin inline asm + @%p3 ld.shared.b32 %r16, [ %r17 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r174, %r12, 8, 31, -1; + shfl.sync.bfly.b32 %r175, %r14, 8, 31, -1; + shfl.sync.bfly.b32 %r176, %r16, 8, 31, -1; +$L__tmp23: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r177, %r174, %r12; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r178, %r16, %r176; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p14, %r178, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r179, %r176, %r178; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r180, 0f00000000, %r179, %p14; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r181, %r177, %r180, %r12; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r182, %r14, %r175; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r183, %r177, %r177; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r184, %r183, %r16; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r185, %r184, %r180, %r182; +$L__tmp24: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] + shfl.sync.bfly.b32 %r186, %r181, 4, 31, -1; + shfl.sync.bfly.b32 %r187, %r185, 4, 31, -1; + shfl.sync.bfly.b32 %r188, %r178, 4, 31, -1; +$L__tmp25: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r189, %r186, %r181; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r190, %r178, %r188; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p15, %r190, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r191, %r188, %r190; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r192, 0f00000000, %r191, %p15; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r193, %r189, %r192, %r181; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r194, %r185, %r187; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r195, %r189, %r189; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r196, %r178, %r195; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r197, %r192, %r196, %r194; +$L__tmp26: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] + shfl.sync.bfly.b32 %r198, %r193, 2, 31, -1; + shfl.sync.bfly.b32 %r199, %r197, 2, 31, -1; + shfl.sync.bfly.b32 %r200, %r190, 2, 31, -1; +$L__tmp27: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r201, %r198, %r193; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r202, %r190, %r200; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p16, %r202, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r203, %r200, %r202; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r204, 0f00000000, %r203, %p16; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r205, %r201, %r204, %r193; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r206, %r197, %r199; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r207, %r201, %r201; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r208, %r190, %r207; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r209, %r204, %r208, %r206; +$L__tmp28: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] + shfl.sync.bfly.b32 %r210, %r205, 1, 31, -1; + shfl.sync.bfly.b32 %r211, %r209, 1, 31, -1; + shfl.sync.bfly.b32 %r212, %r202, 1, 31, -1; +$L__tmp29: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r213, %r210, %r205; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r20, %r202, %r212; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p17, %r20, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r214, %r212, %r20; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r215, 0f00000000, %r214, %p17; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r18, %r213, %r215, %r205; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r216, %r209, %r211; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r217, %r213, %r213; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r218, %r202, %r217; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r19, %r215, %r218, %r216; +$L__tmp30: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] + and.b32 %r219, %r38, 15; + setp.eq.b32 %p18, %r219, 0; + and.pred %p4, %p3, %p18; + // begin inline asm + @%p4 st.shared.b32 [ %r13 + 0 ], %r18; + // end inline asm + // begin inline asm + @%p4 st.shared.b32 [ %r15 + 0 ], %r19; + // end inline asm + // begin inline asm + @%p4 st.shared.b32 [ %r17 + 0 ], %r20; + // end inline asm + bar.sync 0; + ld.shared.b32 %r220, [global_smem]; + ld.shared.b32 %r221, [global_smem+64]; + mov.b32 %r222, 0f45800000; +$L__tmp31: + .loc 1 65 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:65:24 + div.full.f32 %r223, %r221, %r222; + .loc 1 67 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:67:24 + add.f32 %r224, %r223, 0f358637BD; + .loc 1 68 32 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:68:32 + rsqrt.approx.ftz.f32 %r225, %r224; + .loc 1 51 43 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:51:43 + cvt.u64.u32 %rd23, %r40; + cvt.s64.s32 %rd24, %r41; + .loc 1 57 34 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:34 + mul.wide.u32 %rd25, %r40, 2; + add.s64 %rd5, %rd20, %rd25; + .loc 1 57 41 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:41 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0; + // end inline asm + mov.pred %p5, -1; + // begin inline asm + mov.u32 %r21, %r3; + mov.u32 %r22, %r3; + @%p5 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r21, %r22 }, [ %rd5 + 0 ], %rd6; + // end inline asm + .loc 1 58 42 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:42 + or.b64 %rd26, %rd23, %rd24; + .loc 1 58 35 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:35 + shl.b64 %rd27, %rd26, 1; + add.s64 %rd7, %rd19, %rd27; + .loc 1 58 52 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:52 + // begin inline asm + mov.u64 %rd8, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd8, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r23, %r3; + mov.u32 %r24, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r23, %r24 }, [ %rd7 + 0 ], %rd8; + // end inline asm + .loc 1 59 35 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:35 + add.s64 %rd9, %rd21, %rd25; + .loc 1 59 42 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:42 + // begin inline asm + mov.u64 %rd10, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd10, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r25, %r3; + mov.u32 %r26, %r3; + @%p5 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r25, %r26 }, [ %rd9 + 0 ], %rd10; + // end inline asm + .loc 1 73 29 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:73:29 + add.s64 %rd11, %rd22, %rd27; + .loc 1 57 94 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:94 + mov.b32 {%rs9, %rs10}, %r21; + cvt.f32.bf16 %r226, %rs9; + cvt.f32.bf16 %r227, %rs10; + .loc 1 58 114 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:114 + mov.b32 {%rs11, %rs12}, %r23; + cvt.f32.bf16 %r228, %rs12; + cvt.f32.bf16 %r229, %rs11; + .loc 1 59 95 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:95 + mov.b32 {%rs13, %rs14}, %r25; + cvt.f32.bf16 %r230, %rs14; + cvt.f32.bf16 %r231, %rs13; + .loc 1 61 23 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:61:23 + add.f32 %r232, %r227, 0f3F800000; + add.f32 %r233, %r226, 0f3F800000; + .loc 1 63 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:63:24 + sub.f32 %r234, %r229, %r220; + sub.f32 %r235, %r228, %r220; + .loc 1 69 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:69:24 + mul.f32 %r236, %r225, %r235; + mul.f32 %r237, %r225, %r234; + .loc 1 72 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:72:24 + fma.rn.f32 %r238, %r233, %r237, %r231; + fma.rn.f32 %r239, %r232, %r236, %r230; + .loc 1 73 53 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:73:53 + cvt.rn.bf16x2.f32 %r27, %r239, %r238; + .loc 1 57 94 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:94 + mov.b32 {%rs15, %rs16}, %r22; + cvt.f32.bf16 %r240, %rs15; + cvt.f32.bf16 %r241, %rs16; + .loc 1 58 114 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:114 + mov.b32 {%rs17, %rs18}, %r24; + cvt.f32.bf16 %r242, %rs18; + cvt.f32.bf16 %r243, %rs17; + .loc 1 59 95 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:95 + mov.b32 {%rs19, %rs20}, %r26; + cvt.f32.bf16 %r244, %rs20; + cvt.f32.bf16 %r245, %rs19; + .loc 1 61 23 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:61:23 + add.f32 %r246, %r241, 0f3F800000; + add.f32 %r247, %r240, 0f3F800000; + .loc 1 63 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:63:24 + sub.f32 %r248, %r243, %r220; + sub.f32 %r249, %r242, %r220; + .loc 1 69 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:69:24 + mul.f32 %r250, %r225, %r249; + mul.f32 %r251, %r225, %r248; + .loc 1 72 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:72:24 + fma.rn.f32 %r252, %r247, %r251, %r245; + fma.rn.f32 %r253, %r246, %r250, %r244; + .loc 1 73 53 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:73:53 + cvt.rn.bf16x2.f32 %r28, %r253, %r252; + // begin inline asm + @%p1 st.global.v2.b32 [ %rd11 + 0 ], { %r27, %r28 }; + // end inline asm + .loc 1 57 34 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:34 + add.s64 %rd12, %rd5, 4096; + .loc 1 57 41 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:41 + // begin inline asm + mov.u64 %rd13, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd13, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r29, %r3; + mov.u32 %r30, %r3; + @%p5 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r29, %r30 }, [ %rd12 + 0 ], %rd13; + // end inline asm + .loc 1 58 35 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:35 + add.s64 %rd14, %rd7, 4096; + .loc 1 58 52 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:52 + // begin inline asm + mov.u64 %rd15, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd15, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r31, %r3; + mov.u32 %r32, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r31, %r32 }, [ %rd14 + 0 ], %rd15; + // end inline asm + .loc 1 59 35 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:35 + add.s64 %rd16, %rd9, 4096; + .loc 1 59 42 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:42 + // begin inline asm + mov.u64 %rd17, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r33, %r3; + mov.u32 %r34, %r3; + @%p5 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r33, %r34 }, [ %rd16 + 0 ], %rd17; + // end inline asm + .loc 1 73 29 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:73:29 + add.s64 %rd18, %rd11, 4096; + .loc 1 57 94 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:94 + mov.b32 {%rs21, %rs22}, %r29; + cvt.f32.bf16 %r254, %rs21; + cvt.f32.bf16 %r255, %rs22; + .loc 1 58 114 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:114 + mov.b32 {%rs23, %rs24}, %r31; + cvt.f32.bf16 %r256, %rs24; + cvt.f32.bf16 %r257, %rs23; + .loc 1 59 95 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:95 + mov.b32 {%rs25, %rs26}, %r33; + cvt.f32.bf16 %r258, %rs26; + cvt.f32.bf16 %r259, %rs25; + .loc 1 61 23 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:61:23 + add.f32 %r260, %r255, 0f3F800000; + add.f32 %r261, %r254, 0f3F800000; + .loc 1 63 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:63:24 + sub.f32 %r262, %r257, %r220; + sub.f32 %r263, %r256, %r220; + .loc 1 69 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:69:24 + mul.f32 %r264, %r225, %r263; + mul.f32 %r265, %r225, %r262; + .loc 1 72 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:72:24 + fma.rn.f32 %r266, %r261, %r265, %r259; + fma.rn.f32 %r267, %r260, %r264, %r258; + .loc 1 73 53 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:73:53 + cvt.rn.bf16x2.f32 %r35, %r267, %r266; + .loc 1 57 94 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:94 + mov.b32 {%rs27, %rs28}, %r30; + cvt.f32.bf16 %r268, %rs27; + cvt.f32.bf16 %r269, %rs28; + .loc 1 58 114 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:114 + mov.b32 {%rs29, %rs30}, %r32; + cvt.f32.bf16 %r270, %rs30; + cvt.f32.bf16 %r271, %rs29; + .loc 1 59 95 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:95 + mov.b32 {%rs31, %rs32}, %r34; + cvt.f32.bf16 %r272, %rs32; + cvt.f32.bf16 %r273, %rs31; + .loc 1 61 23 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:61:23 + add.f32 %r274, %r269, 0f3F800000; + add.f32 %r275, %r268, 0f3F800000; + .loc 1 63 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:63:24 + sub.f32 %r276, %r271, %r220; + sub.f32 %r277, %r270, %r220; + .loc 1 69 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:69:24 + mul.f32 %r278, %r225, %r277; + mul.f32 %r279, %r225, %r276; + .loc 1 72 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:72:24 + fma.rn.f32 %r280, %r275, %r279, %r273; + fma.rn.f32 %r281, %r274, %r278, %r272; + .loc 1 73 53 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:73:53 + cvt.rn.bf16x2.f32 %r36, %r281, %r280; + // begin inline asm + @%p1 st.global.v2.b32 [ %rd18 + 0 ], { %r35, %r36 }; + // end inline asm + .loc 1 51 4 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:51:4 + ret; +$L__tmp32: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 367 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x168 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 112 +.b8 103 +.b8 101 +.b8 115 +.b8 122 +.b8 104 +.b8 52 +.b8 110 +.b8 112 +.b8 121 +.b8 110 +.b8 121 +.b8 55 +.b8 117 +.b8 50 +.b8 113 +.b8 120 +.b8 108 +.b8 107 +.b8 116 +.b8 112 +.b8 118 +.b8 50 +.b8 121 +.b8 50 +.b8 120 +.b8 100 +.b8 103 +.b8 103 +.b8 122 +.b8 121 +.b8 108 +.b8 53 +.b8 111 +.b8 112 +.b8 111 +.b8 121 +.b8 51 +.b8 111 +.b8 114 +.b8 117 +.b8 113 +.b8 115 +.b8 113 +.b8 101 +.b8 116 +.b8 52 +.b8 112 +.b8 53 +.b8 101 +.b8 107 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 112 +.b8 103 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x2f DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 100 +.b8 100 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 110 +.b8 97 +.b8 116 +.b8 105 +.b8 118 +.b8 101 +.b8 95 +.b8 108 +.b8 97 +.b8 121 +.b8 101 +.b8 114 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x113:0x5f DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x128:0x18 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp10 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 42 // DW_AT_call_line +.b8 51 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x140:0x31 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp11 // DW_AT_low_pc +.b64 $L__tmp31 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 47 // DW_AT_call_line +.b8 79 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x158:0x18 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp11 // DW_AT_low_pc +.b64 $L__tmp30 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 243 // DW_AT_call_line +.b8 46 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.source b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.source new file mode 100644 index 0000000000000000000000000000000000000000..ccafe4a89244b01843849b5d6816b92e0e34f71e --- /dev/null +++ b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.source @@ -0,0 +1,420 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":18:0) +#loc72 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":216:0) +#loc85 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":133:0) +#loc89 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":242:0) +#loc91 = loc(unknown) +#loc94 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":230:0) +#loc109 = loc("in_ptr0"(#loc)) +#loc110 = loc("in_ptr1"(#loc)) +#loc111 = loc("in_ptr2"(#loc)) +#loc112 = loc("out_ptr2"(#loc)) +#loc113 = loc("xnumel"(#loc)) +#loc114 = loc("r0_numel"(#loc)) +#loc171 = loc("value"(#loc72)) +#loc172 = loc("mean"(#loc72)) +#loc173 = loc("m2"(#loc72)) +#loc174 = loc("weight"(#loc72)) +#loc175 = loc("first_iteration"(#loc72)) +#loc185 = loc("input"(#loc85)) +#loc186 = loc("mean"(#loc89)) +#loc187 = loc("m2"(#loc89)) +#loc188 = loc("weight"(#loc89)) +#loc189 = loc("mean_1"(#loc94)) +#loc190 = loc("m2_1"(#loc94)) +#loc191 = loc("weight_1"(#loc94)) +#loc192 = loc("mean_2"(#loc94)) +#loc193 = loc("m2_2"(#loc94)) +#loc194 = loc("weight_2"(#loc94)) +#loc201 = loc("new_mean"(#loc171)) +module { + tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 2304 : i32 loc(#loc115) + %r0_numel_1 = arith.constant 4096 : i32 loc(#loc116) + %xoffset = tt.get_program_id x : i32 loc(#loc117) + %xoffset_2 = arith.constant 1 : i32 loc(#loc118) + %xoffset_3 = arith.constant 1 : i32 loc(#loc118) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc118) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc119) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc120) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc121) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc121) + %xmask = arith.constant dense<2304> : tensor<1x1xi32> loc(#loc122) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc122) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc123) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc124) + %tmp3_mean = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc125) + %tmp3_m2 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc126) + %tmp3_weight = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc127) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %tmp3_weight_10:3 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%tmp3_mean_13 = %tmp3_mean, %tmp3_m2_14 = %tmp3_m2, %tmp3_weight_15 = %tmp3_weight) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc129) + %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc129) + %r0_mask = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc130) + %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x2048xi32> loc(#loc130) + %tmp0 = arith.constant 4096 : i32 loc(#loc131) + %tmp0_18 = arith.constant 4096 : i32 loc(#loc131) + %tmp0_19 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc131) + %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc131) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc132) + %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x2048xi32> loc(#loc132) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc133) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc133) + %tmp0_25 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc134) + %tmp0_26 = arith.andi %r0_mask_17, %tmp0_25 : tensor<1x2048xi1> loc(#loc134) + %tmp0_27 = arith.constant 0.000000e+00 : f32 loc(#loc135) + %tmp0_28 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc135) + %tmp0_29 = arith.truncf %tmp0_28 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc135) + %tmp0_30 = tt.load %tmp0_24, %tmp0_26, %tmp0_29 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc135) + %tmp0_31 = arith.extf %tmp0_30 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc136) + %c0_i32_32 = arith.constant 0 : i32 loc(#loc23) + %9 = arith.cmpi eq, %r0_offset, %c0_i32_32 : i32 loc(#loc23) + %10:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_u1__(%tmp0_31, %tmp3_mean_13, %tmp3_m2_14, %tmp3_weight_15, %9) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>, i1) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) loc(#loc24) + %tmp3_mean_33 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc137) + %tmp3_mean_34 = arith.andi %r0_mask_17, %tmp3_mean_33 : tensor<1x2048xi1> loc(#loc137) + %tmp3_mean_35 = arith.select %tmp3_mean_34, %10#0, %tmp3_mean_13 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc138) + %tmp3_m2_36 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc139) + %tmp3_m2_37 = arith.andi %r0_mask_17, %tmp3_m2_36 : tensor<1x2048xi1> loc(#loc139) + %tmp3_m2_38 = arith.select %tmp3_m2_37, %10#1, %tmp3_m2_14 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc140) + %tmp3_weight_39 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc141) + %tmp3_weight_40 = arith.andi %r0_mask_17, %tmp3_weight_39 : tensor<1x2048xi1> loc(#loc141) + %tmp3_weight_41 = arith.select %tmp3_weight_40, %10#2, %tmp3_weight_15 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc142) + scf.yield %tmp3_mean_35, %tmp3_m2_38, %tmp3_weight_41 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc31) + } loc(#loc207) + %4:3 = tt.call @"torch._inductor.runtime.triton_helpers.welford__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_1_"(%tmp3_weight_10#0, %tmp3_weight_10#1, %tmp3_weight_10#2) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc32) + %tmp3 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc143) + %tmp7 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc144) + %tmp8 = tt.expand_dims %4#2 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc145) + %c0_i32_11 = arith.constant 0 : i32 loc(#loc36) + %c2048_i32_12 = arith.constant 2048 : i32 loc(#loc36) + %5 = arith.bitcast %c0_i32_11 : i32 to i32 loc(#loc36) + %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc36) + %7 = arith.bitcast %c2048_i32_12 : i32 to i32 loc(#loc36) + %8 = ub.poison : i32 loc(#loc36) + scf.for %r0_offset = %5 to %6 step %7 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc146) + %r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc146) + %r0_mask = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc147) + %r0_mask_14 = arith.cmpi slt, %r0_index_13, %r0_mask : tensor<1x2048xi32> loc(#loc147) + %tmp9 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc148) + %tmp9_15 = tt.addptr %tmp9, %r0_index_13 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc148) + %tmp9_16 = arith.constant 0.000000e+00 : f32 loc(#loc149) + %tmp9_17 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc149) + %tmp9_18 = arith.truncf %tmp9_17 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc149) + %tmp9_19 = tt.load %tmp9_15, %r0_mask_14, %tmp9_18 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc149) + %tmp9_20 = arith.extf %tmp9_19 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc150) + %tmp12 = arith.constant 4096 : i32 loc(#loc151) + %tmp12_21 = arith.constant 4096 : i32 loc(#loc151) + %tmp12_22 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc151) + %tmp12_23 = arith.muli %tmp12_22, %xindex_7 : tensor<1x1xi32> loc(#loc151) + %tmp12_24 = tt.broadcast %tmp12_23 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc152) + %tmp12_25 = arith.addi %r0_index_13, %tmp12_24 : tensor<1x2048xi32> loc(#loc152) + %tmp12_26 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc153) + %tmp12_27 = tt.addptr %tmp12_26, %tmp12_25 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc153) + %tmp12_28 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc154) + %tmp12_29 = arith.andi %r0_mask_14, %tmp12_28 : tensor<1x2048xi1> loc(#loc154) + %tmp12_30 = arith.constant 0.000000e+00 : f32 loc(#loc155) + %tmp12_31 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc155) + %tmp12_32 = arith.truncf %tmp12_31 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc155) + %tmp12_33 = tt.load %tmp12_27, %tmp12_29, %tmp12_32 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc155) + %tmp12_34 = arith.extf %tmp12_33 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc156) + %tmp23 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc157) + %tmp23_35 = tt.addptr %tmp23, %r0_index_13 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc157) + %tmp23_36 = arith.constant 0.000000e+00 : f32 loc(#loc158) + %tmp23_37 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc158) + %tmp23_38 = arith.truncf %tmp23_37 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc158) + %tmp23_39 = tt.load %tmp23_35, %r0_mask_14, %tmp23_38 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc158) + %tmp23_40 = arith.extf %tmp23_39 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc159) + %tmp10 = arith.constant 1.000000e+00 : f32 loc(#loc160) + %tmp11 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc161) + %tmp11_41 = arith.addf %tmp9_20, %tmp11 : tensor<1x2048xf32> loc(#loc161) + %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc162) + %tmp14_42 = arith.subf %tmp12_34, %tmp14 : tensor<1x2048xf32> loc(#loc162) + %tmp15 = arith.constant 4.096000e+03 : f32 loc(#loc163) + %tmp16 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc164) + %tmp16_43 = arith.divf %tmp7, %tmp16 : tensor<1x1xf32> loc(#loc164) + %tmp17 = arith.constant 9.99999997E-7 : f32 loc(#loc165) + %tmp18 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc166) + %tmp18_44 = arith.addf %tmp16_43, %tmp18 : tensor<1x1xf32> loc(#loc166) + %tmp19 = tt.extern_elementwise %tmp18_44 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc167) + %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc168) + %tmp20_45 = arith.mulf %tmp14_42, %tmp20 : tensor<1x2048xf32> loc(#loc168) + %tmp22 = arith.mulf %tmp11_41, %tmp20_45 : tensor<1x2048xf32> loc(#loc169) + %tmp24 = arith.addf %tmp22, %tmp23_40 : tensor<1x2048xf32> loc(#loc170) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc62) + %c4096_i32_46 = arith.constant 4096 : i32 loc(#loc62) + %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc62) + %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc62) + %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc63) + %11 = arith.addi %r0_index_13, %10 : tensor<1x2048xi32> loc(#loc63) + %12 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc64) + %13 = tt.addptr %12, %11 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc64) + %14 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc65) + %15 = arith.andi %r0_mask_14, %14 : tensor<1x2048xi1> loc(#loc65) + %16 = arith.truncf %tmp24 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc66) + tt.store %13, %16, %15 : tensor<1x2048x!tt.ptr> loc(#loc66) + } loc(#loc36) + tt.return loc(#loc67) + } loc(#loc) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() -> tensor<1x2048xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc69) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc69) + tt.return %cst_0 : tensor<1x2048xf32> loc(#loc70) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x2048xf32> loc(#loc71) + tt.return %0 : tensor<1x2048xf32> loc(#loc71) + } loc(#loc68) + tt.func private @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_u1__(%new_mean: tensor<1x2048xf32> loc("new_mean"(#loc171)), %mean: tensor<1x2048xf32> loc("mean"(#loc72)), %m2: tensor<1x2048xf32> loc("m2"(#loc72)), %weight: tensor<1x2048xf32> loc("weight"(#loc72)), %first_iteration: i1 loc("first_iteration"(#loc72))) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) attributes {noinline = false} { + %0:3 = scf.if %first_iteration -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) { + %new_weight = arith.constant 1.000000e+00 : f32 loc(#loc176) + %new_weight_0 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc202) + %new_m2 = tt.call @triton.language.standard.zeros_like__fp32S1_2048S__(%m2) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc203) + scf.yield %new_m2, %new_mean, %new_weight_0 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc203) + } else { + %delta = arith.subf %new_mean, %mean : tensor<1x2048xf32> loc(#loc178) + %new_weight = arith.constant 1 : i32 loc(#loc179) + %new_weight_0 = arith.constant 1.000000e+00 : f32 loc(#loc179) + %new_weight_1 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc179) + %new_weight_2 = arith.addf %weight, %new_weight_1 : tensor<1x2048xf32> loc(#loc204) + %new_mean_3 = arith.divf %delta, %new_weight_2 : tensor<1x2048xf32> loc(#loc180) + %new_mean_4 = arith.addf %mean, %new_mean_3 : tensor<1x2048xf32> loc(#loc205) + %new_m2 = arith.subf %new_mean, %new_mean_4 : tensor<1x2048xf32> loc(#loc182) + %new_m2_5 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32> loc(#loc183) + %new_m2_6 = arith.addf %m2, %new_m2_5 : tensor<1x2048xf32> loc(#loc206) + scf.yield %new_m2_6, %new_mean_4, %new_weight_2 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc184) + } loc(#loc73) + tt.return %0#1, %0#0, %0#2 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc83) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x2048xf32> loc(#loc84) + %2 = ub.poison : tensor<1x2048xf32> loc(#loc84) + %3 = ub.poison : tensor<1x2048xf32> loc(#loc84) + tt.return %1, %2, %3 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc84) + } loc(#loc72) + tt.func private @triton.language.standard.zeros_like__fp32S1_2048S__(%input: tensor<1x2048xf32> loc("input"(#loc85))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc86) + tt.return %0 : tensor<1x2048xf32> loc(#loc87) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x2048xf32> loc(#loc88) + tt.return %1 : tensor<1x2048xf32> loc(#loc88) + } loc(#loc85) + tt.func private @"torch._inductor.runtime.triton_helpers.welford__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_1_"(%mean: tensor<1x2048xf32> loc("mean"(#loc89)), %m2: tensor<1x2048xf32> loc("m2"(#loc89)), %weight: tensor<1x2048xf32> loc("weight"(#loc89))) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} { + %0:3 = "tt.reduce"(%mean, %m2, %weight) <{axis = 1 : i32}> ({ + ^bb0(%arg3: f32 loc(unknown), %arg4: f32 loc(unknown), %arg5: f32 loc(unknown), %arg6: f32 loc(unknown), %arg7: f32 loc(unknown), %arg8: f32 loc(unknown)): + %4:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (f32, f32, f32, f32, f32, f32) -> (f32, f32, f32) loc(#loc90) + tt.reduce.return %4#0, %4#1, %4#2 : f32, f32, f32 loc(#loc90) + }) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc90) + tt.return %0#0, %0#1, %0#2 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc92) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc93) + %2 = ub.poison : tensor<1xf32> loc(#loc93) + %3 = ub.poison : tensor<1xf32> loc(#loc93) + tt.return %1, %2, %3 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc93) + } loc(#loc89) + tt.func private @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%mean_1: f32 loc("mean_1"(#loc94)), %m2_1: f32 loc("m2_1"(#loc94)), %weight_1: f32 loc("weight_1"(#loc94)), %mean_2: f32 loc("mean_2"(#loc94)), %m2_2: f32 loc("m2_2"(#loc94)), %weight_2: f32 loc("weight_2"(#loc94))) -> (f32, f32, f32) attributes {noinline = false} { + %delta = arith.subf %mean_2, %mean_1 : f32 loc(#loc195) + %new_weight = arith.addf %weight_1, %weight_2 : f32 loc(#loc196) + %w2_over_w = arith.constant 0.000000e+00 : f32 loc(#loc197) + %w2_over_w_0 = arith.cmpf oeq, %new_weight, %w2_over_w : f32 loc(#loc197) + %w2_over_w_1 = arith.divf %weight_2, %new_weight : f32 loc(#loc198) + %w2_over_w_2 = arith.constant 0.000000e+00 : f32 loc(#loc199) + %w2_over_w_3 = arith.constant 0.000000e+00 : f32 loc(#loc199) + %w2_over_w_4 = arith.select %w2_over_w_0, %w2_over_w_3, %w2_over_w_1 : f32 loc(#loc199) + %0 = arith.mulf %delta, %w2_over_w_4 : f32 loc(#loc100) + %1 = arith.addf %mean_1, %0 : f32 loc(#loc101) + %2 = arith.addf %m2_1, %m2_2 : f32 loc(#loc102) + %3 = arith.mulf %delta, %delta : f32 loc(#loc103) + %4 = arith.mulf %3, %weight_1 : f32 loc(#loc104) + %5 = arith.mulf %4, %w2_over_w_4 : f32 loc(#loc105) + %6 = arith.addf %2, %5 : f32 loc(#loc106) + tt.return %1, %6, %new_weight : f32, f32, f32 loc(#loc107) + ^bb1: // no predecessors + %7 = ub.poison : f32 loc(#loc108) + %8 = ub.poison : f32 loc(#loc108) + %9 = ub.poison : f32 loc(#loc108) + tt.return %7, %8, %9 : f32, f32, f32 loc(#loc108) + } loc(#loc94) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":25:21) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":29:45) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":30:43) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":31:47) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:46) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:34) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:61) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:51) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:112) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":42:62) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":42:51) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":44:39) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":44:62) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":45:37) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":45:58) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":46:41) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":46:66) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":46:8) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":47:79) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":48:16) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":49:16) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":50:16) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":51:43) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":52:31) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":53:29) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:34) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:41) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:94) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:47) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:42) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:35) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:62) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:52) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:114) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:35) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:42) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:95) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":60:16) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":61:23) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":63:24) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":64:16) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":65:24) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":66:16) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":67:24) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":68:32) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":69:24) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":71:24) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":72:24) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:41) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:36) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:29) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:63) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:53) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":51:4) +#loc68 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":120:0) +#loc69 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:31) +#loc70 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:11) +#loc71 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:4) +#loc73 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7) +#loc74 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":218:46) +#loc75 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31) +#loc76 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24) +#loc77 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30) +#loc78 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34) +#loc79 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26) +#loc80 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39) +#loc81 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31) +#loc82 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22) +#loc83 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:11) +#loc84 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:4) +#loc86 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:30) +#loc87 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:11) +#loc88 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:4) +#loc90 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc92 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:11) +#loc93 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:4) +#loc95 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc96 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc97 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc98 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc99 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc100 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc101 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc102 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc103 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc104 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc105 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc106 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc107 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:11) +#loc108 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:4) +#loc115 = loc("xnumel"(#loc1)) +#loc116 = loc("r0_numel"(#loc2)) +#loc117 = loc("xoffset"(#loc3)) +#loc118 = loc("xoffset"(#loc4)) +#loc119 = loc("xindex"(#loc5)) +#loc120 = loc("xindex"(#loc6)) +#loc121 = loc("xindex"(#loc7)) +#loc122 = loc("xmask"(#loc8)) +#loc123 = loc("r0_base"(#loc9)) +#loc124 = loc("r0_base"(#loc10)) +#loc125 = loc("tmp3_mean"(#loc11)) +#loc126 = loc("tmp3_m2"(#loc12)) +#loc127 = loc("tmp3_weight"(#loc13)) +#loc128 = loc("tmp3_mean"(#loc14)) +#loc129 = loc("r0_index"(#loc15)) +#loc130 = loc("r0_mask"(#loc16)) +#loc131 = loc("tmp0"(#loc17)) +#loc132 = loc("tmp0"(#loc18)) +#loc133 = loc("tmp0"(#loc19)) +#loc134 = loc("tmp0"(#loc20)) +#loc135 = loc("tmp0"(#loc21)) +#loc136 = loc("tmp0"(#loc22)) +#loc137 = loc("tmp3_mean"(#loc25)) +#loc138 = loc("tmp3_mean"(#loc26)) +#loc139 = loc("tmp3_m2"(#loc27)) +#loc140 = loc("tmp3_m2"(#loc28)) +#loc141 = loc("tmp3_weight"(#loc29)) +#loc142 = loc("tmp3_weight"(#loc30)) +#loc143 = loc("tmp3"(#loc33)) +#loc144 = loc("tmp7"(#loc34)) +#loc145 = loc("tmp8"(#loc35)) +#loc146 = loc("r0_index"(#loc37)) +#loc147 = loc("r0_mask"(#loc38)) +#loc148 = loc("tmp9"(#loc39)) +#loc149 = loc("tmp9"(#loc40)) +#loc150 = loc("tmp9"(#loc41)) +#loc151 = loc("tmp12"(#loc42)) +#loc152 = loc("tmp12"(#loc43)) +#loc153 = loc("tmp12"(#loc44)) +#loc154 = loc("tmp12"(#loc45)) +#loc155 = loc("tmp12"(#loc46)) +#loc156 = loc("tmp12"(#loc47)) +#loc157 = loc("tmp23"(#loc48)) +#loc158 = loc("tmp23"(#loc49)) +#loc159 = loc("tmp23"(#loc50)) +#loc160 = loc("tmp10"(#loc51)) +#loc161 = loc("tmp11"(#loc52)) +#loc162 = loc("tmp14"(#loc53)) +#loc163 = loc("tmp15"(#loc54)) +#loc164 = loc("tmp16"(#loc55)) +#loc165 = loc("tmp17"(#loc56)) +#loc166 = loc("tmp18"(#loc57)) +#loc167 = loc("tmp19"(#loc58)) +#loc168 = loc("tmp20"(#loc59)) +#loc169 = loc("tmp22"(#loc60)) +#loc170 = loc("tmp24"(#loc61)) +#loc176 = loc("new_weight"(#loc74)) +#loc177 = loc("new_m2"(#loc75)) +#loc178 = loc("delta"(#loc76)) +#loc179 = loc("new_weight"(#loc77)) +#loc180 = loc("new_mean"(#loc78)) +#loc181 = loc("new_mean"(#loc79)) +#loc182 = loc("new_m2"(#loc80)) +#loc183 = loc("new_m2"(#loc81)) +#loc184 = loc("new_m2"(#loc82)) +#loc195 = loc("delta"(#loc95)) +#loc196 = loc("new_weight"(#loc96)) +#loc197 = loc("w2_over_w"(#loc97)) +#loc198 = loc("w2_over_w"(#loc98)) +#loc199 = loc("w2_over_w"(#loc99)) +#loc200 = loc("tmp3_m2"(#loc128)) +#loc202 = loc("new_weight"(#loc176)) +#loc203 = loc("new_m2"(#loc177)) +#loc204 = loc("new_weight"(#loc179)) +#loc205 = loc("new_mean"(#loc181)) +#loc206 = loc("new_m2"(#loc184)) +#loc207 = loc("tmp3_weight"(#loc200)) diff --git a/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..01c28ba9fcb2607a2b5cfdbf5812a231e6550f88 --- /dev/null +++ b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir @@ -0,0 +1,261 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":18:0) +#loc1 = loc(unknown) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":47:79) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("in_ptr1"(#loc)) +#loc72 = loc("in_ptr2"(#loc)) +#loc73 = loc("out_ptr2"(#loc)) +#loc74 = loc("xnumel"(#loc)) +#loc75 = loc("r0_numel"(#loc)) +#loc101 = loc(callsite(#loc1 at #loc30)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4096> : tensor<1x2048xi32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked> loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c2304_i32 = arith.constant 2304 : i32 loc(#loc1) + %cst_1 = arith.constant 0.000000e+00 : f32 loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<4.096000e+03> : tensor<1x1xf32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc76) + %xmask = arith.cmpi slt, %xoffset, %c2304_i32 : i32 loc(#loc77) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc78) + %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc78) + %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc79) + %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc130) + %tmp0_8 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc81) + %tmp0_9 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc131) + %tmp3_weight:3 = scf.for %tmp3_weight_10 = %c0_i32 to %c4096_i32 step %c2048_i32 iter_args(%arg7 = %cst_2, %arg8 = %cst_2, %arg9 = %cst_2) -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) : i32 { + %r0_index = tt.splat %tmp3_weight_10 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc84) + %r0_index_11 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32, #blocked> loc(#loc84) + %r0_mask = arith.cmpi slt, %r0_index_11, %cst : tensor<1x2048xi32, #blocked> loc(#loc85) + %tmp0_12 = arith.addi %r0_index_11, %tmp0_7 : tensor<1x2048xi32, #blocked> loc(#loc80) + %tmp0_13 = tt.addptr %tmp0_8, %tmp0_12 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc81) + %tmp0_14 = arith.andi %r0_mask, %tmp0_9 : tensor<1x2048xi1, #blocked> loc(#loc82) + %tmp0_15 = tt.load %tmp0_13, %tmp0_14, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc86) + %tmp0_16 = arith.extf %tmp0_15 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc87) + %2 = arith.cmpi eq, %tmp3_weight_10, %c0_i32 : i32 loc(#loc14) + %3:3 = scf.if %2 -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) { + scf.yield %cst_2, %tmp0_16, %cst_5 : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc155) + } else { + %delta = arith.subf %tmp0_16, %arg7 : tensor<1x2048xf32, #blocked> loc(#loc134) + %new_weight = arith.addf %arg9, %cst_5 : tensor<1x2048xf32, #blocked> loc(#loc156) + %new_mean = arith.divf %delta, %new_weight : tensor<1x2048xf32, #blocked> loc(#loc136) + %new_mean_18 = arith.addf %arg7, %new_mean : tensor<1x2048xf32, #blocked> loc(#loc157) + %new_m2 = arith.subf %tmp0_16, %new_mean_18 : tensor<1x2048xf32, #blocked> loc(#loc138) + %new_m2_19 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32, #blocked> loc(#loc139) + %new_m2_20 = arith.addf %arg8, %new_m2_19 : tensor<1x2048xf32, #blocked> loc(#loc158) + scf.yield %new_m2_20, %new_mean_18, %new_weight : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc141) + } loc(#loc88) + %tmp3_mean = arith.select %tmp0_14, %3#1, %arg7 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc97) + %tmp3_m2 = arith.select %tmp0_14, %3#0, %arg8 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc98) + %tmp3_weight_17 = arith.select %tmp0_14, %3#2, %arg9 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc99) + scf.yield %tmp3_mean, %tmp3_m2, %tmp3_weight_17 : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc28) + } loc(#loc154) + %0:3 = "tt.reduce"(%tmp3_weight#0, %tmp3_weight#1, %tmp3_weight#2) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc30)), %arg7: f32 loc(callsite(#loc1 at #loc30)), %arg8: f32 loc(callsite(#loc1 at #loc30)), %arg9: f32 loc(callsite(#loc1 at #loc30)), %arg10: f32 loc(callsite(#loc1 at #loc30)), %arg11: f32 loc(callsite(#loc1 at #loc30))): + %delta = arith.subf %arg9, %arg6 : f32 loc(#loc142) + %new_weight = arith.addf %arg8, %arg11 : f32 loc(#loc143) + %w2_over_w = arith.cmpf oeq, %new_weight, %cst_1 : f32 loc(#loc144) + %w2_over_w_10 = arith.divf %arg11, %new_weight : f32 loc(#loc145) + %w2_over_w_11 = arith.select %w2_over_w, %cst_1, %w2_over_w_10 : f32 loc(#loc146) + %2 = arith.mulf %delta, %w2_over_w_11 : f32 loc(#loc147) + %3 = arith.addf %arg6, %2 : f32 loc(#loc148) + %4 = arith.addf %arg7, %arg10 : f32 loc(#loc149) + %5 = arith.mulf %delta, %delta : f32 loc(#loc150) + %6 = arith.mulf %5, %arg8 : f32 loc(#loc151) + %7 = arith.mulf %6, %w2_over_w_11 : f32 loc(#loc152) + %8 = arith.addf %4, %7 : f32 loc(#loc153) + tt.reduce.return %3, %8, %new_weight : f32, f32, f32 loc(#loc100) + }) : (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc100) + %tmp3 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc107) + %tmp7 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc108) + %tmp9 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc109) + %tmp23 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc110) + %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc111) + %tmp16 = arith.divf %tmp7, %cst_4 : tensor<1x1xf32, #blocked> loc(#loc112) + %tmp18 = arith.addf %tmp16, %cst_3 : tensor<1x1xf32, #blocked> loc(#loc113) + %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked>) -> tensor<1x1xf32, #blocked> loc(#loc114) + %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc115) + %1 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc52) + scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32, #blocked> loc(#loc116) + %r0_index_10 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32, #blocked> loc(#loc116) + %r0_mask = arith.cmpi slt, %r0_index_10, %cst : tensor<1x2048xi32, #blocked> loc(#loc117) + %tmp9_11 = tt.addptr %tmp9, %r0_index_10 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc109) + %tmp9_12 = tt.load %tmp9_11, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc118) + %tmp9_13 = arith.extf %tmp9_12 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc119) + %tmp12 = arith.addi %r0_index_10, %tmp0_7 : tensor<1x2048xi32, #blocked> loc(#loc120) + %tmp12_14 = tt.addptr %tmp0_8, %tmp12 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc121) + %tmp12_15 = arith.andi %r0_mask, %tmp0_9 : tensor<1x2048xi1, #blocked> loc(#loc122) + %tmp12_16 = tt.load %tmp12_14, %tmp12_15, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc123) + %tmp12_17 = arith.extf %tmp12_16 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc124) + %tmp23_18 = tt.addptr %tmp23, %r0_index_10 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc110) + %tmp23_19 = tt.load %tmp23_18, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc125) + %tmp23_20 = arith.extf %tmp23_19 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc126) + %tmp11 = arith.addf %tmp9_13, %cst_5 : tensor<1x2048xf32, #blocked> loc(#loc127) + %tmp14_21 = arith.subf %tmp12_17, %tmp14 : tensor<1x2048xf32, #blocked> loc(#loc111) + %tmp20_22 = arith.mulf %tmp14_21, %tmp20 : tensor<1x2048xf32, #blocked> loc(#loc115) + %tmp22 = arith.mulf %tmp11, %tmp20_22 : tensor<1x2048xf32, #blocked> loc(#loc128) + %tmp24 = arith.addf %tmp22, %tmp23_20 : tensor<1x2048xf32, #blocked> loc(#loc129) + %2 = tt.addptr %1, %tmp12 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc52) + %3 = arith.truncf %tmp24 : tensor<1x2048xf32, #blocked> to tensor<1x2048xbf16, #blocked> loc(#loc68) + tt.store %2, %3, %tmp12_15 : tensor<1x2048x!tt.ptr, #blocked> loc(#loc68) + } loc(#loc53) + tt.return loc(#loc69) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":25:21) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":26:37) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:46) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:41) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:34) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:61) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":32:43) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":33:31) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":34:29) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:51) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:112) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":42:62) +#loc15 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":42:51) +#loc17 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31) +#loc18 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24) +#loc19 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34) +#loc21 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39) +#loc23 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":44:62) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":45:58) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":46:66) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":46:8) +#loc29 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc35 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc37 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc38 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc39 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc40 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc41 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc42 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":48:16) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":49:16) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:34) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:35) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":63:24) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":65:24) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":67:24) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":68:32) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":69:24) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:29) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":51:43) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":52:31) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":53:29) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:41) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:94) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:42) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:35) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:62) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:52) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:114) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:42) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:95) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":61:23) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":71:24) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":72:24) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:53) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":51:4) +#loc76 = loc("xoffset"(#loc2)) +#loc77 = loc("xmask"(#loc3)) +#loc78 = loc("r0_base"(#loc4)) +#loc79 = loc("tmp0"(#loc5)) +#loc80 = loc("tmp0"(#loc6)) +#loc81 = loc("tmp0"(#loc7)) +#loc82 = loc("tmp0"(#loc8)) +#loc83 = loc("tmp3_mean"(#loc9)) +#loc84 = loc("r0_index"(#loc10)) +#loc85 = loc("r0_mask"(#loc11)) +#loc86 = loc("tmp0"(#loc12)) +#loc87 = loc("tmp0"(#loc13)) +#loc88 = loc(callsite(#loc15 at #loc16)) +#loc89 = loc("new_m2"(#loc17)) +#loc90 = loc("delta"(#loc18)) +#loc91 = loc("new_weight"(#loc19)) +#loc92 = loc("new_mean"(#loc20)) +#loc93 = loc("new_mean"(#loc21)) +#loc94 = loc("new_m2"(#loc22)) +#loc95 = loc("new_m2"(#loc23)) +#loc96 = loc("new_m2"(#loc24)) +#loc97 = loc("tmp3_mean"(#loc25)) +#loc98 = loc("tmp3_m2"(#loc26)) +#loc99 = loc("tmp3_weight"(#loc27)) +#loc100 = loc(callsite(#loc29 at #loc30)) +#loc102 = loc("delta"(#loc31)) +#loc103 = loc("new_weight"(#loc32)) +#loc104 = loc("w2_over_w"(#loc33)) +#loc105 = loc("w2_over_w"(#loc34)) +#loc106 = loc("w2_over_w"(#loc35)) +#loc107 = loc("tmp3"(#loc43)) +#loc108 = loc("tmp7"(#loc44)) +#loc109 = loc("tmp9"(#loc45)) +#loc110 = loc("tmp23"(#loc46)) +#loc111 = loc("tmp14"(#loc47)) +#loc112 = loc("tmp16"(#loc48)) +#loc113 = loc("tmp18"(#loc49)) +#loc114 = loc("tmp19"(#loc50)) +#loc115 = loc("tmp20"(#loc51)) +#loc116 = loc("r0_index"(#loc54)) +#loc117 = loc("r0_mask"(#loc55)) +#loc118 = loc("tmp9"(#loc56)) +#loc119 = loc("tmp9"(#loc57)) +#loc120 = loc("tmp12"(#loc58)) +#loc121 = loc("tmp12"(#loc59)) +#loc122 = loc("tmp12"(#loc60)) +#loc123 = loc("tmp12"(#loc61)) +#loc124 = loc("tmp12"(#loc62)) +#loc125 = loc("tmp23"(#loc63)) +#loc126 = loc("tmp23"(#loc64)) +#loc127 = loc("tmp11"(#loc65)) +#loc128 = loc("tmp22"(#loc66)) +#loc129 = loc("tmp24"(#loc67)) +#loc130 = loc(fused[#loc80, #loc79]) +#loc131 = loc(fused[#loc82, #loc77]) +#loc132 = loc("tmp3_m2"(#loc83)) +#loc133 = loc("new_m2"(#loc89)) +#loc134 = loc(callsite(#loc90 at #loc16)) +#loc135 = loc("new_weight"(#loc91)) +#loc136 = loc(callsite(#loc92 at #loc16)) +#loc137 = loc("new_mean"(#loc93)) +#loc138 = loc(callsite(#loc94 at #loc16)) +#loc139 = loc(callsite(#loc95 at #loc16)) +#loc140 = loc("new_m2"(#loc96)) +#loc141 = loc(callsite(#loc96 at #loc16)) +#loc142 = loc(callsite(#loc102 at #loc100)) +#loc143 = loc(callsite(#loc103 at #loc100)) +#loc144 = loc(callsite(#loc104 at #loc100)) +#loc145 = loc(callsite(#loc105 at #loc100)) +#loc146 = loc(callsite(#loc106 at #loc100)) +#loc147 = loc(callsite(#loc36 at #loc100)) +#loc148 = loc(callsite(#loc37 at #loc100)) +#loc149 = loc(callsite(#loc38 at #loc100)) +#loc150 = loc(callsite(#loc39 at #loc100)) +#loc151 = loc(callsite(#loc40 at #loc100)) +#loc152 = loc(callsite(#loc41 at #loc100)) +#loc153 = loc(callsite(#loc42 at #loc100)) +#loc154 = loc("tmp3_weight"(#loc132)) +#loc155 = loc(callsite(#loc133 at #loc16)) +#loc156 = loc(callsite(#loc135 at #loc16)) +#loc157 = loc(callsite(#loc137 at #loc16)) +#loc158 = loc(callsite(#loc140 at #loc16)) diff --git a/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.ttir b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..0def56ed8ffd5e33b836ac76b5836d67fdbc827f --- /dev/null +++ b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.ttir @@ -0,0 +1,270 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":18:0) +#loc2 = loc(unknown) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":47:79) +#loc72 = loc("in_ptr0"(#loc)) +#loc73 = loc("in_ptr1"(#loc)) +#loc74 = loc("in_ptr2"(#loc)) +#loc75 = loc("out_ptr2"(#loc)) +#loc76 = loc("xnumel"(#loc)) +#loc77 = loc("r0_numel"(#loc)) +#loc79 = loc(callsite(#loc2 at #loc3)) +module { + tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xmask = arith.constant 2304 : i32 loc(#loc78) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc79) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc2) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16> loc(#loc2) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc2) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc2) + %cst_2 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc2) + %cst_3 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc2) + %cst_4 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc2) + %cst_5 = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %xoffset = tt.get_program_id x : i32 loc(#loc80) + %xmask_6 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc78) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc81) + %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc82) + %tmp3_weight:3 = scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32 iter_args(%tmp3_mean = %cst_0, %tmp3_m2 = %cst_0, %tmp3_weight_8 = %cst_0) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc84) + %r0_index_9 = arith.addi %r0_index, %r0_base_7 : tensor<1x2048xi32> loc(#loc84) + %r0_mask = arith.cmpi slt, %r0_index_9, %cst_5 : tensor<1x2048xi32> loc(#loc85) + %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc86) + %tmp0_10 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32> loc(#loc135) + %tmp0_11 = arith.addi %r0_index_9, %tmp0_10 : tensor<1x2048xi32> loc(#loc87) + %tmp0_12 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc88) + %tmp0_13 = tt.addptr %tmp0_12, %tmp0_11 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc88) + %tmp0_14 = tt.splat %xmask_6 : i1 -> tensor<1x2048xi1> loc(#loc136) + %tmp0_15 = arith.andi %r0_mask, %tmp0_14 : tensor<1x2048xi1> loc(#loc89) + %tmp0_16 = tt.load %tmp0_13, %tmp0_15, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc90) + %tmp0_17 = arith.extf %tmp0_16 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc91) + %1 = arith.cmpi eq, %r0_offset, %c0_i32 : i32 loc(#loc16) + %2:3 = scf.if %1 -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) { + scf.yield %cst_0, %tmp0_17, %cst_4 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc161) + } else { + %delta = arith.subf %tmp0_17, %tmp3_mean : tensor<1x2048xf32> loc(#loc138) + %new_weight = arith.addf %tmp3_weight_8, %cst_4 : tensor<1x2048xf32> loc(#loc162) + %new_mean = arith.divf %delta, %new_weight : tensor<1x2048xf32> loc(#loc140) + %new_mean_21 = arith.addf %tmp3_mean, %new_mean : tensor<1x2048xf32> loc(#loc163) + %new_m2 = arith.subf %tmp0_17, %new_mean_21 : tensor<1x2048xf32> loc(#loc142) + %new_m2_22 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32> loc(#loc143) + %new_m2_23 = arith.addf %tmp3_m2, %new_m2_22 : tensor<1x2048xf32> loc(#loc164) + scf.yield %new_m2_23, %new_mean_21, %new_weight : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc145) + } loc(#loc92) + %tmp3_mean_18 = arith.select %tmp0_15, %2#1, %tmp3_mean : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc101) + %tmp3_m2_19 = arith.select %tmp0_15, %2#0, %tmp3_m2 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc102) + %tmp3_weight_20 = arith.select %tmp0_15, %2#2, %tmp3_weight_8 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc103) + scf.yield %tmp3_mean_18, %tmp3_m2_19, %tmp3_weight_20 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc30) + } loc(#loc160) + %0:3 = "tt.reduce"(%tmp3_weight#0, %tmp3_weight#1, %tmp3_weight#2) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc2 at #loc3)), %arg7: f32 loc(callsite(#loc2 at #loc3)), %arg8: f32 loc(callsite(#loc2 at #loc3)), %arg9: f32 loc(callsite(#loc2 at #loc3)), %arg10: f32 loc(callsite(#loc2 at #loc3)), %arg11: f32 loc(callsite(#loc2 at #loc3))): + %delta = arith.subf %arg9, %arg6 : f32 loc(#loc146) + %new_weight = arith.addf %arg8, %arg11 : f32 loc(#loc147) + %w2_over_w = arith.cmpf oeq, %new_weight, %cst : f32 loc(#loc148) + %w2_over_w_8 = arith.divf %arg11, %new_weight : f32 loc(#loc149) + %w2_over_w_9 = arith.select %w2_over_w, %cst, %w2_over_w_8 : f32 loc(#loc150) + %1 = arith.mulf %delta, %w2_over_w_9 : f32 loc(#loc151) + %2 = arith.addf %arg6, %1 : f32 loc(#loc152) + %3 = arith.addf %arg7, %arg10 : f32 loc(#loc153) + %4 = arith.mulf %delta, %delta : f32 loc(#loc154) + %5 = arith.mulf %4, %arg8 : f32 loc(#loc155) + %6 = arith.mulf %5, %w2_over_w_9 : f32 loc(#loc156) + %7 = arith.addf %3, %6 : f32 loc(#loc157) + tt.reduce.return %2, %7, %new_weight : f32, f32, f32 loc(#loc104) + }) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc104) + %tmp3 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc110) + %tmp7 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc111) + scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc112) + %r0_index_8 = arith.addi %r0_index, %r0_base_7 : tensor<1x2048xi32> loc(#loc112) + %r0_mask = arith.cmpi slt, %r0_index_8, %cst_5 : tensor<1x2048xi32> loc(#loc113) + %tmp9 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc114) + %tmp9_9 = tt.addptr %tmp9, %r0_index_8 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc114) + %tmp9_10 = tt.load %tmp9_9, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc115) + %tmp9_11 = arith.extf %tmp9_10 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc116) + %tmp12 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc117) + %tmp12_12 = tt.splat %tmp12 : i32 -> tensor<1x2048xi32> loc(#loc158) + %tmp12_13 = arith.addi %r0_index_8, %tmp12_12 : tensor<1x2048xi32> loc(#loc118) + %tmp12_14 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc119) + %tmp12_15 = tt.addptr %tmp12_14, %tmp12_13 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc119) + %tmp12_16 = tt.splat %xmask_6 : i1 -> tensor<1x2048xi1> loc(#loc159) + %tmp12_17 = arith.andi %r0_mask, %tmp12_16 : tensor<1x2048xi1> loc(#loc120) + %tmp12_18 = tt.load %tmp12_15, %tmp12_17, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc121) + %tmp12_19 = arith.extf %tmp12_18 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc122) + %tmp23 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc123) + %tmp23_20 = tt.addptr %tmp23, %r0_index_8 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc123) + %tmp23_21 = tt.load %tmp23_20, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc124) + %tmp23_22 = arith.extf %tmp23_21 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc125) + %tmp11 = arith.addf %tmp9_11, %cst_4 : tensor<1x2048xf32> loc(#loc126) + %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc127) + %tmp14_23 = arith.subf %tmp12_19, %tmp14 : tensor<1x2048xf32> loc(#loc127) + %tmp16 = arith.divf %tmp7, %cst_3 : tensor<1x1xf32> loc(#loc128) + %tmp18 = arith.addf %tmp16, %cst_2 : tensor<1x1xf32> loc(#loc129) + %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc130) + %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc131) + %tmp20_24 = arith.mulf %tmp14_23, %tmp20 : tensor<1x2048xf32> loc(#loc131) + %tmp22 = arith.mulf %tmp11, %tmp20_24 : tensor<1x2048xf32> loc(#loc132) + %tmp24 = arith.addf %tmp22, %tmp23_22 : tensor<1x2048xf32> loc(#loc133) + %1 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc69) + %2 = tt.addptr %1, %tmp12_13 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc69) + %3 = arith.truncf %tmp24 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc70) + tt.store %2, %3, %tmp12_17 : tensor<1x2048x!tt.ptr> loc(#loc70) + } loc(#loc46) + tt.return loc(#loc71) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":25:21) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":23:28) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":26:27) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":32:43) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":33:31) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":34:29) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:46) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:41) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:34) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:61) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:51) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:112) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":42:62) +#loc17 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":42:51) +#loc19 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24) +#loc21 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34) +#loc23 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31) +#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":44:62) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":45:58) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":46:66) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":46:8) +#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc35 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc37 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc38 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc39 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc40 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc41 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc42 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc43 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":48:16) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":49:16) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":51:43) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":52:31) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":53:29) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:34) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:41) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:94) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:47) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:42) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:35) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:62) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:52) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:114) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:35) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:42) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:95) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":61:23) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":63:24) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":65:24) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":67:24) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":68:32) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":69:24) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":71:24) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":72:24) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:29) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:53) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":51:4) +#loc78 = loc("xmask"(#loc1)) +#loc80 = loc("xoffset"(#loc4)) +#loc81 = loc("r0_base"(#loc5)) +#loc82 = loc("r0_base"(#loc6)) +#loc83 = loc("tmp3_mean"(#loc7)) +#loc84 = loc("r0_index"(#loc8)) +#loc85 = loc("r0_mask"(#loc9)) +#loc86 = loc("tmp0"(#loc10)) +#loc87 = loc("tmp0"(#loc11)) +#loc88 = loc("tmp0"(#loc12)) +#loc89 = loc("tmp0"(#loc13)) +#loc90 = loc("tmp0"(#loc14)) +#loc91 = loc("tmp0"(#loc15)) +#loc92 = loc(callsite(#loc17 at #loc18)) +#loc93 = loc("new_m2"(#loc19)) +#loc94 = loc("delta"(#loc20)) +#loc95 = loc("new_weight"(#loc21)) +#loc96 = loc("new_mean"(#loc22)) +#loc97 = loc("new_mean"(#loc23)) +#loc98 = loc("new_m2"(#loc24)) +#loc99 = loc("new_m2"(#loc25)) +#loc100 = loc("new_m2"(#loc26)) +#loc101 = loc("tmp3_mean"(#loc27)) +#loc102 = loc("tmp3_m2"(#loc28)) +#loc103 = loc("tmp3_weight"(#loc29)) +#loc104 = loc(callsite(#loc31 at #loc3)) +#loc105 = loc("delta"(#loc32)) +#loc106 = loc("new_weight"(#loc33)) +#loc107 = loc("w2_over_w"(#loc34)) +#loc108 = loc("w2_over_w"(#loc35)) +#loc109 = loc("w2_over_w"(#loc36)) +#loc110 = loc("tmp3"(#loc44)) +#loc111 = loc("tmp7"(#loc45)) +#loc112 = loc("r0_index"(#loc47)) +#loc113 = loc("r0_mask"(#loc48)) +#loc114 = loc("tmp9"(#loc49)) +#loc115 = loc("tmp9"(#loc50)) +#loc116 = loc("tmp9"(#loc51)) +#loc117 = loc("tmp12"(#loc52)) +#loc118 = loc("tmp12"(#loc53)) +#loc119 = loc("tmp12"(#loc54)) +#loc120 = loc("tmp12"(#loc55)) +#loc121 = loc("tmp12"(#loc56)) +#loc122 = loc("tmp12"(#loc57)) +#loc123 = loc("tmp23"(#loc58)) +#loc124 = loc("tmp23"(#loc59)) +#loc125 = loc("tmp23"(#loc60)) +#loc126 = loc("tmp11"(#loc61)) +#loc127 = loc("tmp14"(#loc62)) +#loc128 = loc("tmp16"(#loc63)) +#loc129 = loc("tmp18"(#loc64)) +#loc130 = loc("tmp19"(#loc65)) +#loc131 = loc("tmp20"(#loc66)) +#loc132 = loc("tmp22"(#loc67)) +#loc133 = loc("tmp24"(#loc68)) +#loc134 = loc("tmp3_m2"(#loc83)) +#loc135 = loc(fused[#loc87, #loc86]) +#loc136 = loc(fused[#loc89, #loc78]) +#loc137 = loc("new_m2"(#loc93)) +#loc138 = loc(callsite(#loc94 at #loc18)) +#loc139 = loc("new_weight"(#loc95)) +#loc140 = loc(callsite(#loc96 at #loc18)) +#loc141 = loc("new_mean"(#loc97)) +#loc142 = loc(callsite(#loc98 at #loc18)) +#loc143 = loc(callsite(#loc99 at #loc18)) +#loc144 = loc("new_m2"(#loc100)) +#loc145 = loc(callsite(#loc100 at #loc18)) +#loc146 = loc(callsite(#loc105 at #loc104)) +#loc147 = loc(callsite(#loc106 at #loc104)) +#loc148 = loc(callsite(#loc107 at #loc104)) +#loc149 = loc(callsite(#loc108 at #loc104)) +#loc150 = loc(callsite(#loc109 at #loc104)) +#loc151 = loc(callsite(#loc37 at #loc104)) +#loc152 = loc(callsite(#loc38 at #loc104)) +#loc153 = loc(callsite(#loc39 at #loc104)) +#loc154 = loc(callsite(#loc40 at #loc104)) +#loc155 = loc(callsite(#loc41 at #loc104)) +#loc156 = loc(callsite(#loc42 at #loc104)) +#loc157 = loc(callsite(#loc43 at #loc104)) +#loc158 = loc(fused[#loc118, #loc117]) +#loc159 = loc(fused[#loc120, #loc78]) +#loc160 = loc("tmp3_weight"(#loc134)) +#loc161 = loc(callsite(#loc137 at #loc18)) +#loc162 = loc(callsite(#loc139 at #loc18)) +#loc163 = loc(callsite(#loc141 at #loc18)) +#loc164 = loc(callsite(#loc144 at #loc18)) diff --git a/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/__grp__triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/__grp__triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b440f6895dc388cb40c3999ebd3b8ee37e8c8fd5 --- /dev/null +++ b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/__grp__triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.source", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttir", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttgir", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.llir", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ptx", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.cubin", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json"}} \ No newline at end of file diff --git a/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.cubin b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..41afac6714899d52b2b8f5db21d1797580c8e5f9 Binary files /dev/null and b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.cubin differ diff --git a/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9d4aac61231b80282dbad2829700cf1cb258cc6b --- /dev/null +++ b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json @@ -0,0 +1 @@ +{"hash": "92745eb484786459cdf592edad39c8cc23d2be7dc5a548b0b179d760173da87e", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1"} \ No newline at end of file diff --git a/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.llir b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..5331a368fe28413c50f3abcc954b469f66ad1578 --- /dev/null +++ b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.llir @@ -0,0 +1,123 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = shl i32 %7, 10, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = shl nuw nsw i32 %9, 3, !dbg !9 + %11 = and i32 %10, 1016, !dbg !9 + %12 = or disjoint i32 %11, %8, !dbg !10 + %13 = or i32 %8, %10, !dbg !9 + %14 = or disjoint i32 %13, 1, !dbg !10 + %15 = or disjoint i32 %13, 2, !dbg !10 + %16 = or disjoint i32 %13, 3, !dbg !10 + %17 = or disjoint i32 %13, 4, !dbg !10 + %18 = or disjoint i32 %13, 5, !dbg !10 + %19 = or disjoint i32 %13, 6, !dbg !10 + %20 = or disjoint i32 %13, 7, !dbg !10 + %21 = sdiv i32 %12, 128, !dbg !11 + %22 = mul i32 %21, 128, !dbg !12 + %.decomposed = sub i32 %12, %22, !dbg !12 + %23 = srem i32 %14, 128, !dbg !12 + %24 = srem i32 %15, 128, !dbg !12 + %25 = srem i32 %16, 128, !dbg !12 + %26 = srem i32 %17, 128, !dbg !12 + %27 = srem i32 %18, 128, !dbg !12 + %28 = srem i32 %19, 128, !dbg !12 + %29 = srem i32 %20, 128, !dbg !12 + %30 = srem i32 %21, 2304, !dbg !13 + %31 = sdiv i32 %12, 294912, !dbg !14 + %32 = shl nsw i32 %31, 7, !dbg !15 + %33 = add nsw i32 %32, %.decomposed, !dbg !16 + %34 = add nsw i32 %32, %23, !dbg !16 + %35 = add nsw i32 %32, %24, !dbg !16 + %36 = add nsw i32 %32, %25, !dbg !16 + %37 = add nsw i32 %32, %26, !dbg !16 + %38 = add nsw i32 %32, %27, !dbg !16 + %39 = add nsw i32 %32, %28, !dbg !16 + %40 = add nsw i32 %32, %29, !dbg !16 + %41 = sext i32 %30 to i64, !dbg !17 + %42 = mul i64 %2, %41, !dbg !17 + %43 = sext i32 %33 to i64, !dbg !18 + %44 = sext i32 %34 to i64, !dbg !18 + %45 = sext i32 %35 to i64, !dbg !18 + %46 = sext i32 %36 to i64, !dbg !18 + %47 = sext i32 %37 to i64, !dbg !18 + %48 = sext i32 %38 to i64, !dbg !18 + %49 = sext i32 %39 to i64, !dbg !18 + %50 = sext i32 %40 to i64, !dbg !18 + %51 = getelementptr bfloat, ptr addrspace(1) %0, i64 %42, !dbg !19 + %52 = getelementptr bfloat, ptr addrspace(1) %51, i64 %43, !dbg !19 + %53 = getelementptr bfloat, ptr addrspace(1) %51, i64 %44, !dbg !19 + %54 = getelementptr bfloat, ptr addrspace(1) %51, i64 %45, !dbg !19 + %55 = getelementptr bfloat, ptr addrspace(1) %51, i64 %46, !dbg !19 + %56 = getelementptr bfloat, ptr addrspace(1) %51, i64 %47, !dbg !19 + %57 = getelementptr bfloat, ptr addrspace(1) %51, i64 %48, !dbg !19 + %58 = getelementptr bfloat, ptr addrspace(1) %51, i64 %49, !dbg !19 + %59 = getelementptr bfloat, ptr addrspace(1) %51, i64 %50, !dbg !19 + %60 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09ld.global.b16 { $0 }, [ $1 + 0 ];", "=c,l"(ptr addrspace(1) %52) #2, !dbg !20 + %61 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09ld.global.b16 { $0 }, [ $1 + 0 ];", "=c,l"(ptr addrspace(1) %53) #2, !dbg !20 + %62 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09ld.global.b16 { $0 }, [ $1 + 0 ];", "=c,l"(ptr addrspace(1) %54) #2, !dbg !20 + %63 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09ld.global.b16 { $0 }, [ $1 + 0 ];", "=c,l"(ptr addrspace(1) %55) #2, !dbg !20 + %64 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09ld.global.b16 { $0 }, [ $1 + 0 ];", "=c,l"(ptr addrspace(1) %56) #2, !dbg !20 + %65 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09ld.global.b16 { $0 }, [ $1 + 0 ];", "=c,l"(ptr addrspace(1) %57) #2, !dbg !20 + %66 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09ld.global.b16 { $0 }, [ $1 + 0 ];", "=c,l"(ptr addrspace(1) %58) #2, !dbg !20 + %67 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09ld.global.b16 { $0 }, [ $1 + 0 ];", "=c,l"(ptr addrspace(1) %59) #2, !dbg !20 + %68 = sext i32 %12 to i64, !dbg !21 + %69 = getelementptr bfloat, ptr addrspace(1) %1, i64 %68, !dbg !21 + %70 = insertelement <2 x i16> poison, i16 %60, i64 0, !dbg !22 + %71 = insertelement <2 x i16> %70, i16 %61, i64 1, !dbg !22 + %72 = bitcast <2 x i16> %71 to i32, !dbg !22 + %73 = insertelement <2 x i16> poison, i16 %62, i64 0, !dbg !22 + %74 = insertelement <2 x i16> %73, i16 %63, i64 1, !dbg !22 + %75 = bitcast <2 x i16> %74 to i32, !dbg !22 + %76 = insertelement <2 x i16> poison, i16 %64, i64 0, !dbg !22 + %77 = insertelement <2 x i16> %76, i16 %65, i64 1, !dbg !22 + %78 = bitcast <2 x i16> %77 to i32, !dbg !22 + %79 = insertelement <2 x i16> poison, i16 %66, i64 0, !dbg !22 + %80 = insertelement <2 x i16> %79, i16 %67, i64 1, !dbg !22 + %81 = bitcast <2 x i16> %80 to i32, !dbg !22 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %72, i32 %75, i32 %78, i32 %81, ptr addrspace(1) %69) #2, !dbg !22 + ret void, !dbg !23 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1", linkageName: "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 21, scope: !4) +!12 = !DILocation(line: 23, column: 19, scope: !4) +!13 = !DILocation(line: 24, column: 28, scope: !4) +!14 = !DILocation(line: 25, column: 19, scope: !4) +!15 = !DILocation(line: 27, column: 39, scope: !4) +!16 = !DILocation(line: 27, column: 35, scope: !4) +!17 = !DILocation(line: 27, column: 48, scope: !4) +!18 = !DILocation(line: 27, column: 44, scope: !4) +!19 = !DILocation(line: 27, column: 30, scope: !4) +!20 = !DILocation(line: 27, column: 53, scope: !4) +!21 = !DILocation(line: 28, column: 25, scope: !4) +!22 = !DILocation(line: 28, column: 36, scope: !4) +!23 = !DILocation(line: 28, column: 4, scope: !4) diff --git a/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ptx b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..dbcbbe9ecfff9d5358bf42cde236b49b4e68b7e3 --- /dev/null +++ b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ptx @@ -0,0 +1,412 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1 // -- Begin function triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1 + // @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1 +.visible .entry triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1( + .param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_1, + .param .u64 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_2, + .param .u32 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_4, + .param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_5 +) +.reqntid 128 +{ + .reg .b16 %rs<9>; + .reg .b32 %r<65>; + .reg .b64 %rd<17>; + .loc 1 18 0 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:18:0 + +// %bb.0: + ld.param.b64 %rd10, [triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_0]; + ld.param.b64 %rd11, [triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_1]; +$L__tmp0: + .loc 1 20 28 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:20:28 + mov.u32 %r5, %ctaid.x; + .loc 1 20 33 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:20:33 + shl.b32 %r6, %r5, 10; + ld.param.b64 %rd12, [triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_2]; + .loc 1 21 36 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:21:36 + mov.u32 %r7, %tid.x; + shl.b32 %r8, %r7, 3; + and.b32 %r9, %r8, 1016; + .loc 1 21 23 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:21:23 + or.b32 %r10, %r9, %r6; + .loc 1 21 36 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:21:36 + or.b32 %r11, %r6, %r8; + .loc 1 21 23 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:21:23 + or.b32 %r12, %r11, 1; + or.b32 %r13, %r11, 2; + or.b32 %r14, %r11, 3; + or.b32 %r15, %r11, 4; + or.b32 %r16, %r11, 5; + or.b32 %r17, %r11, 6; + or.b32 %r18, %r11, 7; + .loc 1 24 21 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:24:21 + bfe.s32 %r19, %r5, 21, 1; + .loc 1 23 19 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:23:19 + shr.u32 %r20, %r19, 25; + .loc 1 24 21 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:24:21 + add.s32 %r21, %r10, %r20; + shr.s32 %r22, %r21, 7; + .loc 1 23 19 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:23:19 + and.b32 %r23, %r21, -128; + sub.s32 %r24, %r10, %r23; + add.s32 %r25, %r12, %r20; + and.b32 %r26, %r25, -128; + sub.s32 %r27, %r12, %r26; + add.s32 %r28, %r13, %r20; + and.b32 %r29, %r28, -128; + sub.s32 %r30, %r13, %r29; + add.s32 %r31, %r14, %r20; + and.b32 %r32, %r31, -128; + sub.s32 %r33, %r14, %r32; + add.s32 %r34, %r15, %r20; + and.b32 %r35, %r34, -128; + sub.s32 %r36, %r15, %r35; + add.s32 %r37, %r16, %r20; + and.b32 %r38, %r37, -128; + sub.s32 %r39, %r16, %r38; + add.s32 %r40, %r17, %r20; + and.b32 %r41, %r40, -128; + sub.s32 %r42, %r17, %r41; + add.s32 %r43, %r18, %r20; + and.b32 %r44, %r43, -128; + sub.s32 %r45, %r18, %r44; + .loc 1 24 28 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:24:28 + mul.hi.s32 %r46, %r22, 954437177; + shr.u32 %r47, %r46, 31; + shr.s32 %r48, %r46, 9; + add.s32 %r49, %r48, %r47; + mul.lo.s32 %r50, %r49, 2304; + sub.s32 %r51, %r22, %r50; + .loc 1 25 19 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:25:19 + mul.hi.s32 %r52, %r10, 954437177; + shr.u32 %r53, %r52, 31; + shr.s32 %r54, %r52, 16; + add.s32 %r55, %r54, %r53; + .loc 1 27 39 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:27:39 + shl.b32 %r56, %r55, 7; + .loc 1 27 35 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:27:35 + add.s32 %r57, %r56, %r24; + add.s32 %r58, %r56, %r27; + add.s32 %r59, %r56, %r30; + add.s32 %r60, %r56, %r33; + add.s32 %r61, %r56, %r36; + add.s32 %r62, %r56, %r39; + add.s32 %r63, %r56, %r42; + add.s32 %r64, %r56, %r45; + .loc 1 27 48 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:27:48 + cvt.s64.s32 %rd13, %r51; + mul.lo.s64 %rd14, %rd12, %rd13; + .loc 1 27 30 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:27:30 + shl.b64 %rd15, %rd14, 1; + add.s64 %rd16, %rd10, %rd15; + mad.wide.s32 %rd1, %r57, 2, %rd16; + mad.wide.s32 %rd2, %r58, 2, %rd16; + mad.wide.s32 %rd3, %r59, 2, %rd16; + mad.wide.s32 %rd4, %r60, 2, %rd16; + mad.wide.s32 %rd5, %r61, 2, %rd16; + mad.wide.s32 %rd6, %r62, 2, %rd16; + mad.wide.s32 %rd7, %r63, 2, %rd16; + mad.wide.s32 %rd8, %r64, 2, %rd16; + .loc 1 27 53 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:27:53 + // begin inline asm + mov.u16 %rs1, 0x0; + ld.global.b16 { %rs1 }, [ %rd1 + 0 ]; + // end inline asm + // begin inline asm + mov.u16 %rs2, 0x0; + ld.global.b16 { %rs2 }, [ %rd2 + 0 ]; + // end inline asm + // begin inline asm + mov.u16 %rs3, 0x0; + ld.global.b16 { %rs3 }, [ %rd3 + 0 ]; + // end inline asm + // begin inline asm + mov.u16 %rs4, 0x0; + ld.global.b16 { %rs4 }, [ %rd4 + 0 ]; + // end inline asm + // begin inline asm + mov.u16 %rs5, 0x0; + ld.global.b16 { %rs5 }, [ %rd5 + 0 ]; + // end inline asm + // begin inline asm + mov.u16 %rs6, 0x0; + ld.global.b16 { %rs6 }, [ %rd6 + 0 ]; + // end inline asm + // begin inline asm + mov.u16 %rs7, 0x0; + ld.global.b16 { %rs7 }, [ %rd7 + 0 ]; + // end inline asm + // begin inline asm + mov.u16 %rs8, 0x0; + ld.global.b16 { %rs8 }, [ %rd8 + 0 ]; + // end inline asm + .loc 1 28 25 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:28:25 + mad.wide.s32 %rd9, %r10, 2, %rd11; + .loc 1 28 36 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:28:36 + mov.b32 %r1, {%rs1, %rs2}; + mov.b32 %r2, {%rs3, %rs4}; + mov.b32 %r3, {%rs5, %rs6}; + mov.b32 %r4, {%rs7, %rs8}; + // begin inline asm + st.global.v4.b32 [ %rd9 + 0 ], { %r1, %r2, %r3, %r4 }; + // end inline asm + .loc 1 28 4 // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:28:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 51 +.b8 118 +.b8 106 +.b8 105 +.b8 108 +.b8 118 +.b8 99 +.b8 121 +.b8 55 +.b8 115 +.b8 100 +.b8 113 +.b8 99 +.b8 97 +.b8 120 +.b8 102 +.b8 115 +.b8 112 +.b8 102 +.b8 102 +.b8 97 +.b8 100 +.b8 98 +.b8 115 +.b8 114 +.b8 121 +.b8 51 +.b8 115 +.b8 113 +.b8 109 +.b8 52 +.b8 106 +.b8 55 +.b8 113 +.b8 112 +.b8 54 +.b8 117 +.b8 51 +.b8 116 +.b8 117 +.b8 115 +.b8 114 +.b8 54 +.b8 112 +.b8 51 +.b8 52 +.b8 115 +.b8 98 +.b8 105 +.b8 97 +.b8 114 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 51 +.b8 118 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.source b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.source new file mode 100644 index 0000000000000000000000000000000000000000..199c64a2732f37f7f42b0fb3f8cc301c559bcc0a --- /dev/null +++ b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.source @@ -0,0 +1,91 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":18:0) +#loc21 = loc("in_ptr0"(#loc)) +#loc22 = loc("out_ptr0"(#loc)) +#loc23 = loc("ks0"(#loc)) +#loc24 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 9437184 : i32 loc(#loc25) + %xoffset = tt.get_program_id x : i32 loc(#loc26) + %xoffset_1 = arith.constant 1024 : i32 loc(#loc27) + %xoffset_2 = arith.constant 1024 : i32 loc(#loc27) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc27) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc28) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc29) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc29) + %xmask = arith.constant true loc(#loc30) + %xmask_6 = arith.constant dense : tensor<1024xi1> loc(#loc30) + %x0 = arith.constant 128 : i32 loc(#loc31) + %x0_7 = arith.constant 128 : i32 loc(#loc31) + %x0_8 = arith.constant dense<128> : tensor<1024xi32> loc(#loc31) + %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<1024xi32> loc(#loc31) + %x1 = arith.constant 128 : i32 loc(#loc32) + %x1_10 = arith.constant 128 : i32 loc(#loc32) + %x1_11 = arith.constant dense<128> : tensor<1024xi32> loc(#loc32) + %x1_12 = arith.divsi %xindex_5, %x1_11 : tensor<1024xi32> loc(#loc32) + %x1_13 = arith.constant 2304 : i32 loc(#loc33) + %x1_14 = arith.constant 2304 : i32 loc(#loc33) + %x1_15 = arith.constant dense<2304> : tensor<1024xi32> loc(#loc33) + %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<1024xi32> loc(#loc33) + %x2 = arith.constant 294912 : i32 loc(#loc34) + %x2_17 = arith.constant 294912 : i32 loc(#loc34) + %x2_18 = arith.constant dense<294912> : tensor<1024xi32> loc(#loc34) + %x2_19 = arith.divsi %xindex_5, %x2_18 : tensor<1024xi32> loc(#loc34) + %tmp0 = arith.constant 128 : i32 loc(#loc35) + %tmp0_20 = arith.constant 128 : i32 loc(#loc35) + %tmp0_21 = arith.constant dense<128> : tensor<1024xi32> loc(#loc35) + %tmp0_22 = arith.muli %tmp0_21, %x2_19 : tensor<1024xi32> loc(#loc35) + %tmp0_23 = arith.addi %x0_9, %tmp0_22 : tensor<1024xi32> loc(#loc36) + %tmp0_24 = arith.extsi %x1_16 : tensor<1024xi32> to tensor<1024xi64> loc(#loc37) + %tmp0_25 = tt.splat %ks0 : i64 -> tensor<1024xi64> loc(#loc37) + %tmp0_26 = arith.muli %tmp0_25, %tmp0_24 : tensor<1024xi64> loc(#loc37) + %tmp0_27 = arith.extsi %tmp0_23 : tensor<1024xi32> to tensor<1024xi64> loc(#loc38) + %tmp0_28 = arith.addi %tmp0_27, %tmp0_26 : tensor<1024xi64> loc(#loc38) + %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc39) + %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc39) + %tmp0_31 = tt.load %tmp0_30 : tensor<1024x!tt.ptr> loc(#loc40) + %tmp0_32 = arith.extf %tmp0_31 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc41) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc18) + %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc18) + %2 = arith.truncf %tmp0_32 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc19) + tt.store %1, %2 : tensor<1024x!tt.ptr> loc(#loc19) + tt.return loc(#loc20) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":22:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":23:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":24:21) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":24:28) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":25:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:39) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:35) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:48) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:44) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:30) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:53) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:62) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:25) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:36) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:4) +#loc25 = loc("xnumel"(#loc1)) +#loc26 = loc("xoffset"(#loc2)) +#loc27 = loc("xoffset"(#loc3)) +#loc28 = loc("xindex"(#loc4)) +#loc29 = loc("xindex"(#loc5)) +#loc30 = loc("xmask"(#loc6)) +#loc31 = loc("x0"(#loc7)) +#loc32 = loc("x1"(#loc8)) +#loc33 = loc("x1"(#loc9)) +#loc34 = loc("x2"(#loc10)) +#loc35 = loc("tmp0"(#loc11)) +#loc36 = loc("tmp0"(#loc12)) +#loc37 = loc("tmp0"(#loc13)) +#loc38 = loc("tmp0"(#loc14)) +#loc39 = loc("tmp0"(#loc15)) +#loc40 = loc("tmp0"(#loc16)) +#loc41 = loc("tmp0"(#loc17)) diff --git a/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttgir b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..3af54dbba1279f97ad547ccf9297191e53fb9d47 --- /dev/null +++ b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttgir @@ -0,0 +1,69 @@ +#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":18:0) +#loc19 = loc("in_ptr0"(#loc)) +#loc20 = loc("out_ptr0"(#loc)) +#loc21 = loc("ks0"(#loc)) +#loc22 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1024xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<2304> : tensor<1024xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<294912> : tensor<1024xi32, #blocked> loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc23) + %xoffset_2 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc24) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc25) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<1024xi32, #blocked> loc(#loc26) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<1024xi32, #blocked> loc(#loc26) + %x0 = arith.remsi %xindex_4, %cst : tensor<1024xi32, #blocked> loc(#loc27) + %x1 = arith.divsi %xindex_4, %cst : tensor<1024xi32, #blocked> loc(#loc28) + %x1_5 = arith.remsi %x1, %cst_0 : tensor<1024xi32, #blocked> loc(#loc29) + %x2 = arith.divsi %xindex_4, %cst_1 : tensor<1024xi32, #blocked> loc(#loc30) + %tmp0 = arith.muli %x2, %cst : tensor<1024xi32, #blocked> loc(#loc31) + %tmp0_6 = arith.addi %x0, %tmp0 : tensor<1024xi32, #blocked> loc(#loc32) + %tmp0_7 = arith.extsi %x1_5 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked> loc(#loc33) + %tmp0_8 = tt.splat %ks0 : i64 -> tensor<1024xi64, #blocked> loc(#loc33) + %tmp0_9 = arith.muli %tmp0_8, %tmp0_7 : tensor<1024xi64, #blocked> loc(#loc33) + %tmp0_10 = arith.extsi %tmp0_6 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked> loc(#loc34) + %tmp0_11 = arith.addi %tmp0_10, %tmp0_9 : tensor<1024xi64, #blocked> loc(#loc34) + %tmp0_12 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc35) + %tmp0_13 = tt.addptr %tmp0_12, %tmp0_11 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc35) + %tmp0_14 = tt.load %tmp0_13 : tensor<1024x!tt.ptr, #blocked> loc(#loc36) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc16) + %1 = tt.addptr %0, %xindex_4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc16) + tt.store %1, %tmp0_14 : tensor<1024x!tt.ptr, #blocked> loc(#loc17) + tt.return loc(#loc18) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":23:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":24:21) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":24:28) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":25:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:39) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:48) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:44) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:53) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:25) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:36) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:4) +#loc23 = loc("xoffset"(#loc2)) +#loc24 = loc("xoffset"(#loc3)) +#loc25 = loc("xindex"(#loc4)) +#loc26 = loc("xindex"(#loc5)) +#loc27 = loc("x0"(#loc6)) +#loc28 = loc("x1"(#loc7)) +#loc29 = loc("x1"(#loc8)) +#loc30 = loc("x2"(#loc9)) +#loc31 = loc("tmp0"(#loc10)) +#loc32 = loc("tmp0"(#loc11)) +#loc33 = loc("tmp0"(#loc12)) +#loc34 = loc("tmp0"(#loc13)) +#loc35 = loc("tmp0"(#loc14)) +#loc36 = loc("tmp0"(#loc15)) diff --git a/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttir b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..695072be3d76b7acc3a4f38b021280208523f65e --- /dev/null +++ b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttir @@ -0,0 +1,68 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":18:0) +#loc19 = loc("in_ptr0"(#loc)) +#loc20 = loc("out_ptr0"(#loc)) +#loc21 = loc("ks0"(#loc)) +#loc22 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %x2 = arith.constant dense<294912> : tensor<1024xi32> loc(#loc23) + %x1 = arith.constant dense<2304> : tensor<1024xi32> loc(#loc24) + %cst = arith.constant dense<128> : tensor<1024xi32> loc(#loc3) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc3) + %xoffset = tt.get_program_id x : i32 loc(#loc25) + %xoffset_0 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc26) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc27) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<1024xi32> loc(#loc28) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<1024xi32> loc(#loc28) + %x0 = arith.remsi %xindex_2, %cst : tensor<1024xi32> loc(#loc29) + %x1_3 = arith.divsi %xindex_2, %cst : tensor<1024xi32> loc(#loc30) + %x1_4 = arith.remsi %x1_3, %x1 : tensor<1024xi32> loc(#loc24) + %x2_5 = arith.divsi %xindex_2, %x2 : tensor<1024xi32> loc(#loc23) + %tmp0 = arith.muli %x2_5, %cst : tensor<1024xi32> loc(#loc31) + %tmp0_6 = arith.addi %x0, %tmp0 : tensor<1024xi32> loc(#loc32) + %tmp0_7 = arith.extsi %x1_4 : tensor<1024xi32> to tensor<1024xi64> loc(#loc33) + %tmp0_8 = tt.splat %ks0 : i64 -> tensor<1024xi64> loc(#loc33) + %tmp0_9 = arith.muli %tmp0_8, %tmp0_7 : tensor<1024xi64> loc(#loc33) + %tmp0_10 = arith.extsi %tmp0_6 : tensor<1024xi32> to tensor<1024xi64> loc(#loc34) + %tmp0_11 = arith.addi %tmp0_10, %tmp0_9 : tensor<1024xi64> loc(#loc34) + %tmp0_12 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc35) + %tmp0_13 = tt.addptr %tmp0_12, %tmp0_11 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc35) + %tmp0_14 = tt.load %tmp0_13 : tensor<1024x!tt.ptr> loc(#loc36) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc16) + %1 = tt.addptr %0, %xindex_2 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc16) + tt.store %1, %tmp0_14 : tensor<1024x!tt.ptr> loc(#loc17) + tt.return loc(#loc18) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":25:19) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":24:28) +#loc3 = loc(unknown) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":20:28) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":20:33) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":21:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":21:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":23:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":24:21) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:39) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:48) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:44) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:53) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:25) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:36) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:4) +#loc23 = loc("x2"(#loc1)) +#loc24 = loc("x1"(#loc2)) +#loc25 = loc("xoffset"(#loc4)) +#loc26 = loc("xoffset"(#loc5)) +#loc27 = loc("xindex"(#loc6)) +#loc28 = loc("xindex"(#loc7)) +#loc29 = loc("x0"(#loc8)) +#loc30 = loc("x1"(#loc9)) +#loc31 = loc("tmp0"(#loc10)) +#loc32 = loc("tmp0"(#loc11)) +#loc33 = loc("tmp0"(#loc12)) +#loc34 = loc("tmp0"(#loc13)) +#loc35 = loc("tmp0"(#loc14)) +#loc36 = loc("tmp0"(#loc15)) diff --git a/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/__grp__triton_poi_fused_clone_0.json b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/__grp__triton_poi_fused_clone_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5a91cb061705d4a758630b7bef07113222f472eb --- /dev/null +++ b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/__grp__triton_poi_fused_clone_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_clone_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.source", "triton_poi_fused_clone_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.ttir", "triton_poi_fused_clone_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.ttgir", "triton_poi_fused_clone_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.llir", "triton_poi_fused_clone_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.ptx", "triton_poi_fused_clone_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.cubin", "triton_poi_fused_clone_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.json"}} \ No newline at end of file diff --git a/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.cubin b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..055052cc157b8fe4afe03346b48c610808954ce5 Binary files /dev/null and b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.cubin differ diff --git a/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.json b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f3dd9bf66d959d8878c3edb690f9686c2a48590b --- /dev/null +++ b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.json @@ -0,0 +1 @@ +{"hash": "9590ed34b2c1f6d55fc4b2d376fecdd40fa57c936ed24d0611166eb42074b39c", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_clone_0"} \ No newline at end of file diff --git a/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.llir b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..4e49e98d7484e2bf4e86c108d16128fc6ef3b2ba --- /dev/null +++ b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.llir @@ -0,0 +1,49 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_clone_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %7 = shl i32 %6, 9, !dbg !8 + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %9 = shl nuw nsw i32 %8, 1, !dbg !9 + %10 = and i32 %9, 510, !dbg !9 + %11 = or disjoint i32 %10, %7, !dbg !10 + %12 = sext i32 %11 to i64, !dbg !11 + %13 = getelementptr bfloat, ptr addrspace(1) %0, i64 %12, !dbg !11 + %14 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %13) #2, !dbg !12 + %15 = getelementptr bfloat, ptr addrspace(1) %1, i64 %12, !dbg !13 + tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %14, ptr addrspace(1) %15) #2, !dbg !14 + ret void, !dbg !15 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_clone_0", linkageName: "triton_poi_fused_clone_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 30, scope: !4) +!12 = !DILocation(line: 24, column: 35, scope: !4) +!13 = !DILocation(line: 25, column: 25, scope: !4) +!14 = !DILocation(line: 25, column: 36, scope: !4) +!15 = !DILocation(line: 25, column: 4, scope: !4) diff --git a/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.ptx b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..600b92ae91bba525917638c0ed6e7d3e0a212bd0 --- /dev/null +++ b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.ptx @@ -0,0 +1,302 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused_clone_0 // -- Begin function triton_poi_fused_clone_0 + // @triton_poi_fused_clone_0 +.visible .entry triton_poi_fused_clone_0( + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_0_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_0_param_1, + .param .u32 triton_poi_fused_clone_0_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_0_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_0_param_4 +) +.reqntid 256 +{ + .reg .b32 %r<8>; + .reg .b64 %rd<6>; + .loc 1 18 0 // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:18:0 + +// %bb.0: + ld.param.b64 %rd3, [triton_poi_fused_clone_0_param_0]; + ld.param.b64 %rd4, [triton_poi_fused_clone_0_param_1]; +$L__tmp0: + .loc 1 20 28 // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:20:28 + mov.u32 %r2, %ctaid.x; + .loc 1 20 33 // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:20:33 + shl.b32 %r3, %r2, 9; + .loc 1 21 36 // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:21:36 + mov.u32 %r4, %tid.x; + shl.b32 %r5, %r4, 1; + and.b32 %r6, %r5, 510; + .loc 1 21 23 // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:21:23 + or.b32 %r7, %r6, %r3; + .loc 1 24 30 // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:24:30 + mul.wide.s32 %rd5, %r7, 2; + add.s64 %rd1, %rd3, %rd5; + .loc 1 24 35 // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:24:35 + // begin inline asm + mov.u32 %r1, 0x0; + ld.global.b32 { %r1 }, [ %rd1 + 0 ]; + // end inline asm + .loc 1 25 25 // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:25:25 + add.s64 %rd2, %rd4, %rd5; + .loc 1 25 36 // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:25:36 + // begin inline asm + st.global.b32 [ %rd2 + 0 ], { %r1 }; + // end inline asm + .loc 1 25 4 // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:25:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 99 +.b8 122 +.b8 103 +.b8 55 +.b8 116 +.b8 112 +.b8 105 +.b8 116 +.b8 117 +.b8 112 +.b8 114 +.b8 119 +.b8 103 +.b8 113 +.b8 112 +.b8 117 +.b8 97 +.b8 106 +.b8 122 +.b8 121 +.b8 50 +.b8 110 +.b8 121 +.b8 108 +.b8 102 +.b8 107 +.b8 52 +.b8 51 +.b8 109 +.b8 100 +.b8 111 +.b8 122 +.b8 100 +.b8 53 +.b8 118 +.b8 119 +.b8 111 +.b8 55 +.b8 55 +.b8 109 +.b8 117 +.b8 113 +.b8 51 +.b8 107 +.b8 111 +.b8 115 +.b8 112 +.b8 110 +.b8 102 +.b8 55 +.b8 98 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 99 +.b8 122 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.source b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.source new file mode 100644 index 0000000000000000000000000000000000000000..0278f8e8405a4b1bfa1acbdcd0f84a4a64473918 --- /dev/null +++ b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.source @@ -0,0 +1,48 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":18:0) +#loc13 = loc("in_ptr0"(#loc)) +#loc14 = loc("out_ptr0"(#loc)) +#loc15 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_clone_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 8388608 : i32 loc(#loc16) + %xoffset = tt.get_program_id x : i32 loc(#loc17) + %xoffset_1 = arith.constant 512 : i32 loc(#loc18) + %xoffset_2 = arith.constant 512 : i32 loc(#loc18) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc18) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc19) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc20) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc20) + %xmask = arith.constant true loc(#loc21) + %xmask_6 = arith.constant dense : tensor<512xi1> loc(#loc21) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc22) + %tmp0_7 = tt.addptr %tmp0, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc22) + %tmp0_8 = tt.load %tmp0_7 : tensor<512x!tt.ptr> loc(#loc23) + %tmp0_9 = arith.extf %tmp0_8 : tensor<512xbf16> to tensor<512xf32> loc(#loc24) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc10) + %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc10) + %2 = arith.truncf %tmp0_9 : tensor<512xf32> to tensor<512xbf16> loc(#loc11) + tt.store %1, %2 : tensor<512x!tt.ptr> loc(#loc11) + tt.return loc(#loc12) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":22:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:30) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:35) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:44) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:25) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:36) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:4) +#loc16 = loc("xnumel"(#loc1)) +#loc17 = loc("xoffset"(#loc2)) +#loc18 = loc("xoffset"(#loc3)) +#loc19 = loc("xindex"(#loc4)) +#loc20 = loc("xindex"(#loc5)) +#loc21 = loc("xmask"(#loc6)) +#loc22 = loc("tmp0"(#loc7)) +#loc23 = loc("tmp0"(#loc8)) +#loc24 = loc("tmp0"(#loc9)) diff --git a/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.ttgir b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..9e0bdd3bd4ab82744d77aea6cd7952c3e50475b5 --- /dev/null +++ b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.ttgir @@ -0,0 +1,38 @@ +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":18:0) +#loc11 = loc("in_ptr0"(#loc)) +#loc12 = loc("out_ptr0"(#loc)) +#loc13 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_clone_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc14) + %xoffset_0 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc15) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc16) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<512xi32, #blocked> loc(#loc17) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<512xi32, #blocked> loc(#loc17) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc18) + %tmp0_3 = tt.addptr %tmp0, %xindex_2 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc18) + %tmp0_4 = tt.load %tmp0_3 : tensor<512x!tt.ptr, #blocked> loc(#loc19) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc8) + %1 = tt.addptr %0, %xindex_2 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc8) + tt.store %1, %tmp0_4 : tensor<512x!tt.ptr, #blocked> loc(#loc9) + tt.return loc(#loc10) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:30) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:35) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:25) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:36) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:4) +#loc14 = loc("xoffset"(#loc2)) +#loc15 = loc("xoffset"(#loc3)) +#loc16 = loc("xindex"(#loc4)) +#loc17 = loc("xindex"(#loc5)) +#loc18 = loc("tmp0"(#loc6)) +#loc19 = loc("tmp0"(#loc7)) diff --git a/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.ttir b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..103f351850775c2f0700eff25ca1a4616c55313d --- /dev/null +++ b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.ttir @@ -0,0 +1,37 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":18:0) +#loc11 = loc("in_ptr0"(#loc)) +#loc12 = loc("out_ptr0"(#loc)) +#loc13 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_clone_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc14) + %xoffset_0 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc15) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc16) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<512xi32> loc(#loc17) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<512xi32> loc(#loc17) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc18) + %tmp0_3 = tt.addptr %tmp0, %xindex_2 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc18) + %tmp0_4 = tt.load %tmp0_3 : tensor<512x!tt.ptr> loc(#loc19) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc8) + %1 = tt.addptr %0, %xindex_2 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc8) + tt.store %1, %tmp0_4 : tensor<512x!tt.ptr> loc(#loc9) + tt.return loc(#loc10) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:30) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:35) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:25) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:36) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:4) +#loc14 = loc("xoffset"(#loc2)) +#loc15 = loc("xoffset"(#loc3)) +#loc16 = loc("xindex"(#loc4)) +#loc17 = loc("xindex"(#loc5)) +#loc18 = loc("tmp0"(#loc6)) +#loc19 = loc("tmp0"(#loc7)) diff --git a/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/__grp__triton_red_fused_add_mul_native_layer_norm_0.json b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/__grp__triton_red_fused_add_mul_native_layer_norm_0.json new file mode 100644 index 0000000000000000000000000000000000000000..083ac03b785483c0c6b04b854a1fcce889cd658c --- /dev/null +++ b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/__grp__triton_red_fused_add_mul_native_layer_norm_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_add_mul_native_layer_norm_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.source", "triton_red_fused_add_mul_native_layer_norm_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.ttir", "triton_red_fused_add_mul_native_layer_norm_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.ttgir", "triton_red_fused_add_mul_native_layer_norm_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.llir", "triton_red_fused_add_mul_native_layer_norm_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.ptx", "triton_red_fused_add_mul_native_layer_norm_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.cubin", "triton_red_fused_add_mul_native_layer_norm_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.json"}} \ No newline at end of file diff --git a/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.cubin b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..b1846ac86154d050ac5ae2d56051eb2aac1822d0 Binary files /dev/null and b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.cubin differ diff --git a/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.json b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.json new file mode 100644 index 0000000000000000000000000000000000000000..25a5084020b5275d21fb2a9f28e5c061831a4397 --- /dev/null +++ b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.json @@ -0,0 +1 @@ +{"hash": "9806a21d61634efc263f3b35c888fa94029b43a9db7412712bffad65533447c2", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 192, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_add_mul_native_layer_norm_0"} \ No newline at end of file diff --git a/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.llir b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..d8495bd5c7e7bb482050746dacb50e85af071fc1 --- /dev/null +++ b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.llir @@ -0,0 +1,601 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_add_mul_native_layer_norm_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 { +__nv_rsqrtf.exit: + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %12 = icmp samesign ult i32 %11, 2048, !dbg !9 + %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %14 = and i32 %13, 511, !dbg !10 + %15 = and i32 %13, 31, !dbg !10 + %16 = lshr i32 %14, 5, !dbg !10 + %17 = shl nuw nsw i32 %13, 3, !dbg !10 + %18 = and i32 %17, 4088, !dbg !10 + %19 = shl i32 %11, 12, !dbg !11 + %20 = or disjoint i32 %18, %19, !dbg !12 + %21 = sext i32 %20 to i64, !dbg !13 + %22 = getelementptr bfloat, ptr addrspace(1) %0, i64 %21, !dbg !13 + %23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %24 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %22, i64 %23, i1 %12) #6, !dbg !14 + %25 = extractvalue { i32, i32, i32, i32 } %24, 0, !dbg !14 + %26 = bitcast i32 %25 to <2 x bfloat>, !dbg !14 + %27 = extractvalue { i32, i32, i32, i32 } %24, 1, !dbg !14 + %28 = bitcast i32 %27 to <2 x bfloat>, !dbg !14 + %29 = extractvalue { i32, i32, i32, i32 } %24, 2, !dbg !14 + %30 = bitcast i32 %29 to <2 x bfloat>, !dbg !14 + %31 = extractvalue { i32, i32, i32, i32 } %24, 3, !dbg !14 + %32 = bitcast i32 %31 to <2 x bfloat>, !dbg !14 + %33 = zext nneg i32 %18 to i64, !dbg !15 + %34 = getelementptr bfloat, ptr addrspace(1) %1, i64 %33, !dbg !15 + %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !16 + %36 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %34, i64 %35, i1 true) #6, !dbg !16 + %37 = extractvalue { i32, i32, i32, i32 } %36, 0, !dbg !16 + %38 = bitcast i32 %37 to <2 x bfloat>, !dbg !16 + %39 = extractvalue { i32, i32, i32, i32 } %36, 1, !dbg !16 + %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !16 + %41 = extractvalue { i32, i32, i32, i32 } %36, 2, !dbg !16 + %42 = bitcast i32 %41 to <2 x bfloat>, !dbg !16 + %43 = extractvalue { i32, i32, i32, i32 } %36, 3, !dbg !16 + %44 = bitcast i32 %43 to <2 x bfloat>, !dbg !16 + %45 = getelementptr bfloat, ptr addrspace(1) %2, i64 %21, !dbg !17 + %46 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !18 + %47 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %45, i64 %46, i1 %12) #6, !dbg !18 + %48 = extractvalue { i32, i32, i32, i32 } %47, 0, !dbg !18 + %49 = bitcast i32 %48 to <2 x bfloat>, !dbg !18 + %50 = extractvalue { i32, i32, i32, i32 } %47, 1, !dbg !18 + %51 = bitcast i32 %50 to <2 x bfloat>, !dbg !18 + %52 = extractvalue { i32, i32, i32, i32 } %47, 2, !dbg !18 + %53 = bitcast i32 %52 to <2 x bfloat>, !dbg !18 + %54 = extractvalue { i32, i32, i32, i32 } %47, 3, !dbg !18 + %55 = bitcast i32 %54 to <2 x bfloat>, !dbg !18 + %56 = select i1 %12, float 1.000000e+00, float 0.000000e+00, !dbg !19 + %57 = getelementptr bfloat, ptr addrspace(1) %5, i64 %21, !dbg !20 + %58 = fpext <2 x bfloat> %26 to <2 x float>, !dbg !21 + %59 = fpext <2 x bfloat> %38 to <2 x float>, !dbg !22 + %60 = fpext <2 x bfloat> %49 to <2 x float>, !dbg !23 + %61 = fmul <2 x float> %59, %60, !dbg !24 + %62 = fadd <2 x float> %61, %58, !dbg !25 + %63 = extractelement <2 x float> %62, i64 0, !dbg !26 + %64 = select i1 %12, float %63, float 0.000000e+00, !dbg !26 + %65 = extractelement <2 x float> %62, i64 1, !dbg !26 + %66 = select i1 %12, float %65, float 0.000000e+00, !dbg !26 + %67 = fptrunc <2 x float> %62 to <2 x bfloat>, !dbg !27 + %68 = fpext <2 x bfloat> %28 to <2 x float>, !dbg !21 + %69 = fpext <2 x bfloat> %40 to <2 x float>, !dbg !22 + %70 = fpext <2 x bfloat> %51 to <2 x float>, !dbg !23 + %71 = fmul <2 x float> %69, %70, !dbg !24 + %72 = fadd <2 x float> %71, %68, !dbg !25 + %73 = extractelement <2 x float> %72, i64 0, !dbg !26 + %74 = select i1 %12, float %73, float 0.000000e+00, !dbg !26 + %75 = extractelement <2 x float> %72, i64 1, !dbg !26 + %76 = select i1 %12, float %75, float 0.000000e+00, !dbg !26 + %77 = fptrunc <2 x float> %72 to <2 x bfloat>, !dbg !27 + %78 = fpext <2 x bfloat> %30 to <2 x float>, !dbg !21 + %79 = fpext <2 x bfloat> %42 to <2 x float>, !dbg !22 + %80 = fpext <2 x bfloat> %53 to <2 x float>, !dbg !23 + %81 = fmul <2 x float> %79, %80, !dbg !24 + %82 = fadd <2 x float> %81, %78, !dbg !25 + %83 = extractelement <2 x float> %82, i64 0, !dbg !26 + %84 = select i1 %12, float %83, float 0.000000e+00, !dbg !26 + %85 = extractelement <2 x float> %82, i64 1, !dbg !26 + %86 = select i1 %12, float %85, float 0.000000e+00, !dbg !26 + %87 = fptrunc <2 x float> %82 to <2 x bfloat>, !dbg !27 + %88 = fpext <2 x bfloat> %32 to <2 x float>, !dbg !21 + %89 = fpext <2 x bfloat> %44 to <2 x float>, !dbg !22 + %90 = fpext <2 x bfloat> %55 to <2 x float>, !dbg !23 + %91 = fmul <2 x float> %89, %90, !dbg !24 + %92 = fadd <2 x float> %91, %88, !dbg !25 + %93 = extractelement <2 x float> %92, i64 0, !dbg !26 + %94 = select i1 %12, float %93, float 0.000000e+00, !dbg !26 + %95 = extractelement <2 x float> %92, i64 1, !dbg !26 + %96 = select i1 %12, float %95, float 0.000000e+00, !dbg !26 + %97 = fptrunc <2 x float> %92 to <2 x bfloat>, !dbg !27 + %98 = bitcast <2 x bfloat> %67 to i32, !dbg !27 + %99 = bitcast <2 x bfloat> %77 to i32, !dbg !27 + %100 = bitcast <2 x bfloat> %87 to i32, !dbg !27 + %101 = bitcast <2 x bfloat> %97 to i32, !dbg !27 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %98, i32 %99, i32 %100, i32 %101, ptr addrspace(1) %57, i1 %12) #6, !dbg !27 + %102 = fsub float %66, %64, !dbg !28 + %103 = select i1 %12, float 2.000000e+00, float 0.000000e+00, !dbg !34 + %104 = fcmp oeq float %103, 0.000000e+00, !dbg !35 + %105 = tail call float @llvm.nvvm.div.full(float %56, float %103), !dbg !36 + %106 = select i1 %104, float 0.000000e+00, float %105, !dbg !37 + %107 = fmul float %106, %102, !dbg !38 + %108 = fadd float %64, %107, !dbg !39 + %109 = fmul float %102, %102, !dbg !40 + %110 = fmul float %56, %109, !dbg !41 + %111 = fmul float %106, %110, !dbg !42 + %112 = fadd float %111, 0.000000e+00, !dbg !43 + %113 = fsub float %74, %108, !dbg !28 + %114 = select i1 %12, float 3.000000e+00, float 0.000000e+00, !dbg !34 + %115 = fcmp oeq float %114, 0.000000e+00, !dbg !35 + %116 = tail call float @llvm.nvvm.div.full(float %56, float %114), !dbg !36 + %117 = select i1 %115, float 0.000000e+00, float %116, !dbg !37 + %118 = fmul float %117, %113, !dbg !38 + %119 = fadd float %108, %118, !dbg !39 + %120 = fmul float %113, %113, !dbg !40 + %121 = fmul float %103, %120, !dbg !41 + %122 = fmul float %117, %121, !dbg !42 + %123 = fadd float %112, %122, !dbg !43 + %124 = fsub float %76, %119, !dbg !28 + %125 = select i1 %12, float 4.000000e+00, float 0.000000e+00, !dbg !34 + %126 = fcmp oeq float %125, 0.000000e+00, !dbg !35 + %127 = tail call float @llvm.nvvm.div.full(float %56, float %125), !dbg !36 + %128 = select i1 %126, float 0.000000e+00, float %127, !dbg !37 + %129 = fmul float %128, %124, !dbg !38 + %130 = fadd float %119, %129, !dbg !39 + %131 = fmul float %124, %124, !dbg !40 + %132 = fmul float %114, %131, !dbg !41 + %133 = fmul float %128, %132, !dbg !42 + %134 = fadd float %123, %133, !dbg !43 + %135 = fsub float %84, %130, !dbg !28 + %136 = select i1 %12, float 5.000000e+00, float 0.000000e+00, !dbg !34 + %137 = fcmp oeq float %136, 0.000000e+00, !dbg !35 + %138 = tail call float @llvm.nvvm.div.full(float %56, float %136), !dbg !36 + %139 = select i1 %137, float 0.000000e+00, float %138, !dbg !37 + %140 = fmul float %139, %135, !dbg !38 + %141 = fadd float %130, %140, !dbg !39 + %142 = fmul float %135, %135, !dbg !40 + %143 = fmul float %125, %142, !dbg !41 + %144 = fmul float %139, %143, !dbg !42 + %145 = fadd float %134, %144, !dbg !43 + %146 = fsub float %86, %141, !dbg !28 + %147 = select i1 %12, float 6.000000e+00, float 0.000000e+00, !dbg !34 + %148 = fcmp oeq float %147, 0.000000e+00, !dbg !35 + %149 = tail call float @llvm.nvvm.div.full(float %56, float %147), !dbg !36 + %150 = select i1 %148, float 0.000000e+00, float %149, !dbg !37 + %151 = fmul float %150, %146, !dbg !38 + %152 = fadd float %141, %151, !dbg !39 + %153 = fmul float %146, %146, !dbg !40 + %154 = fmul float %136, %153, !dbg !41 + %155 = fmul float %150, %154, !dbg !42 + %156 = fadd float %145, %155, !dbg !43 + %157 = fsub float %94, %152, !dbg !28 + %158 = select i1 %12, float 7.000000e+00, float 0.000000e+00, !dbg !34 + %159 = fcmp oeq float %158, 0.000000e+00, !dbg !35 + %160 = tail call float @llvm.nvvm.div.full(float %56, float %158), !dbg !36 + %161 = select i1 %159, float 0.000000e+00, float %160, !dbg !37 + %162 = fmul float %161, %157, !dbg !38 + %163 = fadd float %152, %162, !dbg !39 + %164 = fmul float %157, %157, !dbg !40 + %165 = fmul float %147, %164, !dbg !41 + %166 = fmul float %161, %165, !dbg !42 + %167 = fadd float %156, %166, !dbg !43 + %168 = fsub float %96, %163, !dbg !28 + %169 = select i1 %12, float 8.000000e+00, float 0.000000e+00, !dbg !34 + %170 = fcmp oeq float %169, 0.000000e+00, !dbg !35 + %171 = tail call float @llvm.nvvm.div.full(float %56, float %169), !dbg !36 + %172 = select i1 %170, float 0.000000e+00, float %171, !dbg !37 + %173 = fmul float %172, %168, !dbg !38 + %174 = fadd float %163, %173, !dbg !39 + %175 = fmul float %168, %168, !dbg !40 + %176 = fmul float %158, %175, !dbg !41 + %177 = fmul float %172, %176, !dbg !42 + %178 = fadd float %167, %177, !dbg !43 + %179 = bitcast float %174 to i32, !dbg !31 + %180 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %179, i32 16, i32 31), !dbg !31 + %181 = bitcast i32 %180 to float, !dbg !31 + %182 = bitcast float %178 to i32, !dbg !31 + %183 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %182, i32 16, i32 31), !dbg !31 + %184 = bitcast i32 %183 to float, !dbg !31 + %185 = bitcast float %169 to i32, !dbg !31 + %186 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %185, i32 16, i32 31), !dbg !31 + %187 = bitcast i32 %186 to float, !dbg !31 + %188 = fsub float %181, %174, !dbg !28 + %189 = fadd float %169, %187, !dbg !34 + %190 = fcmp oeq float %189, 0.000000e+00, !dbg !35 + %191 = tail call float @llvm.nvvm.div.full(float %187, float %189), !dbg !36 + %192 = select i1 %190, float 0.000000e+00, float %191, !dbg !37 + %193 = fmul float %192, %188, !dbg !38 + %194 = fadd float %174, %193, !dbg !39 + %195 = fadd float %178, %184, !dbg !44 + %196 = fmul float %188, %188, !dbg !40 + %197 = fmul float %169, %196, !dbg !41 + %198 = fmul float %192, %197, !dbg !42 + %199 = fadd float %195, %198, !dbg !43 + %200 = bitcast float %194 to i32, !dbg !31 + %201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 8, i32 31), !dbg !31 + %202 = bitcast i32 %201 to float, !dbg !31 + %203 = bitcast float %199 to i32, !dbg !31 + %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 8, i32 31), !dbg !31 + %205 = bitcast i32 %204 to float, !dbg !31 + %206 = bitcast float %189 to i32, !dbg !31 + %207 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %206, i32 8, i32 31), !dbg !31 + %208 = bitcast i32 %207 to float, !dbg !31 + %209 = fsub float %202, %194, !dbg !28 + %210 = fadd float %189, %208, !dbg !34 + %211 = fcmp oeq float %210, 0.000000e+00, !dbg !35 + %212 = tail call float @llvm.nvvm.div.full(float %208, float %210), !dbg !36 + %213 = select i1 %211, float 0.000000e+00, float %212, !dbg !37 + %214 = fmul float %213, %209, !dbg !38 + %215 = fadd float %194, %214, !dbg !39 + %216 = fadd float %199, %205, !dbg !44 + %217 = fmul float %209, %209, !dbg !40 + %218 = fmul float %189, %217, !dbg !41 + %219 = fmul float %213, %218, !dbg !42 + %220 = fadd float %216, %219, !dbg !43 + %221 = bitcast float %215 to i32, !dbg !31 + %222 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %221, i32 4, i32 31), !dbg !31 + %223 = bitcast i32 %222 to float, !dbg !31 + %224 = bitcast float %220 to i32, !dbg !31 + %225 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %224, i32 4, i32 31), !dbg !31 + %226 = bitcast i32 %225 to float, !dbg !31 + %227 = bitcast float %210 to i32, !dbg !31 + %228 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %227, i32 4, i32 31), !dbg !31 + %229 = bitcast i32 %228 to float, !dbg !31 + %230 = fsub float %223, %215, !dbg !28 + %231 = fadd float %210, %229, !dbg !34 + %232 = fcmp oeq float %231, 0.000000e+00, !dbg !35 + %233 = tail call float @llvm.nvvm.div.full(float %229, float %231), !dbg !36 + %234 = select i1 %232, float 0.000000e+00, float %233, !dbg !37 + %235 = fmul float %234, %230, !dbg !38 + %236 = fadd float %215, %235, !dbg !39 + %237 = fadd float %220, %226, !dbg !44 + %238 = fmul float %230, %230, !dbg !40 + %239 = fmul float %210, %238, !dbg !41 + %240 = fmul float %234, %239, !dbg !42 + %241 = fadd float %237, %240, !dbg !43 + %242 = bitcast float %236 to i32, !dbg !31 + %243 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %242, i32 2, i32 31), !dbg !31 + %244 = bitcast i32 %243 to float, !dbg !31 + %245 = bitcast float %241 to i32, !dbg !31 + %246 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %245, i32 2, i32 31), !dbg !31 + %247 = bitcast i32 %246 to float, !dbg !31 + %248 = bitcast float %231 to i32, !dbg !31 + %249 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %248, i32 2, i32 31), !dbg !31 + %250 = bitcast i32 %249 to float, !dbg !31 + %251 = fsub float %244, %236, !dbg !28 + %252 = fadd float %231, %250, !dbg !34 + %253 = fcmp oeq float %252, 0.000000e+00, !dbg !35 + %254 = tail call float @llvm.nvvm.div.full(float %250, float %252), !dbg !36 + %255 = select i1 %253, float 0.000000e+00, float %254, !dbg !37 + %256 = fmul float %255, %251, !dbg !38 + %257 = fadd float %236, %256, !dbg !39 + %258 = fadd float %241, %247, !dbg !44 + %259 = fmul float %251, %251, !dbg !40 + %260 = fmul float %231, %259, !dbg !41 + %261 = fmul float %255, %260, !dbg !42 + %262 = fadd float %258, %261, !dbg !43 + %263 = bitcast float %257 to i32, !dbg !31 + %264 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %263, i32 1, i32 31), !dbg !31 + %265 = bitcast i32 %264 to float, !dbg !31 + %266 = bitcast float %262 to i32, !dbg !31 + %267 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %266, i32 1, i32 31), !dbg !31 + %268 = bitcast i32 %267 to float, !dbg !31 + %269 = bitcast float %252 to i32, !dbg !31 + %270 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %269, i32 1, i32 31), !dbg !31 + %271 = bitcast i32 %270 to float, !dbg !31 + %272 = fsub float %265, %257, !dbg !28 + %273 = fadd float %252, %271, !dbg !34 + %274 = fcmp oeq float %273, 0.000000e+00, !dbg !35 + %275 = tail call float @llvm.nvvm.div.full(float %271, float %273), !dbg !36 + %276 = select i1 %274, float 0.000000e+00, float %275, !dbg !37 + %277 = fmul float %276, %272, !dbg !38 + %278 = fadd float %257, %277, !dbg !39 + %279 = fadd float %262, %268, !dbg !44 + %280 = fmul float %272, %272, !dbg !40 + %281 = fmul float %252, %280, !dbg !41 + %282 = fmul float %276, %281, !dbg !42 + %283 = fadd float %279, %282, !dbg !43 + %284 = icmp eq i32 %15, 0, !dbg !31 + %285 = getelementptr float, ptr addrspace(3) @global_smem, i32 %16, !dbg !31 + %286 = bitcast float %278 to <1 x i32>, !dbg !31 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %285, <1 x i32> %286, i1 %284) #6, !dbg !31 + %287 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %16, !dbg !31 + %288 = bitcast float %283 to <1 x i32>, !dbg !31 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %287, <1 x i32> %288, i1 %284) #6, !dbg !31 + %289 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %16, !dbg !31 + %290 = bitcast float %273 to <1 x i32>, !dbg !31 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %289, <1 x i32> %290, i1 %284) #6, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !31 + %291 = icmp samesign ult i32 %14, 16, !dbg !31 + %292 = getelementptr float, ptr addrspace(3) @global_smem, i32 %14, !dbg !31 + %293 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %292, i1 %291) #6, !dbg !31 + %294 = bitcast i32 %293 to float, !dbg !31 + %295 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %14, !dbg !31 + %296 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %295, i1 %291) #6, !dbg !31 + %297 = bitcast i32 %296 to float, !dbg !31 + %298 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %14, !dbg !31 + %299 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %298, i1 %291) #6, !dbg !31 + %300 = bitcast i32 %299 to float, !dbg !31 + %301 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %293, i32 8, i32 31), !dbg !31 + %302 = bitcast i32 %301 to float, !dbg !31 + %303 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %296, i32 8, i32 31), !dbg !31 + %304 = bitcast i32 %303 to float, !dbg !31 + %305 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %299, i32 8, i32 31), !dbg !31 + %306 = bitcast i32 %305 to float, !dbg !31 + %307 = fsub float %302, %294, !dbg !28 + %308 = fadd float %300, %306, !dbg !34 + %309 = fcmp oeq float %308, 0.000000e+00, !dbg !35 + %310 = tail call float @llvm.nvvm.div.full(float %306, float %308), !dbg !36 + %311 = select i1 %309, float 0.000000e+00, float %310, !dbg !37 + %312 = fmul float %307, %311, !dbg !38 + %313 = fadd float %312, %294, !dbg !39 + %314 = fadd float %297, %304, !dbg !44 + %315 = fmul float %307, %307, !dbg !40 + %316 = fmul float %315, %300, !dbg !41 + %317 = fmul float %316, %311, !dbg !42 + %318 = fadd float %314, %317, !dbg !43 + %319 = bitcast float %313 to i32, !dbg !31 + %320 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %319, i32 4, i32 31), !dbg !31 + %321 = bitcast i32 %320 to float, !dbg !31 + %322 = bitcast float %318 to i32, !dbg !31 + %323 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %322, i32 4, i32 31), !dbg !31 + %324 = bitcast i32 %323 to float, !dbg !31 + %325 = bitcast float %308 to i32, !dbg !31 + %326 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %325, i32 4, i32 31), !dbg !31 + %327 = bitcast i32 %326 to float, !dbg !31 + %328 = fsub float %321, %313, !dbg !28 + %329 = fadd float %308, %327, !dbg !34 + %330 = fcmp oeq float %329, 0.000000e+00, !dbg !35 + %331 = tail call float @llvm.nvvm.div.full(float %327, float %329), !dbg !36 + %332 = select i1 %330, float 0.000000e+00, float %331, !dbg !37 + %333 = fmul float %328, %332, !dbg !38 + %334 = fadd float %313, %333, !dbg !39 + %335 = fadd float %318, %324, !dbg !44 + %336 = fmul float %328, %328, !dbg !40 + %337 = fmul float %308, %336, !dbg !41 + %338 = fmul float %332, %337, !dbg !42 + %339 = fadd float %335, %338, !dbg !43 + %340 = bitcast float %334 to i32, !dbg !31 + %341 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %340, i32 2, i32 31), !dbg !31 + %342 = bitcast i32 %341 to float, !dbg !31 + %343 = bitcast float %339 to i32, !dbg !31 + %344 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %343, i32 2, i32 31), !dbg !31 + %345 = bitcast i32 %344 to float, !dbg !31 + %346 = bitcast float %329 to i32, !dbg !31 + %347 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %346, i32 2, i32 31), !dbg !31 + %348 = bitcast i32 %347 to float, !dbg !31 + %349 = fsub float %342, %334, !dbg !28 + %350 = fadd float %329, %348, !dbg !34 + %351 = fcmp oeq float %350, 0.000000e+00, !dbg !35 + %352 = tail call float @llvm.nvvm.div.full(float %348, float %350), !dbg !36 + %353 = select i1 %351, float 0.000000e+00, float %352, !dbg !37 + %354 = fmul float %349, %353, !dbg !38 + %355 = fadd float %334, %354, !dbg !39 + %356 = fadd float %339, %345, !dbg !44 + %357 = fmul float %349, %349, !dbg !40 + %358 = fmul float %329, %357, !dbg !41 + %359 = fmul float %353, %358, !dbg !42 + %360 = fadd float %356, %359, !dbg !43 + %361 = bitcast float %355 to i32, !dbg !31 + %362 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %361, i32 1, i32 31), !dbg !31 + %363 = bitcast i32 %362 to float, !dbg !31 + %364 = bitcast float %360 to i32, !dbg !31 + %365 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %364, i32 1, i32 31), !dbg !31 + %366 = bitcast i32 %365 to float, !dbg !31 + %367 = bitcast float %350 to i32, !dbg !31 + %368 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %367, i32 1, i32 31), !dbg !31 + %369 = bitcast i32 %368 to float, !dbg !31 + %370 = fsub float %363, %355, !dbg !28 + %371 = fadd float %350, %369, !dbg !34 + %372 = fcmp oeq float %371, 0.000000e+00, !dbg !35 + %373 = tail call float @llvm.nvvm.div.full(float %369, float %371), !dbg !36 + %374 = select i1 %372, float 0.000000e+00, float %373, !dbg !37 + %375 = fmul float %370, %374, !dbg !38 + %376 = fadd float %355, %375, !dbg !39 + %377 = fadd float %360, %366, !dbg !44 + %378 = fmul float %370, %370, !dbg !40 + %379 = fmul float %350, %378, !dbg !41 + %380 = fmul float %374, %379, !dbg !42 + %381 = fadd float %377, %380, !dbg !43 + %382 = and i32 %13, 15, !dbg !31 + %383 = icmp eq i32 %382, 0, !dbg !31 + %384 = and i1 %291, %383, !dbg !31 + %385 = bitcast float %376 to <1 x i32>, !dbg !31 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %292, <1 x i32> %385, i1 %384) #6, !dbg !31 + %386 = bitcast float %381 to <1 x i32>, !dbg !31 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %295, <1 x i32> %386, i1 %384) #6, !dbg !31 + %387 = bitcast float %371 to <1 x i32>, !dbg !31 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %298, <1 x i32> %387, i1 %384) #6, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !31 + %388 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !31 + %389 = load float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !31 + %390 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !45 + %391 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %57, i64 %390, i1 %12) #6, !dbg !45 + %392 = getelementptr bfloat, ptr addrspace(1) %3, i64 %33, !dbg !46 + %393 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !47 + %394 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %392, i64 %393, i1 true) #6, !dbg !47 + %395 = getelementptr bfloat, ptr addrspace(1) %4, i64 %33, !dbg !48 + %396 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !49 + %397 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %395, i64 %396, i1 true) #6, !dbg !49 + %398 = tail call float @llvm.nvvm.div.full(float %389, float 4.096000e+03), !dbg !50 + %399 = fadd float %398, 0x3EB0C6F7A0000000, !dbg !51 + %400 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %401 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %402 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %403 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %404 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %405 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %406 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %407 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i19 = icmp eq i32 %407, 0, !dbg !52 + br i1 %.not.i19, label %410, label %408, !dbg !52 + +408: ; preds = %__nv_rsqrtf.exit + %409 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %399), !dbg !52 + br label %__nv_rsqrtf.exit21, !dbg !52 + +410: ; preds = %__nv_rsqrtf.exit + %411 = tail call float @llvm.nvvm.rsqrt.approx.f(float %399), !dbg !52 + br label %__nv_rsqrtf.exit21, !dbg !52 + +__nv_rsqrtf.exit21: ; preds = %408, %410 + %.0.i20 = phi float [ %409, %408 ], [ %411, %410 ], !dbg !52 + %412 = extractvalue { i32, i32, i32, i32 } %391, 3, !dbg !45 + %413 = bitcast i32 %412 to <2 x bfloat>, !dbg !45 + %414 = extractvalue { i32, i32, i32, i32 } %391, 2, !dbg !45 + %415 = bitcast i32 %414 to <2 x bfloat>, !dbg !45 + %416 = extractvalue { i32, i32, i32, i32 } %391, 1, !dbg !45 + %417 = bitcast i32 %416 to <2 x bfloat>, !dbg !45 + %418 = extractvalue { i32, i32, i32, i32 } %391, 0, !dbg !45 + %419 = bitcast i32 %418 to <2 x bfloat>, !dbg !45 + %420 = extractvalue { i32, i32, i32, i32 } %397, 3, !dbg !49 + %421 = bitcast i32 %420 to <2 x bfloat>, !dbg !49 + %422 = extractvalue { i32, i32, i32, i32 } %397, 2, !dbg !49 + %423 = bitcast i32 %422 to <2 x bfloat>, !dbg !49 + %424 = extractvalue { i32, i32, i32, i32 } %397, 1, !dbg !49 + %425 = bitcast i32 %424 to <2 x bfloat>, !dbg !49 + %426 = extractvalue { i32, i32, i32, i32 } %397, 0, !dbg !49 + %427 = bitcast i32 %426 to <2 x bfloat>, !dbg !49 + %428 = extractvalue { i32, i32, i32, i32 } %394, 3, !dbg !47 + %429 = bitcast i32 %428 to <2 x bfloat>, !dbg !47 + %430 = extractvalue { i32, i32, i32, i32 } %394, 2, !dbg !47 + %431 = bitcast i32 %430 to <2 x bfloat>, !dbg !47 + %432 = extractvalue { i32, i32, i32, i32 } %394, 1, !dbg !47 + %433 = bitcast i32 %432 to <2 x bfloat>, !dbg !47 + %434 = extractvalue { i32, i32, i32, i32 } %394, 0, !dbg !47 + %435 = bitcast i32 %434 to <2 x bfloat>, !dbg !47 + %436 = getelementptr bfloat, ptr addrspace(1) %6, i64 %21, !dbg !53 + %437 = fpext <2 x bfloat> %419 to <2 x float>, !dbg !54 + %438 = insertelement <2 x float> poison, float %388, i64 0, !dbg !55 + %439 = shufflevector <2 x float> %438, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !55 + %440 = fsub <2 x float> %437, %439, !dbg !55 + %441 = fpext <2 x bfloat> %427 to <2 x float>, !dbg !56 + %442 = fpext <2 x bfloat> %435 to <2 x float>, !dbg !57 + %443 = insertelement <2 x float> poison, float %.0.i20, i64 0, !dbg !58 + %444 = shufflevector <2 x float> %443, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !58 + %445 = fmul <2 x float> %440, %444, !dbg !58 + %446 = fadd <2 x float> %442, splat (float 1.000000e+00), !dbg !59 + %447 = fmul <2 x float> %446, %445, !dbg !60 + %448 = fadd <2 x float> %447, %441, !dbg !61 + %449 = fptrunc <2 x float> %448 to <2 x bfloat>, !dbg !62 + %450 = fpext <2 x bfloat> %417 to <2 x float>, !dbg !54 + %451 = fsub <2 x float> %450, %439, !dbg !55 + %452 = fpext <2 x bfloat> %425 to <2 x float>, !dbg !56 + %453 = fpext <2 x bfloat> %433 to <2 x float>, !dbg !57 + %454 = fmul <2 x float> %451, %444, !dbg !58 + %455 = fadd <2 x float> %453, splat (float 1.000000e+00), !dbg !59 + %456 = fmul <2 x float> %455, %454, !dbg !60 + %457 = fadd <2 x float> %456, %452, !dbg !61 + %458 = fptrunc <2 x float> %457 to <2 x bfloat>, !dbg !62 + %459 = fpext <2 x bfloat> %415 to <2 x float>, !dbg !54 + %460 = fsub <2 x float> %459, %439, !dbg !55 + %461 = fpext <2 x bfloat> %423 to <2 x float>, !dbg !56 + %462 = fpext <2 x bfloat> %431 to <2 x float>, !dbg !57 + %463 = fmul <2 x float> %460, %444, !dbg !58 + %464 = fadd <2 x float> %462, splat (float 1.000000e+00), !dbg !59 + %465 = fmul <2 x float> %464, %463, !dbg !60 + %466 = fadd <2 x float> %465, %461, !dbg !61 + %467 = fptrunc <2 x float> %466 to <2 x bfloat>, !dbg !62 + %468 = fpext <2 x bfloat> %413 to <2 x float>, !dbg !54 + %469 = fsub <2 x float> %468, %439, !dbg !55 + %470 = fpext <2 x bfloat> %421 to <2 x float>, !dbg !56 + %471 = fpext <2 x bfloat> %429 to <2 x float>, !dbg !57 + %472 = fmul <2 x float> %469, %444, !dbg !58 + %473 = fadd <2 x float> %471, splat (float 1.000000e+00), !dbg !59 + %474 = fmul <2 x float> %473, %472, !dbg !60 + %475 = fadd <2 x float> %474, %470, !dbg !61 + %476 = fptrunc <2 x float> %475 to <2 x bfloat>, !dbg !62 + %477 = bitcast <2 x bfloat> %449 to i32, !dbg !62 + %478 = bitcast <2 x bfloat> %458 to i32, !dbg !62 + %479 = bitcast <2 x bfloat> %467 to i32, !dbg !62 + %480 = bitcast <2 x bfloat> %476 to i32, !dbg !62 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %477, i32 %478, i32 %479, i32 %480, ptr addrspace(1) %436, i1 %12) #6, !dbg !62 + ret void, !dbg !63 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused_add_mul_native_layer_norm_0", linkageName: "triton_red_fused_add_mul_native_layer_norm_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 25, column: 21, scope: !5) +!10 = !DILocation(line: 26, column: 37, scope: !5) +!11 = !DILocation(line: 38, column: 46, scope: !5) +!12 = !DILocation(line: 38, column: 41, scope: !5) +!13 = !DILocation(line: 38, column: 34, scope: !5) +!14 = !DILocation(line: 38, column: 51, scope: !5) +!15 = !DILocation(line: 39, column: 34, scope: !5) +!16 = !DILocation(line: 39, column: 41, scope: !5) +!17 = !DILocation(line: 40, column: 34, scope: !5) +!18 = !DILocation(line: 40, column: 51, scope: !5) +!19 = !DILocation(line: 50, column: 66, scope: !5) +!20 = !DILocation(line: 51, column: 29, scope: !5) +!21 = !DILocation(line: 38, column: 113, scope: !5) +!22 = !DILocation(line: 39, column: 94, scope: !5) +!23 = !DILocation(line: 40, column: 113, scope: !5) +!24 = !DILocation(line: 41, column: 22, scope: !5) +!25 = !DILocation(line: 42, column: 22, scope: !5) +!26 = !DILocation(line: 48, column: 62, scope: !5) +!27 = !DILocation(line: 51, column: 52, scope: !5) +!28 = !DILocation(line: 231, column: 21, scope: !29, inlinedAt: !31) +!29 = distinct !DILexicalBlockFile(scope: !5, file: !30, discriminator: 0) +!30 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime") +!31 = !DILocation(line: 243, column: 46, scope: !29, inlinedAt: !32) +!32 = !DILocation(line: 52, column: 80, scope: !33) +!33 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!34 = !DILocation(line: 232, column: 28, scope: !29, inlinedAt: !31) +!35 = !DILocation(line: 233, column: 39, scope: !29, inlinedAt: !31) +!36 = !DILocation(line: 233, column: 60, scope: !29, inlinedAt: !31) +!37 = !DILocation(line: 233, column: 49, scope: !29, inlinedAt: !31) +!38 = !DILocation(line: 235, column: 25, scope: !29, inlinedAt: !31) +!39 = !DILocation(line: 235, column: 17, scope: !29, inlinedAt: !31) +!40 = !DILocation(line: 236, column: 30, scope: !29, inlinedAt: !31) +!41 = !DILocation(line: 236, column: 38, scope: !29, inlinedAt: !31) +!42 = !DILocation(line: 236, column: 49, scope: !29, inlinedAt: !31) +!43 = !DILocation(line: 236, column: 22, scope: !29, inlinedAt: !31) +!44 = !DILocation(line: 236, column: 15, scope: !29, inlinedAt: !31) +!45 = !DILocation(line: 62, column: 53, scope: !5) +!46 = !DILocation(line: 63, column: 35, scope: !5) +!47 = !DILocation(line: 63, column: 42, scope: !5) +!48 = !DILocation(line: 64, column: 35, scope: !5) +!49 = !DILocation(line: 64, column: 42, scope: !5) +!50 = !DILocation(line: 68, column: 25, scope: !5) +!51 = !DILocation(line: 70, column: 24, scope: !5) +!52 = !DILocation(line: 71, column: 32, scope: !5) +!53 = !DILocation(line: 78, column: 29, scope: !5) +!54 = !DILocation(line: 62, column: 115, scope: !5) +!55 = !DILocation(line: 66, column: 24, scope: !5) +!56 = !DILocation(line: 64, column: 95, scope: !5) +!57 = !DILocation(line: 63, column: 95, scope: !5) +!58 = !DILocation(line: 72, column: 24, scope: !5) +!59 = !DILocation(line: 75, column: 24, scope: !5) +!60 = !DILocation(line: 76, column: 24, scope: !5) +!61 = !DILocation(line: 77, column: 24, scope: !5) +!62 = !DILocation(line: 78, column: 53, scope: !5) +!63 = !DILocation(line: 56, column: 4, scope: !5) diff --git a/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.ptx b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..80f0c611b683fbf5bbc6bba4582956e310c5c76c --- /dev/null +++ b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.ptx @@ -0,0 +1,1129 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused_add_mul_native_layer_norm_0 // -- Begin function triton_red_fused_add_mul_native_layer_norm_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused_add_mul_native_layer_norm_0 +.visible .entry triton_red_fused_add_mul_native_layer_norm_0( + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_6, + .param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_7, + .param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_8, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_9, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_10 +) +.reqntid 512 +{ + .reg .pred %p<23>; + .reg .b16 %rs<49>; + .reg .b32 %r<323>; + .reg .b64 %rd<23>; + .loc 1 18 0 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:18:0 + +// %bb.0: // %__nv_rsqrtf.exit + ld.param.b64 %rd14, [triton_red_fused_add_mul_native_layer_norm_0_param_0]; + ld.param.b64 %rd15, [triton_red_fused_add_mul_native_layer_norm_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:23:28 + mov.u32 %r49, %ctaid.x; + .loc 1 25 21 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:25:21 + setp.lt.u32 %p1, %r49, 2048; + ld.param.b64 %rd16, [triton_red_fused_add_mul_native_layer_norm_0_param_2]; + ld.param.b64 %rd17, [triton_red_fused_add_mul_native_layer_norm_0_param_3]; + .loc 1 26 37 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:26:37 + mov.u32 %r50, %tid.x; + and.b32 %r51, %r50, 511; + ld.param.b64 %rd18, [triton_red_fused_add_mul_native_layer_norm_0_param_4]; + and.b32 %r52, %r50, 31; + ld.param.b64 %rd19, [triton_red_fused_add_mul_native_layer_norm_0_param_5]; + ld.param.b64 %rd20, [triton_red_fused_add_mul_native_layer_norm_0_param_6]; + shl.b32 %r53, %r50, 3; + and.b32 %r54, %r53, 4088; + .loc 1 38 46 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:46 + shl.b32 %r55, %r49, 12; + .loc 1 38 41 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:41 + or.b32 %r56, %r54, %r55; + .loc 1 38 34 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:34 + mul.wide.s32 %rd21, %r56, 2; + add.s64 %rd1, %rd14, %rd21; + .loc 1 38 51 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:51 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r5, 0; + // begin inline asm + mov.u32 %r1, %r5; + mov.u32 %r2, %r5; + mov.u32 %r3, %r5; + mov.u32 %r4, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2; + // end inline asm + .loc 1 39 34 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:34 + mul.wide.u32 %rd22, %r54, 2; + add.s64 %rd3, %rd15, %rd22; + .loc 1 39 41 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:41 + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0; + // end inline asm + mov.pred %p2, -1; + // begin inline asm + mov.u32 %r6, %r5; + mov.u32 %r7, %r5; + mov.u32 %r8, %r5; + mov.u32 %r9, %r5; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd3 + 0 ], %rd4; + // end inline asm + .loc 1 40 34 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:34 + add.s64 %rd5, %rd16, %rd21; + .loc 1 40 51 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:51 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r10, %r5; + mov.u32 %r11, %r5; + mov.u32 %r12, %r5; + mov.u32 %r13, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd5 + 0 ], %rd6; + // end inline asm + .loc 1 50 66 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:50:66 + selp.f32 %r57, 0f3F800000, 0f00000000, %p1; + .loc 1 51 29 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:51:29 + add.s64 %rd7, %rd19, %rd21; + .loc 1 38 113 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:113 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r58, %rs1; + cvt.f32.bf16 %r59, %rs2; + .loc 1 39 94 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:94 + mov.b32 {%rs3, %rs4}, %r6; + cvt.f32.bf16 %r60, %rs3; + cvt.f32.bf16 %r61, %rs4; + .loc 1 40 113 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:113 + mov.b32 {%rs5, %rs6}, %r10; + cvt.f32.bf16 %r62, %rs5; + cvt.f32.bf16 %r63, %rs6; + .loc 1 42 22 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:42:22 + fma.rn.f32 %r64, %r61, %r63, %r59; + fma.rn.f32 %r65, %r60, %r62, %r58; + .loc 1 48 62 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:48:62 + selp.f32 %r66, %r65, 0f00000000, %p1; + selp.f32 %r67, %r64, 0f00000000, %p1; + .loc 1 51 52 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:51:52 + cvt.rn.bf16x2.f32 %r14, %r64, %r65; + .loc 1 38 113 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:113 + mov.b32 {%rs7, %rs8}, %r2; + cvt.f32.bf16 %r68, %rs7; + cvt.f32.bf16 %r69, %rs8; + .loc 1 39 94 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:94 + mov.b32 {%rs9, %rs10}, %r7; + cvt.f32.bf16 %r70, %rs9; + cvt.f32.bf16 %r71, %rs10; + .loc 1 40 113 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:113 + mov.b32 {%rs11, %rs12}, %r11; + cvt.f32.bf16 %r72, %rs11; + cvt.f32.bf16 %r73, %rs12; + .loc 1 42 22 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:42:22 + fma.rn.f32 %r74, %r71, %r73, %r69; + fma.rn.f32 %r75, %r70, %r72, %r68; + .loc 1 48 62 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:48:62 + selp.f32 %r76, %r75, 0f00000000, %p1; + selp.f32 %r77, %r74, 0f00000000, %p1; + .loc 1 51 52 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:51:52 + cvt.rn.bf16x2.f32 %r15, %r74, %r75; + .loc 1 38 113 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:113 + mov.b32 {%rs13, %rs14}, %r3; + cvt.f32.bf16 %r78, %rs13; + cvt.f32.bf16 %r79, %rs14; + .loc 1 39 94 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:94 + mov.b32 {%rs15, %rs16}, %r8; + cvt.f32.bf16 %r80, %rs15; + cvt.f32.bf16 %r81, %rs16; + .loc 1 40 113 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:113 + mov.b32 {%rs17, %rs18}, %r12; + cvt.f32.bf16 %r82, %rs17; + cvt.f32.bf16 %r83, %rs18; + .loc 1 42 22 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:42:22 + fma.rn.f32 %r84, %r81, %r83, %r79; + fma.rn.f32 %r85, %r80, %r82, %r78; + .loc 1 48 62 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:48:62 + selp.f32 %r86, %r85, 0f00000000, %p1; + selp.f32 %r87, %r84, 0f00000000, %p1; + .loc 1 51 52 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:51:52 + cvt.rn.bf16x2.f32 %r16, %r84, %r85; + .loc 1 38 113 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:113 + mov.b32 {%rs19, %rs20}, %r4; + cvt.f32.bf16 %r88, %rs19; + cvt.f32.bf16 %r89, %rs20; + .loc 1 39 94 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:94 + mov.b32 {%rs21, %rs22}, %r9; + cvt.f32.bf16 %r90, %rs21; + cvt.f32.bf16 %r91, %rs22; + .loc 1 40 113 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:113 + mov.b32 {%rs23, %rs24}, %r13; + cvt.f32.bf16 %r92, %rs23; + cvt.f32.bf16 %r93, %rs24; + .loc 1 42 22 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:42:22 + fma.rn.f32 %r94, %r91, %r93, %r89; + fma.rn.f32 %r95, %r90, %r92, %r88; + .loc 1 48 62 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:48:62 + selp.f32 %r96, %r95, 0f00000000, %p1; + selp.f32 %r97, %r94, 0f00000000, %p1; + .loc 1 51 52 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:51:52 + cvt.rn.bf16x2.f32 %r17, %r94, %r95; + // begin inline asm + @%p1 st.global.v4.b32 [ %rd7 + 0 ], { %r14, %r15, %r16, %r17 }; + // end inline asm +$L__tmp1: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r98, %r67, %r66; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r99, 0f40000000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p6, %r99, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r100, %r57, %r99; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r101, 0f00000000, %r100, %p6; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r102, %r101, %r98, %r66; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r103, %r98, %r98; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r104, %r57, %r103; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r105, %r101, %r104, 0f00000000; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r106, %r76, %r102; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r107, 0f40400000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p7, %r107, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r108, %r57, %r107; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r109, 0f00000000, %r108, %p7; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r110, %r109, %r106, %r102; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r111, %r106, %r106; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r112, %r99, %r111; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r113, %r109, %r112, %r105; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r114, %r77, %r110; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r115, 0f40800000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p8, %r115, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r116, %r57, %r115; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r117, 0f00000000, %r116, %p8; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r118, %r117, %r114, %r110; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r119, %r114, %r114; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r120, %r107, %r119; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r121, %r117, %r120, %r113; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r122, %r86, %r118; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r123, 0f40A00000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p9, %r123, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r124, %r57, %r123; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r125, 0f00000000, %r124, %p9; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r126, %r125, %r122, %r118; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r127, %r122, %r122; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r128, %r115, %r127; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r129, %r125, %r128, %r121; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r130, %r87, %r126; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r131, 0f40C00000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p10, %r131, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r132, %r57, %r131; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r133, 0f00000000, %r132, %p10; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r134, %r133, %r130, %r126; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r135, %r130, %r130; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r136, %r123, %r135; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r137, %r133, %r136, %r129; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r138, %r96, %r134; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r139, 0f40E00000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p11, %r139, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r140, %r57, %r139; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r141, 0f00000000, %r140, %p11; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r142, %r141, %r138, %r134; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r143, %r138, %r138; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r144, %r131, %r143; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r145, %r141, %r144, %r137; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r146, %r97, %r142; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r147, 0f41000000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p12, %r147, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r148, %r57, %r147; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r149, 0f00000000, %r148, %p12; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r150, %r149, %r146, %r142; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r151, %r146, %r146; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r152, %r139, %r151; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r153, %r149, %r152, %r145; +$L__tmp2: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] + shfl.sync.bfly.b32 %r154, %r150, 16, 31, -1; + shfl.sync.bfly.b32 %r155, %r153, 16, 31, -1; + shfl.sync.bfly.b32 %r156, %r147, 16, 31, -1; +$L__tmp3: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r157, %r154, %r150; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r158, %r147, %r156; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p13, %r158, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r159, %r156, %r158; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r160, 0f00000000, %r159, %p13; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r161, %r160, %r157, %r150; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r162, %r153, %r155; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r163, %r157, %r157; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r164, %r147, %r163; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r165, %r160, %r164, %r162; +$L__tmp4: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] + shfl.sync.bfly.b32 %r166, %r161, 8, 31, -1; + shfl.sync.bfly.b32 %r167, %r165, 8, 31, -1; + shfl.sync.bfly.b32 %r168, %r158, 8, 31, -1; +$L__tmp5: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r169, %r166, %r161; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r170, %r158, %r168; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p14, %r170, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r171, %r168, %r170; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r172, 0f00000000, %r171, %p14; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r173, %r172, %r169, %r161; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r174, %r165, %r167; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r175, %r169, %r169; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r176, %r158, %r175; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r177, %r172, %r176, %r174; +$L__tmp6: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] + shfl.sync.bfly.b32 %r178, %r173, 4, 31, -1; + shfl.sync.bfly.b32 %r179, %r177, 4, 31, -1; + shfl.sync.bfly.b32 %r180, %r170, 4, 31, -1; +$L__tmp7: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r181, %r178, %r173; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r182, %r170, %r180; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p15, %r182, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r183, %r180, %r182; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r184, 0f00000000, %r183, %p15; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r185, %r184, %r181, %r173; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r186, %r177, %r179; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r187, %r181, %r181; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r188, %r170, %r187; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r189, %r184, %r188, %r186; +$L__tmp8: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] + shfl.sync.bfly.b32 %r190, %r185, 2, 31, -1; + shfl.sync.bfly.b32 %r191, %r189, 2, 31, -1; + shfl.sync.bfly.b32 %r192, %r182, 2, 31, -1; +$L__tmp9: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r193, %r190, %r185; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r194, %r182, %r192; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p16, %r194, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r195, %r192, %r194; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r196, 0f00000000, %r195, %p16; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r197, %r196, %r193, %r185; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r198, %r189, %r191; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r199, %r193, %r193; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r200, %r182, %r199; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r201, %r196, %r200, %r198; +$L__tmp10: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] + shfl.sync.bfly.b32 %r202, %r197, 1, 31, -1; + shfl.sync.bfly.b32 %r203, %r201, 1, 31, -1; + shfl.sync.bfly.b32 %r204, %r194, 1, 31, -1; +$L__tmp11: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r205, %r202, %r197; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r23, %r194, %r204; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p17, %r23, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r206, %r204, %r23; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r207, 0f00000000, %r206, %p17; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r19, %r207, %r205, %r197; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r208, %r201, %r203; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r209, %r205, %r205; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r210, %r194, %r209; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r21, %r207, %r210, %r208; +$L__tmp12: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] + setp.eq.b32 %p3, %r52, 0; + shr.u32 %r211, %r50, 3; + and.b32 %r212, %r211, 60; + mov.b32 %r213, global_smem; + add.s32 %r18, %r213, %r212; + // begin inline asm + @%p3 st.shared.b32 [ %r18 + 0 ], %r19; + // end inline asm + add.s32 %r20, %r18, 64; + // begin inline asm + @%p3 st.shared.b32 [ %r20 + 0 ], %r21; + // end inline asm + add.s32 %r22, %r18, 128; + // begin inline asm + @%p3 st.shared.b32 [ %r22 + 0 ], %r23; + // end inline asm + bar.sync 0; + setp.lt.u32 %p4, %r51, 16; + shl.b32 %r214, %r51, 2; + add.s32 %r25, %r213, %r214; + // begin inline asm + @%p4 ld.shared.b32 %r24, [ %r25 + 0 ]; + // end inline asm + add.s32 %r27, %r25, 64; + // begin inline asm + @%p4 ld.shared.b32 %r26, [ %r27 + 0 ]; + // end inline asm + add.s32 %r29, %r25, 128; + // begin inline asm + @%p4 ld.shared.b32 %r28, [ %r29 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r215, %r24, 8, 31, -1; + shfl.sync.bfly.b32 %r216, %r26, 8, 31, -1; + shfl.sync.bfly.b32 %r217, %r28, 8, 31, -1; +$L__tmp13: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r218, %r215, %r24; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r219, %r28, %r217; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p18, %r219, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r220, %r217, %r219; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r221, 0f00000000, %r220, %p18; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r222, %r218, %r221, %r24; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r223, %r26, %r216; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r224, %r218, %r218; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r225, %r224, %r28; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r226, %r225, %r221, %r223; +$L__tmp14: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] + shfl.sync.bfly.b32 %r227, %r222, 4, 31, -1; + shfl.sync.bfly.b32 %r228, %r226, 4, 31, -1; + shfl.sync.bfly.b32 %r229, %r219, 4, 31, -1; +$L__tmp15: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r230, %r227, %r222; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r231, %r219, %r229; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p19, %r231, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r232, %r229, %r231; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r233, 0f00000000, %r232, %p19; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r234, %r230, %r233, %r222; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r235, %r226, %r228; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r236, %r230, %r230; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r237, %r219, %r236; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r238, %r233, %r237, %r235; +$L__tmp16: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] + shfl.sync.bfly.b32 %r239, %r234, 2, 31, -1; + shfl.sync.bfly.b32 %r240, %r238, 2, 31, -1; + shfl.sync.bfly.b32 %r241, %r231, 2, 31, -1; +$L__tmp17: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r242, %r239, %r234; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r243, %r231, %r241; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p20, %r243, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r244, %r241, %r243; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r245, 0f00000000, %r244, %p20; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r246, %r242, %r245, %r234; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r247, %r238, %r240; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r248, %r242, %r242; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r249, %r231, %r248; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r250, %r245, %r249, %r247; +$L__tmp18: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] + shfl.sync.bfly.b32 %r251, %r246, 1, 31, -1; + shfl.sync.bfly.b32 %r252, %r250, 1, 31, -1; + shfl.sync.bfly.b32 %r253, %r243, 1, 31, -1; +$L__tmp19: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + sub.f32 %r254, %r251, %r246; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r32, %r243, %r253; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + setp.eq.f32 %p21, %r32, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + div.full.f32 %r255, %r253, %r32; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + selp.f32 %r256, 0f00000000, %r255, %p21; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r30, %r254, %r256, %r246; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + add.f32 %r257, %r250, %r252; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r258, %r254, %r254; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + mul.f32 %r259, %r243, %r258; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ] + fma.rn.f32 %r31, %r256, %r259, %r257; +$L__tmp20: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] + and.b32 %r260, %r50, 15; + setp.eq.b32 %p22, %r260, 0; + and.pred %p5, %p4, %p22; + // begin inline asm + @%p5 st.shared.b32 [ %r25 + 0 ], %r30; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r27 + 0 ], %r31; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r29 + 0 ], %r32; + // end inline asm + bar.sync 0; + ld.shared.b32 %r261, [global_smem]; + ld.shared.b32 %r262, [global_smem+64]; +$L__tmp21: + .loc 1 62 53 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:62:53 + // begin inline asm + mov.u64 %rd8, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd8, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r33, %r5; + mov.u32 %r34, %r5; + mov.u32 %r35, %r5; + mov.u32 %r36, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd7 + 0 ], %rd8; + // end inline asm + .loc 1 63 35 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:35 + add.s64 %rd9, %rd17, %rd22; + .loc 1 63 42 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:42 + // begin inline asm + mov.u64 %rd10, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd10, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r37, %r5; + mov.u32 %r38, %r5; + mov.u32 %r39, %r5; + mov.u32 %r40, %r5; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r37, %r38, %r39, %r40 }, [ %rd9 + 0 ], %rd10; + // end inline asm + .loc 1 64 35 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:35 + add.s64 %rd11, %rd18, %rd22; + .loc 1 64 42 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:42 + // begin inline asm + mov.u64 %rd12, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd12, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r41, %r5; + mov.u32 %r42, %r5; + mov.u32 %r43, %r5; + mov.u32 %r44, %r5; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r41, %r42, %r43, %r44 }, [ %rd11 + 0 ], %rd12; + // end inline asm + mov.b32 %r263, 0f45800000; + .loc 1 68 25 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:68:25 + div.full.f32 %r264, %r262, %r263; + .loc 1 70 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:70:24 + add.f32 %r265, %r264, 0f358637BD; + .loc 1 71 32 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:71:32 + rsqrt.approx.ftz.f32 %r266, %r265; + .loc 1 78 29 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:78:29 + add.s64 %rd13, %rd20, %rd21; + .loc 1 62 115 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:62:115 + mov.b32 {%rs25, %rs26}, %r33; + cvt.f32.bf16 %r267, %rs26; + cvt.f32.bf16 %r268, %rs25; + .loc 1 66 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:66:24 + sub.f32 %r269, %r268, %r261; + sub.f32 %r270, %r267, %r261; + .loc 1 64 95 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:95 + mov.b32 {%rs27, %rs28}, %r41; + cvt.f32.bf16 %r271, %rs28; + cvt.f32.bf16 %r272, %rs27; + .loc 1 63 95 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:95 + mov.b32 {%rs29, %rs30}, %r37; + cvt.f32.bf16 %r273, %rs29; + cvt.f32.bf16 %r274, %rs30; + .loc 1 72 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:72:24 + mul.f32 %r275, %r270, %r266; + mul.f32 %r276, %r269, %r266; + .loc 1 75 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:75:24 + add.f32 %r277, %r274, 0f3F800000; + add.f32 %r278, %r273, 0f3F800000; + .loc 1 77 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:77:24 + fma.rn.f32 %r279, %r278, %r276, %r272; + fma.rn.f32 %r280, %r277, %r275, %r271; + .loc 1 78 53 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:78:53 + cvt.rn.bf16x2.f32 %r45, %r280, %r279; + .loc 1 62 115 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:62:115 + mov.b32 {%rs31, %rs32}, %r34; + cvt.f32.bf16 %r281, %rs32; + cvt.f32.bf16 %r282, %rs31; + .loc 1 66 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:66:24 + sub.f32 %r283, %r282, %r261; + sub.f32 %r284, %r281, %r261; + .loc 1 64 95 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:95 + mov.b32 {%rs33, %rs34}, %r42; + cvt.f32.bf16 %r285, %rs34; + cvt.f32.bf16 %r286, %rs33; + .loc 1 63 95 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:95 + mov.b32 {%rs35, %rs36}, %r38; + cvt.f32.bf16 %r287, %rs35; + cvt.f32.bf16 %r288, %rs36; + .loc 1 72 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:72:24 + mul.f32 %r289, %r284, %r266; + mul.f32 %r290, %r283, %r266; + .loc 1 75 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:75:24 + add.f32 %r291, %r288, 0f3F800000; + add.f32 %r292, %r287, 0f3F800000; + .loc 1 77 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:77:24 + fma.rn.f32 %r293, %r292, %r290, %r286; + fma.rn.f32 %r294, %r291, %r289, %r285; + .loc 1 78 53 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:78:53 + cvt.rn.bf16x2.f32 %r46, %r294, %r293; + .loc 1 62 115 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:62:115 + mov.b32 {%rs37, %rs38}, %r35; + cvt.f32.bf16 %r295, %rs38; + cvt.f32.bf16 %r296, %rs37; + .loc 1 66 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:66:24 + sub.f32 %r297, %r296, %r261; + sub.f32 %r298, %r295, %r261; + .loc 1 64 95 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:95 + mov.b32 {%rs39, %rs40}, %r43; + cvt.f32.bf16 %r299, %rs40; + cvt.f32.bf16 %r300, %rs39; + .loc 1 63 95 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:95 + mov.b32 {%rs41, %rs42}, %r39; + cvt.f32.bf16 %r301, %rs41; + cvt.f32.bf16 %r302, %rs42; + .loc 1 72 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:72:24 + mul.f32 %r303, %r298, %r266; + mul.f32 %r304, %r297, %r266; + .loc 1 75 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:75:24 + add.f32 %r305, %r302, 0f3F800000; + add.f32 %r306, %r301, 0f3F800000; + .loc 1 77 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:77:24 + fma.rn.f32 %r307, %r306, %r304, %r300; + fma.rn.f32 %r308, %r305, %r303, %r299; + .loc 1 78 53 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:78:53 + cvt.rn.bf16x2.f32 %r47, %r308, %r307; + .loc 1 62 115 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:62:115 + mov.b32 {%rs43, %rs44}, %r36; + cvt.f32.bf16 %r309, %rs44; + cvt.f32.bf16 %r310, %rs43; + .loc 1 66 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:66:24 + sub.f32 %r311, %r310, %r261; + sub.f32 %r312, %r309, %r261; + .loc 1 64 95 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:95 + mov.b32 {%rs45, %rs46}, %r44; + cvt.f32.bf16 %r313, %rs46; + cvt.f32.bf16 %r314, %rs45; + .loc 1 63 95 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:95 + mov.b32 {%rs47, %rs48}, %r40; + cvt.f32.bf16 %r315, %rs47; + cvt.f32.bf16 %r316, %rs48; + .loc 1 72 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:72:24 + mul.f32 %r317, %r312, %r266; + mul.f32 %r318, %r311, %r266; + .loc 1 75 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:75:24 + add.f32 %r319, %r316, 0f3F800000; + add.f32 %r320, %r315, 0f3F800000; + .loc 1 77 24 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:77:24 + fma.rn.f32 %r321, %r320, %r318, %r314; + fma.rn.f32 %r322, %r319, %r317, %r313; + .loc 1 78 53 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:78:53 + cvt.rn.bf16x2.f32 %r48, %r322, %r321; + // begin inline asm + @%p1 st.global.v4.b32 [ %rd13 + 0 ], { %r45, %r46, %r47, %r48 }; + // end inline asm + .loc 1 56 4 // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:56:4 + ret; +$L__tmp22: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 343 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x150 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 119 +.b8 51 +.b8 106 +.b8 98 +.b8 105 +.b8 121 +.b8 53 +.b8 122 +.b8 114 +.b8 107 +.b8 121 +.b8 109 +.b8 55 +.b8 118 +.b8 107 +.b8 110 +.b8 110 +.b8 51 +.b8 122 +.b8 105 +.b8 117 +.b8 107 +.b8 51 +.b8 113 +.b8 105 +.b8 109 +.b8 108 +.b8 98 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 50 +.b8 98 +.b8 98 +.b8 122 +.b8 51 +.b8 115 +.b8 117 +.b8 102 +.b8 54 +.b8 113 +.b8 120 +.b8 105 +.b8 106 +.b8 110 +.b8 98 +.b8 102 +.b8 99 +.b8 51 +.b8 121 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 119 +.b8 51 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x2f DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 100 +.b8 100 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 110 +.b8 97 +.b8 116 +.b8 105 +.b8 118 +.b8 101 +.b8 95 +.b8 108 +.b8 97 +.b8 121 +.b8 101 +.b8 114 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x113:0x47 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x128:0x31 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp21 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 52 // DW_AT_call_line +.b8 80 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x140:0x18 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp20 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 243 // DW_AT_call_line +.b8 46 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.source b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.source new file mode 100644 index 0000000000000000000000000000000000000000..003e79d3962a0d73b491298810f1598181bcc406 --- /dev/null +++ b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.source @@ -0,0 +1,486 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":18:0) +#loc88 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":216:0) +#loc101 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":133:0) +#loc105 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":242:0) +#loc107 = loc(unknown) +#loc110 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":230:0) +#loc125 = loc("in_ptr0"(#loc)) +#loc126 = loc("in_ptr1"(#loc)) +#loc127 = loc("in_ptr2"(#loc)) +#loc128 = loc("in_ptr3"(#loc)) +#loc129 = loc("in_ptr4"(#loc)) +#loc130 = loc("out_ptr0"(#loc)) +#loc131 = loc("out_ptr3"(#loc)) +#loc132 = loc("xnumel"(#loc)) +#loc133 = loc("r0_numel"(#loc)) +#loc201 = loc("value"(#loc88)) +#loc202 = loc("mean"(#loc88)) +#loc203 = loc("m2"(#loc88)) +#loc204 = loc("weight"(#loc88)) +#loc205 = loc("first_iteration"(#loc88)) +#loc215 = loc("input"(#loc101)) +#loc216 = loc("mean"(#loc105)) +#loc217 = loc("m2"(#loc105)) +#loc218 = loc("weight"(#loc105)) +#loc219 = loc("mean_1"(#loc110)) +#loc220 = loc("m2_1"(#loc110)) +#loc221 = loc("weight_1"(#loc110)) +#loc222 = loc("mean_2"(#loc110)) +#loc223 = loc("m2_2"(#loc110)) +#loc224 = loc("weight_2"(#loc110)) +#loc231 = loc("new_mean"(#loc201)) +module { + tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 2048 : i32 loc(#loc134) + %r0_numel_1 = arith.constant 4096 : i32 loc(#loc135) + %xoffset = tt.get_program_id x : i32 loc(#loc136) + %xoffset_2 = arith.constant 1 : i32 loc(#loc137) + %xoffset_3 = arith.constant 1 : i32 loc(#loc137) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc137) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc138) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc139) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc140) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc140) + %xmask = arith.constant dense<2048> : tensor<1x1xi32> loc(#loc141) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc141) + %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32> loc(#loc142) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32> -> tensor<1x4096xi32> loc(#loc143) + %tmp7_mean = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc144) + %tmp7_m2 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc145) + %tmp7_weight = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc146) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c4096_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %tmp7_weight_10:3 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%tmp7_mean_13 = %tmp7_mean, %tmp7_m2_14 = %tmp7_m2, %tmp7_weight_15 = %tmp7_weight) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4096xi32> loc(#loc148) + %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x4096xi32> loc(#loc148) + %r0_mask = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc149) + %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x4096xi32> loc(#loc149) + %tmp0 = arith.constant 4096 : i32 loc(#loc150) + %tmp0_18 = arith.constant 4096 : i32 loc(#loc150) + %tmp0_19 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc150) + %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc150) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc151) + %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x4096xi32> loc(#loc151) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc152) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc152) + %tmp0_25 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc153) + %tmp0_26 = arith.andi %r0_mask_17, %tmp0_25 : tensor<1x4096xi1> loc(#loc153) + %tmp0_27 = arith.constant 0.000000e+00 : f32 loc(#loc154) + %tmp0_28 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc154) + %tmp0_29 = arith.truncf %tmp0_28 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc154) + %tmp0_30 = tt.load %tmp0_24, %tmp0_26, %tmp0_29 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr> loc(#loc154) + %tmp0_31 = arith.extf %tmp0_30 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc155) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc156) + %tmp1_32 = tt.addptr %tmp1, %r0_index_16 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc156) + %tmp1_33 = arith.constant 0.000000e+00 : f32 loc(#loc157) + %tmp1_34 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc157) + %tmp1_35 = arith.truncf %tmp1_34 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc157) + %tmp1_36 = tt.load %tmp1_32, %r0_mask_17, %tmp1_35 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc157) + %tmp1_37 = arith.extf %tmp1_36 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc158) + %tmp2 = arith.constant 4096 : i32 loc(#loc159) + %tmp2_38 = arith.constant 4096 : i32 loc(#loc159) + %tmp2_39 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc159) + %tmp2_40 = arith.muli %tmp2_39, %xindex_7 : tensor<1x1xi32> loc(#loc159) + %tmp2_41 = tt.broadcast %tmp2_40 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc160) + %tmp2_42 = arith.addi %r0_index_16, %tmp2_41 : tensor<1x4096xi32> loc(#loc160) + %tmp2_43 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc161) + %tmp2_44 = tt.addptr %tmp2_43, %tmp2_42 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc161) + %tmp2_45 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc162) + %tmp2_46 = arith.andi %r0_mask_17, %tmp2_45 : tensor<1x4096xi1> loc(#loc162) + %tmp2_47 = arith.constant 0.000000e+00 : f32 loc(#loc163) + %tmp2_48 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc163) + %tmp2_49 = arith.truncf %tmp2_48 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc163) + %tmp2_50 = tt.load %tmp2_44, %tmp2_46, %tmp2_49 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr> loc(#loc163) + %tmp2_51 = arith.extf %tmp2_50 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc164) + %tmp3 = arith.mulf %tmp1_37, %tmp2_51 : tensor<1x4096xf32> loc(#loc165) + %tmp4 = arith.addf %tmp0_31, %tmp3 : tensor<1x4096xf32> loc(#loc166) + %c0_i32_52 = arith.constant 0 : i32 loc(#loc34) + %9 = arith.cmpi eq, %r0_offset, %c0_i32_52 : i32 loc(#loc34) + %10:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_u1__(%tmp4, %tmp7_mean_13, %tmp7_m2_14, %tmp7_weight_15, %9) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>, i1) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) loc(#loc35) + %tmp7_mean_53 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc167) + %tmp7_mean_54 = arith.andi %r0_mask_17, %tmp7_mean_53 : tensor<1x4096xi1> loc(#loc167) + %tmp7_mean_55 = arith.select %tmp7_mean_54, %10#0, %tmp7_mean_13 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc168) + %tmp7_m2_56 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc169) + %tmp7_m2_57 = arith.andi %r0_mask_17, %tmp7_m2_56 : tensor<1x4096xi1> loc(#loc169) + %tmp7_m2_58 = arith.select %tmp7_m2_57, %10#1, %tmp7_m2_14 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc170) + %tmp7_weight_59 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc171) + %tmp7_weight_60 = arith.andi %r0_mask_17, %tmp7_weight_59 : tensor<1x4096xi1> loc(#loc171) + %tmp7_weight_61 = arith.select %tmp7_weight_60, %10#2, %tmp7_weight_15 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc172) + %c4096_i32_62 = arith.constant 4096 : i32 loc(#loc42) + %c4096_i32_63 = arith.constant 4096 : i32 loc(#loc42) + %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc42) + %11 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc42) + %12 = tt.broadcast %11 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc43) + %13 = arith.addi %r0_index_16, %12 : tensor<1x4096xi32> loc(#loc43) + %14 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc44) + %15 = tt.addptr %14, %13 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc44) + %16 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc45) + %17 = arith.andi %r0_mask_17, %16 : tensor<1x4096xi1> loc(#loc45) + %18 = arith.truncf %tmp4 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc46) + tt.store %15, %18, %17 : tensor<1x4096x!tt.ptr> loc(#loc46) + scf.yield %tmp7_mean_55, %tmp7_m2_58, %tmp7_weight_61 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc47) + } loc(#loc237) + %4:3 = tt.call @"torch._inductor.runtime.triton_helpers.welford__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S__(3,)cconstexpr_1_"(%tmp7_weight_10#0, %tmp7_weight_10#1, %tmp7_weight_10#2) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc48) + %tmp7 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc173) + %tmp11 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc174) + %tmp12 = tt.expand_dims %4#2 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc175) + %c0_i32_11 = arith.constant 0 : i32 loc(#loc52) + %c4096_i32_12 = arith.constant 4096 : i32 loc(#loc52) + %5 = arith.bitcast %c0_i32_11 : i32 to i32 loc(#loc52) + %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc52) + %7 = arith.bitcast %c4096_i32_12 : i32 to i32 loc(#loc52) + %8 = ub.poison : i32 loc(#loc52) + scf.for %r0_offset = %5 to %6 step %7 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4096xi32> loc(#loc176) + %r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x4096xi32> loc(#loc176) + %r0_mask = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc177) + %r0_mask_14 = arith.cmpi slt, %r0_index_13, %r0_mask : tensor<1x4096xi32> loc(#loc177) + %tmp13 = arith.constant 4096 : i32 loc(#loc178) + %tmp13_15 = arith.constant 4096 : i32 loc(#loc178) + %tmp13_16 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc178) + %tmp13_17 = arith.muli %tmp13_16, %xindex_7 : tensor<1x1xi32> loc(#loc178) + %tmp13_18 = tt.broadcast %tmp13_17 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc179) + %tmp13_19 = arith.addi %r0_index_13, %tmp13_18 : tensor<1x4096xi32> loc(#loc179) + %tmp13_20 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc180) + %tmp13_21 = tt.addptr %tmp13_20, %tmp13_19 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc180) + %tmp13_22 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc181) + %tmp13_23 = arith.andi %r0_mask_14, %tmp13_22 : tensor<1x4096xi1> loc(#loc181) + %tmp13_24 = arith.constant 0.000000e+00 : f32 loc(#loc182) + %tmp13_25 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc182) + %tmp13_26 = arith.truncf %tmp13_25 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc182) + %tmp13_27 = tt.load %tmp13_21, %tmp13_23, %tmp13_26 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr> loc(#loc182) + %tmp13_28 = arith.extf %tmp13_27 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc183) + %tmp23 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc184) + %tmp23_29 = tt.addptr %tmp23, %r0_index_13 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc184) + %tmp23_30 = arith.constant 0.000000e+00 : f32 loc(#loc185) + %tmp23_31 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc185) + %tmp23_32 = arith.truncf %tmp23_31 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc185) + %tmp23_33 = tt.load %tmp23_29, %r0_mask_14, %tmp23_32 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc185) + %tmp23_34 = arith.extf %tmp23_33 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc186) + %tmp27 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc187) + %tmp27_35 = tt.addptr %tmp27, %r0_index_13 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc187) + %tmp27_36 = arith.constant 0.000000e+00 : f32 loc(#loc188) + %tmp27_37 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc188) + %tmp27_38 = arith.truncf %tmp27_37 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc188) + %tmp27_39 = tt.load %tmp27_35, %r0_mask_14, %tmp27_38 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc188) + %tmp27_40 = arith.extf %tmp27_39 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc189) + %tmp15 = tt.broadcast %tmp7 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc190) + %tmp15_41 = arith.subf %tmp13_28, %tmp15 : tensor<1x4096xf32> loc(#loc190) + %tmp16 = arith.constant 4.096000e+03 : f32 loc(#loc191) + %tmp17 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc192) + %tmp17_42 = arith.divf %tmp11, %tmp17 : tensor<1x1xf32> loc(#loc192) + %tmp18 = arith.constant 9.99999997E-7 : f32 loc(#loc193) + %tmp19 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc194) + %tmp19_43 = arith.addf %tmp17_42, %tmp19 : tensor<1x1xf32> loc(#loc194) + %tmp20 = tt.extern_elementwise %tmp19_43 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc195) + %tmp21 = tt.broadcast %tmp20 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc196) + %tmp21_44 = arith.mulf %tmp15_41, %tmp21 : tensor<1x4096xf32> loc(#loc196) + %tmp24 = arith.constant 1.000000e+00 : f32 loc(#loc197) + %tmp25 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc198) + %tmp25_45 = arith.addf %tmp23_34, %tmp25 : tensor<1x4096xf32> loc(#loc198) + %tmp26 = arith.mulf %tmp21_44, %tmp25_45 : tensor<1x4096xf32> loc(#loc199) + %tmp28 = arith.addf %tmp26, %tmp27_40 : tensor<1x4096xf32> loc(#loc200) + %c4096_i32_46 = arith.constant 4096 : i32 loc(#loc78) + %c4096_i32_47 = arith.constant 4096 : i32 loc(#loc78) + %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc78) + %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc78) + %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc79) + %11 = arith.addi %r0_index_13, %10 : tensor<1x4096xi32> loc(#loc79) + %12 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc80) + %13 = tt.addptr %12, %11 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc80) + %14 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc81) + %15 = arith.andi %r0_mask_14, %14 : tensor<1x4096xi1> loc(#loc81) + %16 = arith.truncf %tmp28 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc82) + tt.store %13, %16, %15 : tensor<1x4096x!tt.ptr> loc(#loc82) + } loc(#loc52) + tt.return loc(#loc83) + } loc(#loc) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() -> tensor<1x4096xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc85) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc85) + tt.return %cst_0 : tensor<1x4096xf32> loc(#loc86) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x4096xf32> loc(#loc87) + tt.return %0 : tensor<1x4096xf32> loc(#loc87) + } loc(#loc84) + tt.func private @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_u1__(%new_mean: tensor<1x4096xf32> loc("new_mean"(#loc201)), %mean: tensor<1x4096xf32> loc("mean"(#loc88)), %m2: tensor<1x4096xf32> loc("m2"(#loc88)), %weight: tensor<1x4096xf32> loc("weight"(#loc88)), %first_iteration: i1 loc("first_iteration"(#loc88))) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) attributes {noinline = false} { + %0:3 = scf.if %first_iteration -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) { + %new_weight = arith.constant 1.000000e+00 : f32 loc(#loc206) + %new_weight_0 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc232) + %new_m2 = tt.call @triton.language.standard.zeros_like__fp32S1_4096S__(%m2) : (tensor<1x4096xf32>) -> tensor<1x4096xf32> loc(#loc233) + scf.yield %new_m2, %new_mean, %new_weight_0 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc233) + } else { + %delta = arith.subf %new_mean, %mean : tensor<1x4096xf32> loc(#loc208) + %new_weight = arith.constant 1 : i32 loc(#loc209) + %new_weight_0 = arith.constant 1.000000e+00 : f32 loc(#loc209) + %new_weight_1 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc209) + %new_weight_2 = arith.addf %weight, %new_weight_1 : tensor<1x4096xf32> loc(#loc234) + %new_mean_3 = arith.divf %delta, %new_weight_2 : tensor<1x4096xf32> loc(#loc210) + %new_mean_4 = arith.addf %mean, %new_mean_3 : tensor<1x4096xf32> loc(#loc235) + %new_m2 = arith.subf %new_mean, %new_mean_4 : tensor<1x4096xf32> loc(#loc212) + %new_m2_5 = arith.mulf %delta, %new_m2 : tensor<1x4096xf32> loc(#loc213) + %new_m2_6 = arith.addf %m2, %new_m2_5 : tensor<1x4096xf32> loc(#loc236) + scf.yield %new_m2_6, %new_mean_4, %new_weight_2 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc214) + } loc(#loc89) + tt.return %0#1, %0#0, %0#2 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc99) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x4096xf32> loc(#loc100) + %2 = ub.poison : tensor<1x4096xf32> loc(#loc100) + %3 = ub.poison : tensor<1x4096xf32> loc(#loc100) + tt.return %1, %2, %3 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc100) + } loc(#loc88) + tt.func private @triton.language.standard.zeros_like__fp32S1_4096S__(%input: tensor<1x4096xf32> loc("input"(#loc101))) -> tensor<1x4096xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc102) + tt.return %0 : tensor<1x4096xf32> loc(#loc103) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x4096xf32> loc(#loc104) + tt.return %1 : tensor<1x4096xf32> loc(#loc104) + } loc(#loc101) + tt.func private @"torch._inductor.runtime.triton_helpers.welford__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S__(3,)cconstexpr_1_"(%mean: tensor<1x4096xf32> loc("mean"(#loc105)), %m2: tensor<1x4096xf32> loc("m2"(#loc105)), %weight: tensor<1x4096xf32> loc("weight"(#loc105))) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} { + %0:3 = "tt.reduce"(%mean, %m2, %weight) <{axis = 1 : i32}> ({ + ^bb0(%arg3: f32 loc(unknown), %arg4: f32 loc(unknown), %arg5: f32 loc(unknown), %arg6: f32 loc(unknown), %arg7: f32 loc(unknown), %arg8: f32 loc(unknown)): + %4:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (f32, f32, f32, f32, f32, f32) -> (f32, f32, f32) loc(#loc106) + tt.reduce.return %4#0, %4#1, %4#2 : f32, f32, f32 loc(#loc106) + }) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc106) + tt.return %0#0, %0#1, %0#2 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc108) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc109) + %2 = ub.poison : tensor<1xf32> loc(#loc109) + %3 = ub.poison : tensor<1xf32> loc(#loc109) + tt.return %1, %2, %3 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc109) + } loc(#loc105) + tt.func private @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%mean_1: f32 loc("mean_1"(#loc110)), %m2_1: f32 loc("m2_1"(#loc110)), %weight_1: f32 loc("weight_1"(#loc110)), %mean_2: f32 loc("mean_2"(#loc110)), %m2_2: f32 loc("m2_2"(#loc110)), %weight_2: f32 loc("weight_2"(#loc110))) -> (f32, f32, f32) attributes {noinline = false} { + %delta = arith.subf %mean_2, %mean_1 : f32 loc(#loc225) + %new_weight = arith.addf %weight_1, %weight_2 : f32 loc(#loc226) + %w2_over_w = arith.constant 0.000000e+00 : f32 loc(#loc227) + %w2_over_w_0 = arith.cmpf oeq, %new_weight, %w2_over_w : f32 loc(#loc227) + %w2_over_w_1 = arith.divf %weight_2, %new_weight : f32 loc(#loc228) + %w2_over_w_2 = arith.constant 0.000000e+00 : f32 loc(#loc229) + %w2_over_w_3 = arith.constant 0.000000e+00 : f32 loc(#loc229) + %w2_over_w_4 = arith.select %w2_over_w_0, %w2_over_w_3, %w2_over_w_1 : f32 loc(#loc229) + %0 = arith.mulf %delta, %w2_over_w_4 : f32 loc(#loc116) + %1 = arith.addf %mean_1, %0 : f32 loc(#loc117) + %2 = arith.addf %m2_1, %m2_2 : f32 loc(#loc118) + %3 = arith.mulf %delta, %delta : f32 loc(#loc119) + %4 = arith.mulf %3, %weight_1 : f32 loc(#loc120) + %5 = arith.mulf %4, %w2_over_w_4 : f32 loc(#loc121) + %6 = arith.addf %2, %5 : f32 loc(#loc122) + tt.return %1, %6, %new_weight : f32, f32, f32 loc(#loc123) + ^bb1: // no predecessors + %7 = ub.poison : f32 loc(#loc124) + %8 = ub.poison : f32 loc(#loc124) + %9 = ub.poison : f32 loc(#loc124) + tt.return %7, %8, %9 : f32, f32, f32 loc(#loc124) + } loc(#loc110) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":25:21) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":29:45) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":30:43) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":31:47) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:46) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:34) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:61) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:51) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:113) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:34) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:41) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:94) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:46) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:41) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:34) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:61) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:51) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:113) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":41:22) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":42:22) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":46:62) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":46:51) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":48:39) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":48:62) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":49:37) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":49:58) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":50:41) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":50:66) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:41) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:36) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:29) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:62) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:52) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:8) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":52:80) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":53:16) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":54:17) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":55:18) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":56:43) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":57:31) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":58:29) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:48) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:43) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:36) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:63) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:53) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:115) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:35) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:42) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:95) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:35) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:42) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:95) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":66:24) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":67:16) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":68:25) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":69:16) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":70:24) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":71:32) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":72:24) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":74:16) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":75:24) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":76:24) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":77:24) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:41) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:36) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:29) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:63) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:53) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":56:4) +#loc84 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":120:0) +#loc85 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:31) +#loc86 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:11) +#loc87 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:4) +#loc89 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7) +#loc90 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":218:46) +#loc91 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31) +#loc92 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24) +#loc93 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30) +#loc94 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34) +#loc95 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26) +#loc96 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39) +#loc97 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31) +#loc98 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22) +#loc99 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:11) +#loc100 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:4) +#loc102 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:30) +#loc103 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:11) +#loc104 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:4) +#loc106 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc108 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:11) +#loc109 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:4) +#loc111 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc112 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc113 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc114 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc115 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc116 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc117 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc118 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc119 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc120 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc121 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc122 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc123 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:11) +#loc124 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:4) +#loc134 = loc("xnumel"(#loc1)) +#loc135 = loc("r0_numel"(#loc2)) +#loc136 = loc("xoffset"(#loc3)) +#loc137 = loc("xoffset"(#loc4)) +#loc138 = loc("xindex"(#loc5)) +#loc139 = loc("xindex"(#loc6)) +#loc140 = loc("xindex"(#loc7)) +#loc141 = loc("xmask"(#loc8)) +#loc142 = loc("r0_base"(#loc9)) +#loc143 = loc("r0_base"(#loc10)) +#loc144 = loc("tmp7_mean"(#loc11)) +#loc145 = loc("tmp7_m2"(#loc12)) +#loc146 = loc("tmp7_weight"(#loc13)) +#loc147 = loc("tmp7_mean"(#loc14)) +#loc148 = loc("r0_index"(#loc15)) +#loc149 = loc("r0_mask"(#loc16)) +#loc150 = loc("tmp0"(#loc17)) +#loc151 = loc("tmp0"(#loc18)) +#loc152 = loc("tmp0"(#loc19)) +#loc153 = loc("tmp0"(#loc20)) +#loc154 = loc("tmp0"(#loc21)) +#loc155 = loc("tmp0"(#loc22)) +#loc156 = loc("tmp1"(#loc23)) +#loc157 = loc("tmp1"(#loc24)) +#loc158 = loc("tmp1"(#loc25)) +#loc159 = loc("tmp2"(#loc26)) +#loc160 = loc("tmp2"(#loc27)) +#loc161 = loc("tmp2"(#loc28)) +#loc162 = loc("tmp2"(#loc29)) +#loc163 = loc("tmp2"(#loc30)) +#loc164 = loc("tmp2"(#loc31)) +#loc165 = loc("tmp3"(#loc32)) +#loc166 = loc("tmp4"(#loc33)) +#loc167 = loc("tmp7_mean"(#loc36)) +#loc168 = loc("tmp7_mean"(#loc37)) +#loc169 = loc("tmp7_m2"(#loc38)) +#loc170 = loc("tmp7_m2"(#loc39)) +#loc171 = loc("tmp7_weight"(#loc40)) +#loc172 = loc("tmp7_weight"(#loc41)) +#loc173 = loc("tmp7"(#loc49)) +#loc174 = loc("tmp11"(#loc50)) +#loc175 = loc("tmp12"(#loc51)) +#loc176 = loc("r0_index"(#loc53)) +#loc177 = loc("r0_mask"(#loc54)) +#loc178 = loc("tmp13"(#loc55)) +#loc179 = loc("tmp13"(#loc56)) +#loc180 = loc("tmp13"(#loc57)) +#loc181 = loc("tmp13"(#loc58)) +#loc182 = loc("tmp13"(#loc59)) +#loc183 = loc("tmp13"(#loc60)) +#loc184 = loc("tmp23"(#loc61)) +#loc185 = loc("tmp23"(#loc62)) +#loc186 = loc("tmp23"(#loc63)) +#loc187 = loc("tmp27"(#loc64)) +#loc188 = loc("tmp27"(#loc65)) +#loc189 = loc("tmp27"(#loc66)) +#loc190 = loc("tmp15"(#loc67)) +#loc191 = loc("tmp16"(#loc68)) +#loc192 = loc("tmp17"(#loc69)) +#loc193 = loc("tmp18"(#loc70)) +#loc194 = loc("tmp19"(#loc71)) +#loc195 = loc("tmp20"(#loc72)) +#loc196 = loc("tmp21"(#loc73)) +#loc197 = loc("tmp24"(#loc74)) +#loc198 = loc("tmp25"(#loc75)) +#loc199 = loc("tmp26"(#loc76)) +#loc200 = loc("tmp28"(#loc77)) +#loc206 = loc("new_weight"(#loc90)) +#loc207 = loc("new_m2"(#loc91)) +#loc208 = loc("delta"(#loc92)) +#loc209 = loc("new_weight"(#loc93)) +#loc210 = loc("new_mean"(#loc94)) +#loc211 = loc("new_mean"(#loc95)) +#loc212 = loc("new_m2"(#loc96)) +#loc213 = loc("new_m2"(#loc97)) +#loc214 = loc("new_m2"(#loc98)) +#loc225 = loc("delta"(#loc111)) +#loc226 = loc("new_weight"(#loc112)) +#loc227 = loc("w2_over_w"(#loc113)) +#loc228 = loc("w2_over_w"(#loc114)) +#loc229 = loc("w2_over_w"(#loc115)) +#loc230 = loc("tmp7_m2"(#loc147)) +#loc232 = loc("new_weight"(#loc206)) +#loc233 = loc("new_m2"(#loc207)) +#loc234 = loc("new_weight"(#loc209)) +#loc235 = loc("new_mean"(#loc211)) +#loc236 = loc("new_m2"(#loc214)) +#loc237 = loc("tmp7_weight"(#loc230)) diff --git a/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.ttgir b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..c5c42efca1ff3bc8bd7058afe91a8602915ef9c3 --- /dev/null +++ b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.ttgir @@ -0,0 +1,214 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":18:0) +#loc1 = loc(unknown) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":52:80) +#loc59 = loc("in_ptr0"(#loc)) +#loc60 = loc("in_ptr1"(#loc)) +#loc61 = loc("in_ptr2"(#loc)) +#loc62 = loc("in_ptr3"(#loc)) +#loc63 = loc("in_ptr4"(#loc)) +#loc64 = loc("out_ptr0"(#loc)) +#loc65 = loc("out_ptr3"(#loc)) +#loc66 = loc("xnumel"(#loc)) +#loc67 = loc("r0_numel"(#loc)) +#loc89 = loc(callsite(#loc1 at #loc25)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4096> : tensor<1x4096xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xbf16, #blocked> loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %cst_1 = arith.constant 0.000000e+00 : f32 loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<4.096000e+03> : tensor<1x1xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc68) + %xmask = arith.cmpi slt, %xoffset, %c2048_i32 : i32 loc(#loc69) + %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc70) + %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4096xi32, #blocked> loc(#loc70) + %r0_mask = arith.cmpi slt, %r0_base_6, %cst : tensor<1x4096xi32, #blocked> loc(#loc71) + %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc72) + %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x4096xi32, #blocked> loc(#loc113) + %tmp0_8 = arith.addi %r0_base_6, %tmp0_7 : tensor<1x4096xi32, #blocked> loc(#loc73) + %tmp0_9 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc74) + %tmp0_10 = tt.addptr %tmp0_9, %tmp0_8 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc74) + %tmp0_11 = tt.splat %xmask : i1 -> tensor<1x4096xi1, #blocked> loc(#loc114) + %tmp0_12 = arith.andi %r0_mask, %tmp0_11 : tensor<1x4096xi1, #blocked> loc(#loc75) + %tmp0_13 = tt.load %tmp0_10, %tmp0_12, %cst_0 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr, #blocked> loc(#loc76) + %tmp0_14 = arith.extf %tmp0_13 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc77) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc78) + %tmp1_15 = tt.addptr %tmp1, %r0_base_6 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc78) + %tmp1_16 = tt.load %tmp1_15, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr, #blocked> loc(#loc79) + %tmp1_17 = arith.extf %tmp1_16 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc80) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc81) + %tmp2_18 = tt.addptr %tmp2, %tmp0_8 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc81) + %tmp2_19 = tt.load %tmp2_18, %tmp0_12, %cst_0 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr, #blocked> loc(#loc82) + %tmp2_20 = arith.extf %tmp2_19 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc83) + %tmp3 = arith.mulf %tmp1_17, %tmp2_20 : tensor<1x4096xf32, #blocked> loc(#loc84) + %tmp4 = arith.addf %tmp0_14, %tmp3 : tensor<1x4096xf32, #blocked> loc(#loc85) + %tmp7_mean = arith.select %tmp0_12, %tmp4, %cst_2 : tensor<1x4096xi1, #blocked>, tensor<1x4096xf32, #blocked> loc(#loc86) + %tmp7_weight = arith.select %tmp0_12, %cst_3, %cst_2 : tensor<1x4096xi1, #blocked>, tensor<1x4096xf32, #blocked> loc(#loc87) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc22) + %1 = tt.addptr %0, %tmp0_8 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc22) + %2 = arith.truncf %tmp4 : tensor<1x4096xf32, #blocked> to tensor<1x4096xbf16, #blocked> loc(#loc23) + tt.store %1, %2, %tmp0_12 : tensor<1x4096x!tt.ptr, #blocked> loc(#loc23) + %3:3 = "tt.reduce"(%tmp7_mean, %cst_2, %tmp7_weight) <{axis = 1 : i32}> ({ + ^bb0(%arg9: f32 loc(callsite(#loc1 at #loc25)), %arg10: f32 loc(callsite(#loc1 at #loc25)), %arg11: f32 loc(callsite(#loc1 at #loc25)), %arg12: f32 loc(callsite(#loc1 at #loc25)), %arg13: f32 loc(callsite(#loc1 at #loc25)), %arg14: f32 loc(callsite(#loc1 at #loc25))): + %delta = arith.subf %arg12, %arg9 : f32 loc(#loc115) + %new_weight = arith.addf %arg11, %arg14 : f32 loc(#loc116) + %w2_over_w = arith.cmpf oeq, %new_weight, %cst_1 : f32 loc(#loc117) + %w2_over_w_30 = arith.divf %arg14, %new_weight : f32 loc(#loc118) + %w2_over_w_31 = arith.select %w2_over_w, %cst_1, %w2_over_w_30 : f32 loc(#loc119) + %7 = arith.mulf %delta, %w2_over_w_31 : f32 loc(#loc120) + %8 = arith.addf %arg9, %7 : f32 loc(#loc121) + %9 = arith.addf %arg10, %arg13 : f32 loc(#loc122) + %10 = arith.mulf %delta, %delta : f32 loc(#loc123) + %11 = arith.mulf %10, %arg11 : f32 loc(#loc124) + %12 = arith.mulf %11, %w2_over_w_31 : f32 loc(#loc125) + %13 = arith.addf %9, %12 : f32 loc(#loc126) + tt.reduce.return %8, %13, %new_weight : f32, f32, f32 loc(#loc88) + }) : (tensor<1x4096xf32, #blocked>, tensor<1x4096xf32, #blocked>, tensor<1x4096xf32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc88) + %tmp7 = tt.expand_dims %3#0 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc95) + %tmp11 = tt.expand_dims %3#1 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc96) + %tmp13 = tt.load %1, %tmp0_12, %cst_0 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr, #blocked> loc(#loc97) + %tmp13_21 = arith.extf %tmp13 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc98) + %tmp23 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc99) + %tmp23_22 = tt.addptr %tmp23, %r0_base_6 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc99) + %tmp23_23 = tt.load %tmp23_22, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr, #blocked> loc(#loc100) + %tmp23_24 = arith.extf %tmp23_23 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc101) + %tmp27 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc102) + %tmp27_25 = tt.addptr %tmp27, %r0_base_6 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc102) + %tmp27_26 = tt.load %tmp27_25, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr, #blocked> loc(#loc103) + %tmp27_27 = arith.extf %tmp27_26 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc104) + %tmp15 = tt.broadcast %tmp7 : tensor<1x1xf32, #blocked> -> tensor<1x4096xf32, #blocked> loc(#loc105) + %tmp15_28 = arith.subf %tmp13_21, %tmp15 : tensor<1x4096xf32, #blocked> loc(#loc105) + %tmp17 = arith.divf %tmp11, %cst_5 : tensor<1x1xf32, #blocked> loc(#loc106) + %tmp19 = arith.addf %tmp17, %cst_4 : tensor<1x1xf32, #blocked> loc(#loc107) + %tmp20 = tt.extern_elementwise %tmp19 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked>) -> tensor<1x1xf32, #blocked> loc(#loc108) + %tmp21 = tt.broadcast %tmp20 : tensor<1x1xf32, #blocked> -> tensor<1x4096xf32, #blocked> loc(#loc109) + %tmp21_29 = arith.mulf %tmp15_28, %tmp21 : tensor<1x4096xf32, #blocked> loc(#loc109) + %tmp25 = arith.addf %tmp23_24, %cst_3 : tensor<1x4096xf32, #blocked> loc(#loc110) + %tmp26 = arith.mulf %tmp21_29, %tmp25 : tensor<1x4096xf32, #blocked> loc(#loc111) + %tmp28 = arith.addf %tmp26, %tmp27_27 : tensor<1x4096xf32, #blocked> loc(#loc112) + %4 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc56) + %5 = tt.addptr %4, %tmp0_8 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc56) + %6 = arith.truncf %tmp28 : tensor<1x4096xf32, #blocked> to tensor<1x4096xbf16, #blocked> loc(#loc57) + tt.store %5, %6, %tmp0_12 : tensor<1x4096x!tt.ptr, #blocked> loc(#loc57) + tt.return loc(#loc58) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":25:21) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":26:37) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":34:29) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:46) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:41) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:34) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:61) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:51) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:113) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:34) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:41) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:94) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:34) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:51) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:113) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":41:22) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":42:22) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":48:62) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":50:66) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:29) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:52) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc28 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc29 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc30 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc35 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc37 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":53:16) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":54:17) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:53) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:115) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:35) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:42) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:95) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:35) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:42) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:95) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":66:24) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":68:25) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":70:24) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":71:32) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":72:24) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":75:24) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":76:24) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":77:24) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:29) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:53) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":56:4) +#loc68 = loc("xoffset"(#loc2)) +#loc69 = loc("xmask"(#loc3)) +#loc70 = loc("r0_base"(#loc4)) +#loc71 = loc("r0_mask"(#loc5)) +#loc72 = loc("tmp0"(#loc6)) +#loc73 = loc("tmp0"(#loc7)) +#loc74 = loc("tmp0"(#loc8)) +#loc75 = loc("tmp0"(#loc9)) +#loc76 = loc("tmp0"(#loc10)) +#loc77 = loc("tmp0"(#loc11)) +#loc78 = loc("tmp1"(#loc12)) +#loc79 = loc("tmp1"(#loc13)) +#loc80 = loc("tmp1"(#loc14)) +#loc81 = loc("tmp2"(#loc15)) +#loc82 = loc("tmp2"(#loc16)) +#loc83 = loc("tmp2"(#loc17)) +#loc84 = loc("tmp3"(#loc18)) +#loc85 = loc("tmp4"(#loc19)) +#loc86 = loc("tmp7_mean"(#loc20)) +#loc87 = loc("tmp7_weight"(#loc21)) +#loc88 = loc(callsite(#loc24 at #loc25)) +#loc90 = loc("delta"(#loc26)) +#loc91 = loc("new_weight"(#loc27)) +#loc92 = loc("w2_over_w"(#loc28)) +#loc93 = loc("w2_over_w"(#loc29)) +#loc94 = loc("w2_over_w"(#loc30)) +#loc95 = loc("tmp7"(#loc38)) +#loc96 = loc("tmp11"(#loc39)) +#loc97 = loc("tmp13"(#loc40)) +#loc98 = loc("tmp13"(#loc41)) +#loc99 = loc("tmp23"(#loc42)) +#loc100 = loc("tmp23"(#loc43)) +#loc101 = loc("tmp23"(#loc44)) +#loc102 = loc("tmp27"(#loc45)) +#loc103 = loc("tmp27"(#loc46)) +#loc104 = loc("tmp27"(#loc47)) +#loc105 = loc("tmp15"(#loc48)) +#loc106 = loc("tmp17"(#loc49)) +#loc107 = loc("tmp19"(#loc50)) +#loc108 = loc("tmp20"(#loc51)) +#loc109 = loc("tmp21"(#loc52)) +#loc110 = loc("tmp25"(#loc53)) +#loc111 = loc("tmp26"(#loc54)) +#loc112 = loc("tmp28"(#loc55)) +#loc113 = loc(fused[#loc73, #loc72]) +#loc114 = loc(fused[#loc75, #loc69]) +#loc115 = loc(callsite(#loc90 at #loc88)) +#loc116 = loc(callsite(#loc91 at #loc88)) +#loc117 = loc(callsite(#loc92 at #loc88)) +#loc118 = loc(callsite(#loc93 at #loc88)) +#loc119 = loc(callsite(#loc94 at #loc88)) +#loc120 = loc(callsite(#loc31 at #loc88)) +#loc121 = loc(callsite(#loc32 at #loc88)) +#loc122 = loc(callsite(#loc33 at #loc88)) +#loc123 = loc(callsite(#loc34 at #loc88)) +#loc124 = loc(callsite(#loc35 at #loc88)) +#loc125 = loc(callsite(#loc36 at #loc88)) +#loc126 = loc(callsite(#loc37 at #loc88)) diff --git a/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.ttir b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..a893ef79d8c04071ca6424f9df1c39a003b77ac0 --- /dev/null +++ b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.ttir @@ -0,0 +1,215 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":18:0) +#loc1 = loc(unknown) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":52:80) +#loc60 = loc("in_ptr0"(#loc)) +#loc61 = loc("in_ptr1"(#loc)) +#loc62 = loc("in_ptr2"(#loc)) +#loc63 = loc("in_ptr3"(#loc)) +#loc64 = loc("in_ptr4"(#loc)) +#loc65 = loc("out_ptr0"(#loc)) +#loc66 = loc("out_ptr3"(#loc)) +#loc67 = loc("xnumel"(#loc)) +#loc68 = loc("r0_numel"(#loc)) +#loc70 = loc(callsite(#loc1 at #loc3)) +module { + tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %xmask = arith.constant 2048 : i32 loc(#loc69) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc70) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x4096xbf16> loc(#loc1) + %cst_2 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc1) + %cst_3 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc1) + %cst_4 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc1) + %cst_5 = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc71) + %xmask_6 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc69) + %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32> loc(#loc72) + %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32> -> tensor<1x4096xi32> loc(#loc73) + %r0_mask = arith.cmpi slt, %r0_base_7, %cst_5 : tensor<1x4096xi32> loc(#loc74) + %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc75) + %tmp0_8 = tt.splat %tmp0 : i32 -> tensor<1x4096xi32> loc(#loc115) + %tmp0_9 = arith.addi %r0_base_7, %tmp0_8 : tensor<1x4096xi32> loc(#loc76) + %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc77) + %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc77) + %tmp0_12 = tt.splat %xmask_6 : i1 -> tensor<1x4096xi1> loc(#loc116) + %tmp0_13 = arith.andi %r0_mask, %tmp0_12 : tensor<1x4096xi1> loc(#loc78) + %tmp0_14 = tt.load %tmp0_11, %tmp0_13, %cst_1 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr> loc(#loc79) + %tmp0_15 = arith.extf %tmp0_14 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc80) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc81) + %tmp1_16 = tt.addptr %tmp1, %r0_base_7 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc81) + %tmp1_17 = tt.load %tmp1_16, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc82) + %tmp1_18 = arith.extf %tmp1_17 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc83) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc84) + %tmp2_19 = tt.addptr %tmp2, %tmp0_9 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc84) + %tmp2_20 = tt.load %tmp2_19, %tmp0_13, %cst_1 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr> loc(#loc85) + %tmp2_21 = arith.extf %tmp2_20 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc86) + %tmp3 = arith.mulf %tmp1_18, %tmp2_21 : tensor<1x4096xf32> loc(#loc87) + %tmp4 = arith.addf %tmp0_15, %tmp3 : tensor<1x4096xf32> loc(#loc88) + %tmp7_mean = arith.select %tmp0_13, %tmp4, %cst_0 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc89) + %tmp7_weight = arith.select %tmp0_13, %cst_2, %cst_0 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc90) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc24) + %1 = tt.addptr %0, %tmp0_9 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc24) + %2 = arith.truncf %tmp4 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc25) + tt.store %1, %2, %tmp0_13 : tensor<1x4096x!tt.ptr> loc(#loc25) + %3:3 = "tt.reduce"(%tmp7_mean, %cst_0, %tmp7_weight) <{axis = 1 : i32}> ({ + ^bb0(%arg9: f32 loc(callsite(#loc1 at #loc3)), %arg10: f32 loc(callsite(#loc1 at #loc3)), %arg11: f32 loc(callsite(#loc1 at #loc3)), %arg12: f32 loc(callsite(#loc1 at #loc3)), %arg13: f32 loc(callsite(#loc1 at #loc3)), %arg14: f32 loc(callsite(#loc1 at #loc3))): + %delta = arith.subf %arg12, %arg9 : f32 loc(#loc117) + %new_weight = arith.addf %arg11, %arg14 : f32 loc(#loc118) + %w2_over_w = arith.cmpf oeq, %new_weight, %cst : f32 loc(#loc119) + %w2_over_w_31 = arith.divf %arg14, %new_weight : f32 loc(#loc120) + %w2_over_w_32 = arith.select %w2_over_w, %cst, %w2_over_w_31 : f32 loc(#loc121) + %7 = arith.mulf %delta, %w2_over_w_32 : f32 loc(#loc122) + %8 = arith.addf %arg9, %7 : f32 loc(#loc123) + %9 = arith.addf %arg10, %arg13 : f32 loc(#loc124) + %10 = arith.mulf %delta, %delta : f32 loc(#loc125) + %11 = arith.mulf %10, %arg11 : f32 loc(#loc126) + %12 = arith.mulf %11, %w2_over_w_32 : f32 loc(#loc127) + %13 = arith.addf %9, %12 : f32 loc(#loc128) + tt.reduce.return %8, %13, %new_weight : f32, f32, f32 loc(#loc91) + }) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc91) + %tmp7 = tt.expand_dims %3#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc97) + %tmp11 = tt.expand_dims %3#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc98) + %tmp13 = tt.load %1, %tmp0_13, %cst_1 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr> loc(#loc99) + %tmp13_22 = arith.extf %tmp13 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc100) + %tmp23 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc101) + %tmp23_23 = tt.addptr %tmp23, %r0_base_7 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc101) + %tmp23_24 = tt.load %tmp23_23, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc102) + %tmp23_25 = arith.extf %tmp23_24 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc103) + %tmp27 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc104) + %tmp27_26 = tt.addptr %tmp27, %r0_base_7 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc104) + %tmp27_27 = tt.load %tmp27_26, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc105) + %tmp27_28 = arith.extf %tmp27_27 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc106) + %tmp15 = tt.broadcast %tmp7 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc107) + %tmp15_29 = arith.subf %tmp13_22, %tmp15 : tensor<1x4096xf32> loc(#loc107) + %tmp17 = arith.divf %tmp11, %cst_4 : tensor<1x1xf32> loc(#loc108) + %tmp19 = arith.addf %tmp17, %cst_3 : tensor<1x1xf32> loc(#loc109) + %tmp20 = tt.extern_elementwise %tmp19 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc110) + %tmp21 = tt.broadcast %tmp20 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc111) + %tmp21_30 = arith.mulf %tmp15_29, %tmp21 : tensor<1x4096xf32> loc(#loc111) + %tmp25 = arith.addf %tmp23_25, %cst_2 : tensor<1x4096xf32> loc(#loc112) + %tmp26 = arith.mulf %tmp21_30, %tmp25 : tensor<1x4096xf32> loc(#loc113) + %tmp28 = arith.addf %tmp26, %tmp27_28 : tensor<1x4096xf32> loc(#loc114) + %4 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc57) + %5 = tt.addptr %4, %tmp0_9 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc57) + %6 = arith.truncf %tmp28 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc58) + tt.store %5, %6, %tmp0_13 : tensor<1x4096x!tt.ptr> loc(#loc58) + tt.return loc(#loc59) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":25:21) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":23:28) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":26:27) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":34:29) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:41) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:34) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:61) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:51) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:113) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:34) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:41) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:94) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:34) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:51) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:113) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":41:22) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":42:22) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":48:62) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":50:66) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:29) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:52) +#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc28 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc29 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc30 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc35 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc37 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc38 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":53:16) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":54:17) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:53) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:115) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:35) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:42) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:95) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:35) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:42) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:95) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":66:24) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":68:25) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":70:24) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":71:32) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":72:24) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":75:24) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":76:24) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":77:24) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:29) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:53) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":56:4) +#loc69 = loc("xmask"(#loc2)) +#loc71 = loc("xoffset"(#loc4)) +#loc72 = loc("r0_base"(#loc5)) +#loc73 = loc("r0_base"(#loc6)) +#loc74 = loc("r0_mask"(#loc7)) +#loc75 = loc("tmp0"(#loc8)) +#loc76 = loc("tmp0"(#loc9)) +#loc77 = loc("tmp0"(#loc10)) +#loc78 = loc("tmp0"(#loc11)) +#loc79 = loc("tmp0"(#loc12)) +#loc80 = loc("tmp0"(#loc13)) +#loc81 = loc("tmp1"(#loc14)) +#loc82 = loc("tmp1"(#loc15)) +#loc83 = loc("tmp1"(#loc16)) +#loc84 = loc("tmp2"(#loc17)) +#loc85 = loc("tmp2"(#loc18)) +#loc86 = loc("tmp2"(#loc19)) +#loc87 = loc("tmp3"(#loc20)) +#loc88 = loc("tmp4"(#loc21)) +#loc89 = loc("tmp7_mean"(#loc22)) +#loc90 = loc("tmp7_weight"(#loc23)) +#loc91 = loc(callsite(#loc26 at #loc3)) +#loc92 = loc("delta"(#loc27)) +#loc93 = loc("new_weight"(#loc28)) +#loc94 = loc("w2_over_w"(#loc29)) +#loc95 = loc("w2_over_w"(#loc30)) +#loc96 = loc("w2_over_w"(#loc31)) +#loc97 = loc("tmp7"(#loc39)) +#loc98 = loc("tmp11"(#loc40)) +#loc99 = loc("tmp13"(#loc41)) +#loc100 = loc("tmp13"(#loc42)) +#loc101 = loc("tmp23"(#loc43)) +#loc102 = loc("tmp23"(#loc44)) +#loc103 = loc("tmp23"(#loc45)) +#loc104 = loc("tmp27"(#loc46)) +#loc105 = loc("tmp27"(#loc47)) +#loc106 = loc("tmp27"(#loc48)) +#loc107 = loc("tmp15"(#loc49)) +#loc108 = loc("tmp17"(#loc50)) +#loc109 = loc("tmp19"(#loc51)) +#loc110 = loc("tmp20"(#loc52)) +#loc111 = loc("tmp21"(#loc53)) +#loc112 = loc("tmp25"(#loc54)) +#loc113 = loc("tmp26"(#loc55)) +#loc114 = loc("tmp28"(#loc56)) +#loc115 = loc(fused[#loc76, #loc75]) +#loc116 = loc(fused[#loc78, #loc69]) +#loc117 = loc(callsite(#loc92 at #loc91)) +#loc118 = loc(callsite(#loc93 at #loc91)) +#loc119 = loc(callsite(#loc94 at #loc91)) +#loc120 = loc(callsite(#loc95 at #loc91)) +#loc121 = loc(callsite(#loc96 at #loc91)) +#loc122 = loc(callsite(#loc32 at #loc91)) +#loc123 = loc(callsite(#loc33 at #loc91)) +#loc124 = loc(callsite(#loc34 at #loc91)) +#loc125 = loc(callsite(#loc35 at #loc91)) +#loc126 = loc(callsite(#loc36 at #loc91)) +#loc127 = loc(callsite(#loc37 at #loc91)) +#loc128 = loc(callsite(#loc38 at #loc91)) diff --git a/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/__grp__triton_poi_fused_mul_silu_split_0.json b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/__grp__triton_poi_fused_mul_silu_split_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7267cdd791bac8f1790e5b0f1b90435656064c76 --- /dev/null +++ b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/__grp__triton_poi_fused_mul_silu_split_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_mul_silu_split_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.source", "triton_poi_fused_mul_silu_split_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.ttir", "triton_poi_fused_mul_silu_split_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.ttgir", "triton_poi_fused_mul_silu_split_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.llir", "triton_poi_fused_mul_silu_split_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.ptx", "triton_poi_fused_mul_silu_split_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.cubin", "triton_poi_fused_mul_silu_split_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.json"}} \ No newline at end of file diff --git a/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.cubin b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..fad16e1c39326b809a398992751ab5f0a0b2c6ba Binary files /dev/null and b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.cubin differ diff --git a/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.json b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a181dfc740948536c72c45914f1f2c2984a88b7c --- /dev/null +++ b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.json @@ -0,0 +1 @@ +{"hash": "99eec3c84593e6b9bfc9daf2e57266e2a78847f098b6fd12ee0e3ec198f2ad8a", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_mul_silu_split_0"} \ No newline at end of file diff --git a/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.llir b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..8f13a69bdc963657c9c20ac9a05660641d7244cf --- /dev/null +++ b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.llir @@ -0,0 +1,102 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_mul_silu_split_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %7 = shl i32 %6, 9, !dbg !8 + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %9 = shl nuw nsw i32 %8, 1, !dbg !9 + %10 = and i32 %9, 510, !dbg !9 + %11 = or disjoint i32 %10, %7, !dbg !10 + %12 = srem i32 %11, 12288, !dbg !11 + %13 = sub nsw i32 %11, %12, !dbg !11 + %14 = add i32 %13, %11, !dbg !11 + %15 = sext i32 %14 to i64, !dbg !12 + %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !12 + %17 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %16) #3, !dbg !13 + %18 = bitcast i32 %17 to <2 x bfloat>, !dbg !13 + %19 = add i32 %14, 12288, !dbg !14 + %20 = sext i32 %19 to i64, !dbg !15 + %21 = getelementptr bfloat, ptr addrspace(1) %0, i64 %20, !dbg !15 + %22 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %21) #3, !dbg !16 + %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !16 + %24 = sext i32 %11 to i64, !dbg !17 + %25 = getelementptr bfloat, ptr addrspace(1) %1, i64 %24, !dbg !17 + %26 = fpext <2 x bfloat> %18 to <2 x float>, !dbg !18 + %27 = fpext <2 x bfloat> %23 to <2 x float>, !dbg !19 + %28 = extractelement <2 x float> %26, i64 0, !dbg !20 + %29 = fsub float 0.000000e+00, %28, !dbg !20 + %30 = extractelement <2 x float> %26, i64 1, !dbg !20 + %31 = fsub float 0.000000e+00, %30, !dbg !20 + %32 = fmul float %29, 0x3FF7154760000000, !dbg !25 + %33 = tail call float @llvm.nvvm.ex2.approx.f(float %32), !dbg !25 + %34 = fmul float %31, 0x3FF7154760000000, !dbg !25 + %35 = tail call float @llvm.nvvm.ex2.approx.f(float %34), !dbg !25 + %36 = fadd float %33, 1.000000e+00, !dbg !26 + %37 = fadd float %35, 1.000000e+00, !dbg !26 + %38 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %36), !dbg !27 + %39 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %37), !dbg !27 + %40 = insertelement <2 x float> poison, float %38, i64 0, !dbg !28 + %41 = insertelement <2 x float> %40, float %39, i64 1, !dbg !28 + %42 = fmul <2 x float> %41, %26, !dbg !28 + %43 = fmul <2 x float> %42, %27, !dbg !29 + %44 = fptrunc <2 x float> %43 to <2 x bfloat>, !dbg !30 + %45 = bitcast <2 x bfloat> %44 to i32, !dbg !30 + tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %45, ptr addrspace(1) %25) #3, !dbg !30 + ret void, !dbg !31 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.f(float) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #3 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_mul_silu_split_0", linkageName: "triton_poi_fused_mul_silu_split_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 19, column: 28, scope: !4) +!8 = !DILocation(line: 19, column: 33, scope: !4) +!9 = !DILocation(line: 20, column: 36, scope: !4) +!10 = !DILocation(line: 20, column: 23, scope: !4) +!11 = !DILocation(line: 25, column: 35, scope: !4) +!12 = !DILocation(line: 25, column: 30, scope: !4) +!13 = !DILocation(line: 25, column: 46, scope: !4) +!14 = !DILocation(line: 26, column: 43, scope: !4) +!15 = !DILocation(line: 26, column: 30, scope: !4) +!16 = !DILocation(line: 26, column: 54, scope: !4) +!17 = !DILocation(line: 32, column: 25, scope: !4) +!18 = !DILocation(line: 25, column: 55, scope: !4) +!19 = !DILocation(line: 26, column: 63, scope: !4) +!20 = !DILocation(line: 50, column: 30, scope: !21, inlinedAt: !23) +!21 = distinct !DILexicalBlockFile(scope: !4, file: !22, discriminator: 0) +!22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!23 = !DILocation(line: 28, column: 22, scope: !24) +!24 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0) +!25 = !DILocation(line: 50, column: 29, scope: !21, inlinedAt: !23) +!26 = !DILocation(line: 50, column: 20, scope: !21, inlinedAt: !23) +!27 = !DILocation(line: 50, column: 16, scope: !21, inlinedAt: !23) +!28 = !DILocation(line: 29, column: 18, scope: !4) +!29 = !DILocation(line: 31, column: 18, scope: !4) +!30 = !DILocation(line: 32, column: 36, scope: !4) +!31 = !DILocation(line: 32, column: 4, scope: !4) diff --git a/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.ptx b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..dd0ad117afc743c8bd09e3d2c446131c72bb23ab --- /dev/null +++ b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.ptx @@ -0,0 +1,437 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused_mul_silu_split_0 // -- Begin function triton_poi_fused_mul_silu_split_0 + // @triton_poi_fused_mul_silu_split_0 +.visible .entry triton_poi_fused_mul_silu_split_0( + .param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_1, + .param .u32 triton_poi_fused_mul_silu_split_0_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_4 +) +.reqntid 256 +{ + .reg .b16 %rs<5>; + .reg .b32 %r<36>; + .reg .b64 %rd<6>; + .loc 1 18 0 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:18:0 +$L__func_begin0: + .loc 1 18 0 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:18:0 + +// %bb.0: + ld.param.b64 %rd4, [triton_poi_fused_mul_silu_split_0_param_0]; + ld.param.b64 %rd5, [triton_poi_fused_mul_silu_split_0_param_1]; +$L__tmp0: + .loc 1 19 28 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:19:28 + mov.u32 %r4, %ctaid.x; + .loc 1 19 33 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:19:33 + shl.b32 %r5, %r4, 9; + .loc 1 20 36 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:20:36 + mov.u32 %r6, %tid.x; + shl.b32 %r7, %r6, 1; + and.b32 %r8, %r7, 510; + .loc 1 20 23 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:20:23 + or.b32 %r9, %r8, %r5; + .loc 1 25 35 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:25:35 + mul.hi.s32 %r10, %r9, 715827883; + shr.u32 %r11, %r10, 31; + shr.u32 %r12, %r10, 11; + add.s32 %r13, %r12, %r11; + mad.lo.s32 %r14, %r13, 12288, %r9; + .loc 1 25 30 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:25:30 + mad.wide.s32 %rd1, %r14, 2, %rd4; + .loc 1 25 46 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:25:46 + // begin inline asm + mov.u32 %r1, 0x0; + ld.global.b32 { %r1 }, [ %rd1 + 0 ]; + // end inline asm + .loc 1 26 43 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:26:43 + add.s32 %r15, %r14, 12288; + .loc 1 26 30 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:26:30 + mad.wide.s32 %rd2, %r15, 2, %rd4; + .loc 1 26 54 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:26:54 + // begin inline asm + mov.u32 %r2, 0x0; + ld.global.b32 { %r2 }, [ %rd2 + 0 ]; + // end inline asm + .loc 1 32 25 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:32:25 + mad.wide.s32 %rd3, %r9, 2, %rd5; + .loc 1 25 55 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:25:55 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r16, %rs2; + cvt.f32.bf16 %r17, %rs1; + .loc 1 26 63 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:26:63 + mov.b32 {%rs3, %rs4}, %r2; + cvt.f32.bf16 %r18, %rs4; + cvt.f32.bf16 %r19, %rs3; + mov.b32 %r20, 0f00000000; +$L__tmp1: + .loc 2 50 30 // standard.py:50:30 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ] + sub.f32 %r21, %r20, %r17; + sub.f32 %r22, %r20, %r16; + .loc 2 50 29 // standard.py:50:29 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ] + mul.f32 %r23, %r21, 0f3FB8AA3B; + ex2.approx.f32 %r24, %r23; + mul.f32 %r25, %r22, 0f3FB8AA3B; + ex2.approx.f32 %r26, %r25; + .loc 2 50 20 // standard.py:50:20 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ] + add.f32 %r27, %r24, 0f3F800000; + add.f32 %r28, %r26, 0f3F800000; + mov.b32 %r29, 0f3F800000; + .loc 2 50 16 // standard.py:50:16 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ] + div.full.f32 %r30, %r29, %r27; + div.full.f32 %r31, %r29, %r28; +$L__tmp2: + .loc 1 29 18 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:29:18 + mul.f32 %r32, %r31, %r16; + mul.f32 %r33, %r30, %r17; + .loc 1 31 18 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:31:18 + mul.f32 %r34, %r33, %r19; + mul.f32 %r35, %r32, %r18; + .loc 1 32 36 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:32:36 + cvt.rn.bf16x2.f32 %r3, %r35, %r34; + // begin inline asm + st.global.b32 [ %rd3 + 0 ], { %r3 }; + // end inline asm + .loc 1 32 4 // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:32:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 307 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x12c DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 115 +.b8 121 +.b8 97 +.b8 101 +.b8 51 +.b8 111 +.b8 107 +.b8 50 +.b8 120 +.b8 110 +.b8 122 +.b8 117 +.b8 120 +.b8 104 +.b8 106 +.b8 107 +.b8 120 +.b8 122 +.b8 104 +.b8 100 +.b8 99 +.b8 112 +.b8 99 +.b8 122 +.b8 54 +.b8 106 +.b8 99 +.b8 107 +.b8 99 +.b8 117 +.b8 51 +.b8 118 +.b8 118 +.b8 55 +.b8 101 +.b8 113 +.b8 98 +.b8 51 +.b8 112 +.b8 101 +.b8 119 +.b8 104 +.b8 114 +.b8 118 +.b8 113 +.b8 109 +.b8 105 +.b8 101 +.b8 114 +.b8 103 +.b8 102 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 115 +.b8 121 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x24 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 111 +.b8 105 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 115 +.b8 105 +.b8 108 +.b8 117 +.b8 95 +.b8 115 +.b8 112 +.b8 108 +.b8 105 +.b8 116 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x108:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x11d:0x18 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 28 // DW_AT_call_line +.b8 22 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.source b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.source new file mode 100644 index 0000000000000000000000000000000000000000..7038e3a31164d9b44cbd605968faab87006025a0 --- /dev/null +++ b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.source @@ -0,0 +1,126 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":18:0) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":49:0) +#loc32 = loc("in_ptr0"(#loc)) +#loc33 = loc("out_ptr0"(#loc)) +#loc34 = loc("xnumel"(#loc)) +#loc56 = loc("x"(#loc25)) +module { + tt.func public @triton_poi_fused_mul_silu_split_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc35) + %xoffset_0 = arith.constant 512 : i32 loc(#loc36) + %xoffset_1 = arith.constant 512 : i32 loc(#loc36) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc36) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc37) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<512xi32> loc(#loc38) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<512xi32> loc(#loc38) + %xmask = arith.constant true loc(#loc39) + %xmask_5 = arith.constant dense : tensor<512xi1> loc(#loc39) + %x0 = arith.constant 12288 : i32 loc(#loc40) + %x0_6 = arith.constant 12288 : i32 loc(#loc40) + %x0_7 = arith.constant dense<12288> : tensor<512xi32> loc(#loc40) + %x0_8 = arith.remsi %xindex_4, %x0_7 : tensor<512xi32> loc(#loc40) + %x1 = arith.constant 12288 : i32 loc(#loc41) + %x1_9 = arith.constant 12288 : i32 loc(#loc41) + %x1_10 = arith.constant dense<12288> : tensor<512xi32> loc(#loc41) + %x1_11 = arith.divsi %xindex_4, %x1_10 : tensor<512xi32> loc(#loc41) + %tmp0 = arith.constant 24576 : i32 loc(#loc42) + %tmp0_12 = arith.constant 24576 : i32 loc(#loc42) + %tmp0_13 = arith.constant dense<24576> : tensor<512xi32> loc(#loc42) + %tmp0_14 = arith.muli %tmp0_13, %x1_11 : tensor<512xi32> loc(#loc42) + %tmp0_15 = arith.addi %x0_8, %tmp0_14 : tensor<512xi32> loc(#loc43) + %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc44) + %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc44) + %tmp0_18 = tt.load %tmp0_17 : tensor<512x!tt.ptr> loc(#loc45) + %tmp0_19 = arith.extf %tmp0_18 : tensor<512xbf16> to tensor<512xf32> loc(#loc46) + %tmp5 = arith.constant 12288 : i32 loc(#loc47) + %tmp5_20 = arith.constant 12288 : i32 loc(#loc47) + %tmp5_21 = arith.constant dense<12288> : tensor<512xi32> loc(#loc47) + %tmp5_22 = arith.addi %tmp5_21, %x0_8 : tensor<512xi32> loc(#loc47) + %tmp5_23 = arith.constant 24576 : i32 loc(#loc48) + %tmp5_24 = arith.constant 24576 : i32 loc(#loc48) + %tmp5_25 = arith.constant dense<24576> : tensor<512xi32> loc(#loc48) + %tmp5_26 = arith.muli %tmp5_25, %x1_11 : tensor<512xi32> loc(#loc48) + %tmp5_27 = arith.addi %tmp5_22, %tmp5_26 : tensor<512xi32> loc(#loc49) + %tmp5_28 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc50) + %tmp5_29 = tt.addptr %tmp5_28, %tmp5_27 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc50) + %tmp5_30 = tt.load %tmp5_29 : tensor<512x!tt.ptr> loc(#loc51) + %tmp5_31 = arith.extf %tmp5_30 : tensor<512xbf16> to tensor<512xf32> loc(#loc52) + %tmp2 = tt.call @triton.language.standard.sigmoid__fp32S512S__(%tmp0_19) : (tensor<512xf32>) -> tensor<512xf32> loc(#loc53) + %tmp3 = arith.mulf %tmp0_19, %tmp2 : tensor<512xf32> loc(#loc54) + %tmp6 = arith.mulf %tmp3, %tmp5_31 : tensor<512xf32> loc(#loc55) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc22) + %1 = tt.addptr %0, %xindex_4 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc22) + %2 = arith.truncf %tmp6 : tensor<512xf32> to tensor<512xbf16> loc(#loc23) + tt.store %1, %2 : tensor<512x!tt.ptr> loc(#loc23) + tt.return loc(#loc24) + } loc(#loc) + tt.func private @triton.language.standard.sigmoid__fp32S512S__(%x: tensor<512xf32> loc("x"(#loc25))) -> tensor<512xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc26) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc26) + %0 = arith.subf %cst_0, %x : tensor<512xf32> loc(#loc26) + %1 = math.exp %0 : tensor<512xf32> loc(#loc27) + %c1_i32 = arith.constant 1 : i32 loc(#loc28) + %cst_1 = arith.constant 1.000000e+00 : f32 loc(#loc28) + %cst_2 = arith.constant dense<1.000000e+00> : tensor<512xf32> loc(#loc28) + %2 = arith.addf %cst_2, %1 : tensor<512xf32> loc(#loc28) + %c1_i32_3 = arith.constant 1 : i32 loc(#loc29) + %cst_4 = arith.constant 1.000000e+00 : f32 loc(#loc29) + %cst_5 = arith.constant dense<1.000000e+00> : tensor<512xf32> loc(#loc29) + %3 = arith.divf %cst_5, %2 : tensor<512xf32> loc(#loc29) + tt.return %3 : tensor<512xf32> loc(#loc30) + ^bb1: // no predecessors + %4 = ub.poison : tensor<512xf32> loc(#loc31) + tt.return %4 : tensor<512xf32> loc(#loc31) + } loc(#loc25) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":19:28) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":19:33) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":20:36) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":20:23) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":21:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":22:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":23:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:41) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:35) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:30) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:46) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:55) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:38) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:49) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:43) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:30) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:54) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:63) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":28:22) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":29:18) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":31:18) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:25) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:36) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:4) +#loc26 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30) +#loc27 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29) +#loc28 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20) +#loc29 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16) +#loc30 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:11) +#loc31 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:4) +#loc35 = loc("xoffset"(#loc1)) +#loc36 = loc("xoffset"(#loc2)) +#loc37 = loc("xindex"(#loc3)) +#loc38 = loc("xindex"(#loc4)) +#loc39 = loc("xmask"(#loc5)) +#loc40 = loc("x0"(#loc6)) +#loc41 = loc("x1"(#loc7)) +#loc42 = loc("tmp0"(#loc8)) +#loc43 = loc("tmp0"(#loc9)) +#loc44 = loc("tmp0"(#loc10)) +#loc45 = loc("tmp0"(#loc11)) +#loc46 = loc("tmp0"(#loc12)) +#loc47 = loc("tmp5"(#loc13)) +#loc48 = loc("tmp5"(#loc14)) +#loc49 = loc("tmp5"(#loc15)) +#loc50 = loc("tmp5"(#loc16)) +#loc51 = loc("tmp5"(#loc17)) +#loc52 = loc("tmp5"(#loc18)) +#loc53 = loc("tmp2"(#loc19)) +#loc54 = loc("tmp3"(#loc20)) +#loc55 = loc("tmp6"(#loc21)) diff --git a/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.ttgir b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..a4aae66c87b9169498979273e3a61914413dbf3a --- /dev/null +++ b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.ttgir @@ -0,0 +1,93 @@ +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":18:0) +#loc28 = loc("in_ptr0"(#loc)) +#loc29 = loc("out_ptr0"(#loc)) +#loc30 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_mul_silu_split_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<24576> : tensor<512xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<12288> : tensor<512xi32, #blocked> loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<1.000000e+00> : tensor<512xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc31) + %xoffset_3 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc32) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc33) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32, #blocked> loc(#loc34) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32, #blocked> loc(#loc34) + %x0 = arith.remsi %xindex_5, %cst_0 : tensor<512xi32, #blocked> loc(#loc35) + %x1 = arith.divsi %xindex_5, %cst_0 : tensor<512xi32, #blocked> loc(#loc36) + %tmp0 = arith.muli %x1, %cst : tensor<512xi32, #blocked> loc(#loc37) + %tmp0_6 = arith.addi %x0, %tmp0 : tensor<512xi32, #blocked> loc(#loc38) + %tmp0_7 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc39) + %tmp0_8 = tt.addptr %tmp0_7, %tmp0_6 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc39) + %tmp0_9 = tt.load %tmp0_8 : tensor<512x!tt.ptr, #blocked> loc(#loc40) + %tmp0_10 = arith.extf %tmp0_9 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc41) + %tmp5 = arith.addi %x0, %cst_0 : tensor<512xi32, #blocked> loc(#loc42) + %tmp5_11 = arith.addi %tmp5, %tmp0 : tensor<512xi32, #blocked> loc(#loc43) + %tmp5_12 = tt.addptr %tmp0_7, %tmp5_11 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc44) + %tmp5_13 = tt.load %tmp5_12 : tensor<512x!tt.ptr, #blocked> loc(#loc45) + %tmp5_14 = arith.extf %tmp5_13 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc46) + %tmp2 = arith.subf %cst_1, %tmp0_10 : tensor<512xf32, #blocked> loc(#loc50) + %tmp2_15 = math.exp %tmp2 : tensor<512xf32, #blocked> loc(#loc51) + %tmp2_16 = arith.addf %tmp2_15, %cst_2 : tensor<512xf32, #blocked> loc(#loc52) + %tmp2_17 = arith.divf %cst_2, %tmp2_16 : tensor<512xf32, #blocked> loc(#loc53) + %tmp3 = arith.mulf %tmp0_10, %tmp2_17 : tensor<512xf32, #blocked> loc(#loc48) + %tmp6 = arith.mulf %tmp3, %tmp5_14 : tensor<512xf32, #blocked> loc(#loc49) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc25) + %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc25) + %2 = arith.truncf %tmp6 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> loc(#loc26) + tt.store %1, %2 : tensor<512x!tt.ptr, #blocked> loc(#loc26) + tt.return loc(#loc27) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":19:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":19:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":20:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":20:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":22:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":23:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:41) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:35) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:30) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:46) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:55) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:38) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:30) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:54) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:63) +#loc18 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":28:22) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29) +#loc21 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":29:18) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":31:18) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:25) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:36) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:4) +#loc31 = loc("xoffset"(#loc2)) +#loc32 = loc("xoffset"(#loc3)) +#loc33 = loc("xindex"(#loc4)) +#loc34 = loc("xindex"(#loc5)) +#loc35 = loc("x0"(#loc6)) +#loc36 = loc("x1"(#loc7)) +#loc37 = loc("tmp0"(#loc8)) +#loc38 = loc("tmp0"(#loc9)) +#loc39 = loc("tmp0"(#loc10)) +#loc40 = loc("tmp0"(#loc11)) +#loc41 = loc("tmp0"(#loc12)) +#loc42 = loc("tmp5"(#loc13)) +#loc43 = loc("tmp5"(#loc14)) +#loc44 = loc("tmp5"(#loc15)) +#loc45 = loc("tmp5"(#loc16)) +#loc46 = loc("tmp5"(#loc17)) +#loc47 = loc("tmp2"(#loc19)) +#loc48 = loc("tmp3"(#loc23)) +#loc49 = loc("tmp6"(#loc24)) +#loc50 = loc(callsite(#loc18 at #loc47)) +#loc51 = loc(callsite(#loc20 at #loc47)) +#loc52 = loc(callsite(#loc21 at #loc47)) +#loc53 = loc(callsite(#loc22 at #loc47)) diff --git a/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.ttir b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..c4d7cd88e543358d1548392697db257fdae6edf7 --- /dev/null +++ b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.ttir @@ -0,0 +1,93 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":18:0) +#loc28 = loc("in_ptr0"(#loc)) +#loc29 = loc("out_ptr0"(#loc)) +#loc30 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_mul_silu_split_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %tmp2 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc50) + %tmp2_0 = arith.constant dense<1.000000e+00> : tensor<512xf32> loc(#loc51) + %cst = arith.constant dense<24576> : tensor<512xi32> loc(#loc3) + %cst_1 = arith.constant dense<12288> : tensor<512xi32> loc(#loc3) + %c512_i32 = arith.constant 512 : i32 loc(#loc3) + %xoffset = tt.get_program_id x : i32 loc(#loc32) + %xoffset_2 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc33) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc34) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<512xi32> loc(#loc35) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<512xi32> loc(#loc35) + %x0 = arith.remsi %xindex_4, %cst_1 : tensor<512xi32> loc(#loc36) + %x1 = arith.divsi %xindex_4, %cst_1 : tensor<512xi32> loc(#loc37) + %tmp0 = arith.muli %x1, %cst : tensor<512xi32> loc(#loc38) + %tmp0_5 = arith.addi %x0, %tmp0 : tensor<512xi32> loc(#loc39) + %tmp0_6 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc40) + %tmp0_7 = tt.addptr %tmp0_6, %tmp0_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc40) + %tmp0_8 = tt.load %tmp0_7 : tensor<512x!tt.ptr> loc(#loc41) + %tmp0_9 = arith.extf %tmp0_8 : tensor<512xbf16> to tensor<512xf32> loc(#loc42) + %tmp5 = arith.addi %x0, %cst_1 : tensor<512xi32> loc(#loc43) + %tmp5_10 = arith.addi %tmp5, %tmp0 : tensor<512xi32> loc(#loc44) + %tmp5_11 = tt.addptr %tmp0_6, %tmp5_10 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc45) + %tmp5_12 = tt.load %tmp5_11 : tensor<512x!tt.ptr> loc(#loc46) + %tmp5_13 = arith.extf %tmp5_12 : tensor<512xbf16> to tensor<512xf32> loc(#loc47) + %tmp2_14 = arith.subf %tmp2, %tmp0_9 : tensor<512xf32> loc(#loc50) + %tmp2_15 = math.exp %tmp2_14 : tensor<512xf32> loc(#loc52) + %tmp2_16 = arith.addf %tmp2_15, %tmp2_0 : tensor<512xf32> loc(#loc53) + %tmp2_17 = arith.divf %tmp2_0, %tmp2_16 : tensor<512xf32> loc(#loc54) + %tmp3 = arith.mulf %tmp0_9, %tmp2_17 : tensor<512xf32> loc(#loc48) + %tmp6 = arith.mulf %tmp3, %tmp5_13 : tensor<512xf32> loc(#loc49) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc25) + %1 = tt.addptr %0, %xindex_4 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc25) + %2 = arith.truncf %tmp6 : tensor<512xf32> to tensor<512xbf16> loc(#loc26) + tt.store %1, %2 : tensor<512x!tt.ptr> loc(#loc26) + tt.return loc(#loc27) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":28:22) +#loc3 = loc(unknown) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":19:28) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":19:33) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":20:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":20:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":22:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":23:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:41) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:30) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:46) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:55) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:38) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:43) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:30) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:54) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:63) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29) +#loc21 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":29:18) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":31:18) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:25) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:36) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:4) +#loc31 = loc("tmp2"(#loc2)) +#loc32 = loc("xoffset"(#loc4)) +#loc33 = loc("xoffset"(#loc5)) +#loc34 = loc("xindex"(#loc6)) +#loc35 = loc("xindex"(#loc7)) +#loc36 = loc("x0"(#loc8)) +#loc37 = loc("x1"(#loc9)) +#loc38 = loc("tmp0"(#loc10)) +#loc39 = loc("tmp0"(#loc11)) +#loc40 = loc("tmp0"(#loc12)) +#loc41 = loc("tmp0"(#loc13)) +#loc42 = loc("tmp0"(#loc14)) +#loc43 = loc("tmp5"(#loc15)) +#loc44 = loc("tmp5"(#loc16)) +#loc45 = loc("tmp5"(#loc17)) +#loc46 = loc("tmp5"(#loc18)) +#loc47 = loc("tmp5"(#loc19)) +#loc48 = loc("tmp3"(#loc23)) +#loc49 = loc("tmp6"(#loc24)) +#loc50 = loc(callsite(#loc1 at #loc31)) +#loc51 = loc(callsite(#loc3 at #loc31)) +#loc52 = loc(callsite(#loc20 at #loc31)) +#loc53 = loc(callsite(#loc21 at #loc31)) +#loc54 = loc(callsite(#loc22 at #loc31)) diff --git a/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/__grp__triton_red_fused_add_mul_native_layer_norm_0.json b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/__grp__triton_red_fused_add_mul_native_layer_norm_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ec772c40de90fa1ff7b8ed1856d34ade161c06e7 --- /dev/null +++ b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/__grp__triton_red_fused_add_mul_native_layer_norm_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_add_mul_native_layer_norm_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.source", "triton_red_fused_add_mul_native_layer_norm_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.ttir", "triton_red_fused_add_mul_native_layer_norm_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.ttgir", "triton_red_fused_add_mul_native_layer_norm_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.llir", "triton_red_fused_add_mul_native_layer_norm_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.ptx", "triton_red_fused_add_mul_native_layer_norm_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.cubin", "triton_red_fused_add_mul_native_layer_norm_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.json"}} \ No newline at end of file diff --git a/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.cubin b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..af6326acd5c3b4f2ec17c9c4db57a063c7f53228 Binary files /dev/null and b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.cubin differ diff --git a/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.json b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.json new file mode 100644 index 0000000000000000000000000000000000000000..414f10fe396652097f7085a02939c38c196c6d83 --- /dev/null +++ b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.json @@ -0,0 +1 @@ +{"hash": "a6b1c7d709b164ac8e9c059c5e1dd878f395ed859c1689a40ef693f2a54b22fe", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 192, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_add_mul_native_layer_norm_0"} \ No newline at end of file diff --git a/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.llir b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..dadb9701697cfc57034ffcadb05ab6b8cb284d32 --- /dev/null +++ b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.llir @@ -0,0 +1,547 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_add_mul_native_layer_norm_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !5 { +__nv_rsqrtf.exit: + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %9 = icmp samesign ult i32 %8, 2304, !dbg !9 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %11 = and i32 %10, 511, !dbg !10 + %12 = and i32 %10, 31, !dbg !10 + %13 = lshr i32 %11, 5, !dbg !10 + %14 = shl nuw nsw i32 %10, 3, !dbg !10 + %15 = and i32 %14, 4088, !dbg !10 + %16 = shl i32 %8, 12, !dbg !11 + %17 = or disjoint i32 %15, %16, !dbg !12 + %18 = sext i32 %17 to i64, !dbg !13 + %19 = getelementptr bfloat, ptr addrspace(1) %0, i64 %18, !dbg !13 + %20 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %21 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %19, i64 %20, i1 %9) #6, !dbg !14 + %22 = extractvalue { i32, i32, i32, i32 } %21, 0, !dbg !14 + %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !14 + %24 = extractvalue { i32, i32, i32, i32 } %21, 1, !dbg !14 + %25 = bitcast i32 %24 to <2 x bfloat>, !dbg !14 + %26 = extractvalue { i32, i32, i32, i32 } %21, 2, !dbg !14 + %27 = bitcast i32 %26 to <2 x bfloat>, !dbg !14 + %28 = extractvalue { i32, i32, i32, i32 } %21, 3, !dbg !14 + %29 = bitcast i32 %28 to <2 x bfloat>, !dbg !14 + %30 = extractelement <2 x bfloat> %23, i64 0, !dbg !14 + %31 = extractelement <2 x bfloat> %23, i64 1, !dbg !14 + %32 = extractelement <2 x bfloat> %25, i64 0, !dbg !14 + %33 = extractelement <2 x bfloat> %25, i64 1, !dbg !14 + %34 = extractelement <2 x bfloat> %27, i64 0, !dbg !14 + %35 = extractelement <2 x bfloat> %27, i64 1, !dbg !14 + %36 = extractelement <2 x bfloat> %29, i64 0, !dbg !14 + %37 = extractelement <2 x bfloat> %29, i64 1, !dbg !14 + %38 = fpext bfloat %30 to float, !dbg !15 + %39 = fpext bfloat %31 to float, !dbg !15 + %40 = fpext bfloat %32 to float, !dbg !15 + %41 = fpext bfloat %33 to float, !dbg !15 + %42 = fpext bfloat %34 to float, !dbg !15 + %43 = fpext bfloat %35 to float, !dbg !15 + %44 = fpext bfloat %36 to float, !dbg !15 + %45 = fpext bfloat %37 to float, !dbg !15 + %46 = select i1 %9, float %38, float 0.000000e+00, !dbg !16 + %47 = select i1 %9, float %39, float 0.000000e+00, !dbg !16 + %48 = select i1 %9, float %40, float 0.000000e+00, !dbg !16 + %49 = select i1 %9, float %41, float 0.000000e+00, !dbg !16 + %50 = select i1 %9, float %42, float 0.000000e+00, !dbg !16 + %51 = select i1 %9, float %43, float 0.000000e+00, !dbg !16 + %52 = select i1 %9, float %44, float 0.000000e+00, !dbg !16 + %53 = select i1 %9, float %45, float 0.000000e+00, !dbg !16 + %54 = select i1 %9, float 1.000000e+00, float 0.000000e+00, !dbg !17 + %55 = fsub float %47, %46, !dbg !18 + %56 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !24 + %57 = fcmp oeq float %56, 0.000000e+00, !dbg !25 + %58 = tail call float @llvm.nvvm.div.full(float %54, float %56), !dbg !26 + %59 = select i1 %57, float 0.000000e+00, float %58, !dbg !27 + %60 = fmul float %59, %55, !dbg !28 + %61 = fadd float %46, %60, !dbg !29 + %62 = fmul float %55, %55, !dbg !30 + %63 = fmul float %54, %62, !dbg !31 + %64 = fmul float %59, %63, !dbg !32 + %65 = fadd float %64, 0.000000e+00, !dbg !33 + %66 = fsub float %48, %61, !dbg !18 + %67 = select i1 %9, float 3.000000e+00, float 0.000000e+00, !dbg !24 + %68 = fcmp oeq float %67, 0.000000e+00, !dbg !25 + %69 = tail call float @llvm.nvvm.div.full(float %54, float %67), !dbg !26 + %70 = select i1 %68, float 0.000000e+00, float %69, !dbg !27 + %71 = fmul float %70, %66, !dbg !28 + %72 = fadd float %61, %71, !dbg !29 + %73 = fmul float %66, %66, !dbg !30 + %74 = fmul float %56, %73, !dbg !31 + %75 = fmul float %70, %74, !dbg !32 + %76 = fadd float %65, %75, !dbg !33 + %77 = fsub float %49, %72, !dbg !18 + %78 = select i1 %9, float 4.000000e+00, float 0.000000e+00, !dbg !24 + %79 = fcmp oeq float %78, 0.000000e+00, !dbg !25 + %80 = tail call float @llvm.nvvm.div.full(float %54, float %78), !dbg !26 + %81 = select i1 %79, float 0.000000e+00, float %80, !dbg !27 + %82 = fmul float %81, %77, !dbg !28 + %83 = fadd float %72, %82, !dbg !29 + %84 = fmul float %77, %77, !dbg !30 + %85 = fmul float %67, %84, !dbg !31 + %86 = fmul float %81, %85, !dbg !32 + %87 = fadd float %76, %86, !dbg !33 + %88 = fsub float %50, %83, !dbg !18 + %89 = select i1 %9, float 5.000000e+00, float 0.000000e+00, !dbg !24 + %90 = fcmp oeq float %89, 0.000000e+00, !dbg !25 + %91 = tail call float @llvm.nvvm.div.full(float %54, float %89), !dbg !26 + %92 = select i1 %90, float 0.000000e+00, float %91, !dbg !27 + %93 = fmul float %92, %88, !dbg !28 + %94 = fadd float %83, %93, !dbg !29 + %95 = fmul float %88, %88, !dbg !30 + %96 = fmul float %78, %95, !dbg !31 + %97 = fmul float %92, %96, !dbg !32 + %98 = fadd float %87, %97, !dbg !33 + %99 = fsub float %51, %94, !dbg !18 + %100 = select i1 %9, float 6.000000e+00, float 0.000000e+00, !dbg !24 + %101 = fcmp oeq float %100, 0.000000e+00, !dbg !25 + %102 = tail call float @llvm.nvvm.div.full(float %54, float %100), !dbg !26 + %103 = select i1 %101, float 0.000000e+00, float %102, !dbg !27 + %104 = fmul float %103, %99, !dbg !28 + %105 = fadd float %94, %104, !dbg !29 + %106 = fmul float %99, %99, !dbg !30 + %107 = fmul float %89, %106, !dbg !31 + %108 = fmul float %103, %107, !dbg !32 + %109 = fadd float %98, %108, !dbg !33 + %110 = fsub float %52, %105, !dbg !18 + %111 = select i1 %9, float 7.000000e+00, float 0.000000e+00, !dbg !24 + %112 = fcmp oeq float %111, 0.000000e+00, !dbg !25 + %113 = tail call float @llvm.nvvm.div.full(float %54, float %111), !dbg !26 + %114 = select i1 %112, float 0.000000e+00, float %113, !dbg !27 + %115 = fmul float %114, %110, !dbg !28 + %116 = fadd float %105, %115, !dbg !29 + %117 = fmul float %110, %110, !dbg !30 + %118 = fmul float %100, %117, !dbg !31 + %119 = fmul float %114, %118, !dbg !32 + %120 = fadd float %109, %119, !dbg !33 + %121 = fsub float %53, %116, !dbg !18 + %122 = select i1 %9, float 8.000000e+00, float 0.000000e+00, !dbg !24 + %123 = fcmp oeq float %122, 0.000000e+00, !dbg !25 + %124 = tail call float @llvm.nvvm.div.full(float %54, float %122), !dbg !26 + %125 = select i1 %123, float 0.000000e+00, float %124, !dbg !27 + %126 = fmul float %125, %121, !dbg !28 + %127 = fadd float %116, %126, !dbg !29 + %128 = fmul float %121, %121, !dbg !30 + %129 = fmul float %111, %128, !dbg !31 + %130 = fmul float %125, %129, !dbg !32 + %131 = fadd float %120, %130, !dbg !33 + %132 = bitcast float %127 to i32, !dbg !21 + %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 16, i32 31), !dbg !21 + %134 = bitcast i32 %133 to float, !dbg !21 + %135 = bitcast float %131 to i32, !dbg !21 + %136 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %135, i32 16, i32 31), !dbg !21 + %137 = bitcast i32 %136 to float, !dbg !21 + %138 = bitcast float %122 to i32, !dbg !21 + %139 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %138, i32 16, i32 31), !dbg !21 + %140 = bitcast i32 %139 to float, !dbg !21 + %141 = fsub float %134, %127, !dbg !18 + %142 = fadd float %122, %140, !dbg !24 + %143 = fcmp oeq float %142, 0.000000e+00, !dbg !25 + %144 = tail call float @llvm.nvvm.div.full(float %140, float %142), !dbg !26 + %145 = select i1 %143, float 0.000000e+00, float %144, !dbg !27 + %146 = fmul float %145, %141, !dbg !28 + %147 = fadd float %127, %146, !dbg !29 + %148 = fadd float %131, %137, !dbg !34 + %149 = fmul float %141, %141, !dbg !30 + %150 = fmul float %122, %149, !dbg !31 + %151 = fmul float %145, %150, !dbg !32 + %152 = fadd float %148, %151, !dbg !33 + %153 = bitcast float %147 to i32, !dbg !21 + %154 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 8, i32 31), !dbg !21 + %155 = bitcast i32 %154 to float, !dbg !21 + %156 = bitcast float %152 to i32, !dbg !21 + %157 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %156, i32 8, i32 31), !dbg !21 + %158 = bitcast i32 %157 to float, !dbg !21 + %159 = bitcast float %142 to i32, !dbg !21 + %160 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %159, i32 8, i32 31), !dbg !21 + %161 = bitcast i32 %160 to float, !dbg !21 + %162 = fsub float %155, %147, !dbg !18 + %163 = fadd float %142, %161, !dbg !24 + %164 = fcmp oeq float %163, 0.000000e+00, !dbg !25 + %165 = tail call float @llvm.nvvm.div.full(float %161, float %163), !dbg !26 + %166 = select i1 %164, float 0.000000e+00, float %165, !dbg !27 + %167 = fmul float %166, %162, !dbg !28 + %168 = fadd float %147, %167, !dbg !29 + %169 = fadd float %152, %158, !dbg !34 + %170 = fmul float %162, %162, !dbg !30 + %171 = fmul float %142, %170, !dbg !31 + %172 = fmul float %166, %171, !dbg !32 + %173 = fadd float %169, %172, !dbg !33 + %174 = bitcast float %168 to i32, !dbg !21 + %175 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %174, i32 4, i32 31), !dbg !21 + %176 = bitcast i32 %175 to float, !dbg !21 + %177 = bitcast float %173 to i32, !dbg !21 + %178 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %177, i32 4, i32 31), !dbg !21 + %179 = bitcast i32 %178 to float, !dbg !21 + %180 = bitcast float %163 to i32, !dbg !21 + %181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %180, i32 4, i32 31), !dbg !21 + %182 = bitcast i32 %181 to float, !dbg !21 + %183 = fsub float %176, %168, !dbg !18 + %184 = fadd float %163, %182, !dbg !24 + %185 = fcmp oeq float %184, 0.000000e+00, !dbg !25 + %186 = tail call float @llvm.nvvm.div.full(float %182, float %184), !dbg !26 + %187 = select i1 %185, float 0.000000e+00, float %186, !dbg !27 + %188 = fmul float %187, %183, !dbg !28 + %189 = fadd float %168, %188, !dbg !29 + %190 = fadd float %173, %179, !dbg !34 + %191 = fmul float %183, %183, !dbg !30 + %192 = fmul float %163, %191, !dbg !31 + %193 = fmul float %187, %192, !dbg !32 + %194 = fadd float %190, %193, !dbg !33 + %195 = bitcast float %189 to i32, !dbg !21 + %196 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %195, i32 2, i32 31), !dbg !21 + %197 = bitcast i32 %196 to float, !dbg !21 + %198 = bitcast float %194 to i32, !dbg !21 + %199 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %198, i32 2, i32 31), !dbg !21 + %200 = bitcast i32 %199 to float, !dbg !21 + %201 = bitcast float %184 to i32, !dbg !21 + %202 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %201, i32 2, i32 31), !dbg !21 + %203 = bitcast i32 %202 to float, !dbg !21 + %204 = fsub float %197, %189, !dbg !18 + %205 = fadd float %184, %203, !dbg !24 + %206 = fcmp oeq float %205, 0.000000e+00, !dbg !25 + %207 = tail call float @llvm.nvvm.div.full(float %203, float %205), !dbg !26 + %208 = select i1 %206, float 0.000000e+00, float %207, !dbg !27 + %209 = fmul float %208, %204, !dbg !28 + %210 = fadd float %189, %209, !dbg !29 + %211 = fadd float %194, %200, !dbg !34 + %212 = fmul float %204, %204, !dbg !30 + %213 = fmul float %184, %212, !dbg !31 + %214 = fmul float %208, %213, !dbg !32 + %215 = fadd float %211, %214, !dbg !33 + %216 = bitcast float %210 to i32, !dbg !21 + %217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 1, i32 31), !dbg !21 + %218 = bitcast i32 %217 to float, !dbg !21 + %219 = bitcast float %215 to i32, !dbg !21 + %220 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %219, i32 1, i32 31), !dbg !21 + %221 = bitcast i32 %220 to float, !dbg !21 + %222 = bitcast float %205 to i32, !dbg !21 + %223 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %222, i32 1, i32 31), !dbg !21 + %224 = bitcast i32 %223 to float, !dbg !21 + %225 = fsub float %218, %210, !dbg !18 + %226 = fadd float %205, %224, !dbg !24 + %227 = fcmp oeq float %226, 0.000000e+00, !dbg !25 + %228 = tail call float @llvm.nvvm.div.full(float %224, float %226), !dbg !26 + %229 = select i1 %227, float 0.000000e+00, float %228, !dbg !27 + %230 = fmul float %229, %225, !dbg !28 + %231 = fadd float %210, %230, !dbg !29 + %232 = fadd float %215, %221, !dbg !34 + %233 = fmul float %225, %225, !dbg !30 + %234 = fmul float %205, %233, !dbg !31 + %235 = fmul float %229, %234, !dbg !32 + %236 = fadd float %232, %235, !dbg !33 + %237 = icmp eq i32 %12, 0, !dbg !21 + %238 = getelementptr float, ptr addrspace(3) @global_smem, i32 %13, !dbg !21 + %239 = bitcast float %231 to <1 x i32>, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %238, <1 x i32> %239, i1 %237) #6, !dbg !21 + %240 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %13, !dbg !21 + %241 = bitcast float %236 to <1 x i32>, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %240, <1 x i32> %241, i1 %237) #6, !dbg !21 + %242 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %13, !dbg !21 + %243 = bitcast float %226 to <1 x i32>, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %242, <1 x i32> %243, i1 %237) #6, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %244 = icmp samesign ult i32 %11, 16, !dbg !21 + %245 = getelementptr float, ptr addrspace(3) @global_smem, i32 %11, !dbg !21 + %246 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %245, i1 %244) #6, !dbg !21 + %247 = bitcast i32 %246 to float, !dbg !21 + %248 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %11, !dbg !21 + %249 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %248, i1 %244) #6, !dbg !21 + %250 = bitcast i32 %249 to float, !dbg !21 + %251 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %11, !dbg !21 + %252 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %251, i1 %244) #6, !dbg !21 + %253 = bitcast i32 %252 to float, !dbg !21 + %254 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %246, i32 8, i32 31), !dbg !21 + %255 = bitcast i32 %254 to float, !dbg !21 + %256 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %249, i32 8, i32 31), !dbg !21 + %257 = bitcast i32 %256 to float, !dbg !21 + %258 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %252, i32 8, i32 31), !dbg !21 + %259 = bitcast i32 %258 to float, !dbg !21 + %260 = fsub float %255, %247, !dbg !18 + %261 = fadd float %253, %259, !dbg !24 + %262 = fcmp oeq float %261, 0.000000e+00, !dbg !25 + %263 = tail call float @llvm.nvvm.div.full(float %259, float %261), !dbg !26 + %264 = select i1 %262, float 0.000000e+00, float %263, !dbg !27 + %265 = fmul float %260, %264, !dbg !28 + %266 = fadd float %265, %247, !dbg !29 + %267 = fadd float %250, %257, !dbg !34 + %268 = fmul float %260, %260, !dbg !30 + %269 = fmul float %268, %253, !dbg !31 + %270 = fmul float %269, %264, !dbg !32 + %271 = fadd float %267, %270, !dbg !33 + %272 = bitcast float %266 to i32, !dbg !21 + %273 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 4, i32 31), !dbg !21 + %274 = bitcast i32 %273 to float, !dbg !21 + %275 = bitcast float %271 to i32, !dbg !21 + %276 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %275, i32 4, i32 31), !dbg !21 + %277 = bitcast i32 %276 to float, !dbg !21 + %278 = bitcast float %261 to i32, !dbg !21 + %279 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %278, i32 4, i32 31), !dbg !21 + %280 = bitcast i32 %279 to float, !dbg !21 + %281 = fsub float %274, %266, !dbg !18 + %282 = fadd float %261, %280, !dbg !24 + %283 = fcmp oeq float %282, 0.000000e+00, !dbg !25 + %284 = tail call float @llvm.nvvm.div.full(float %280, float %282), !dbg !26 + %285 = select i1 %283, float 0.000000e+00, float %284, !dbg !27 + %286 = fmul float %281, %285, !dbg !28 + %287 = fadd float %266, %286, !dbg !29 + %288 = fadd float %271, %277, !dbg !34 + %289 = fmul float %281, %281, !dbg !30 + %290 = fmul float %261, %289, !dbg !31 + %291 = fmul float %285, %290, !dbg !32 + %292 = fadd float %288, %291, !dbg !33 + %293 = bitcast float %287 to i32, !dbg !21 + %294 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %293, i32 2, i32 31), !dbg !21 + %295 = bitcast i32 %294 to float, !dbg !21 + %296 = bitcast float %292 to i32, !dbg !21 + %297 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %296, i32 2, i32 31), !dbg !21 + %298 = bitcast i32 %297 to float, !dbg !21 + %299 = bitcast float %282 to i32, !dbg !21 + %300 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %299, i32 2, i32 31), !dbg !21 + %301 = bitcast i32 %300 to float, !dbg !21 + %302 = fsub float %295, %287, !dbg !18 + %303 = fadd float %282, %301, !dbg !24 + %304 = fcmp oeq float %303, 0.000000e+00, !dbg !25 + %305 = tail call float @llvm.nvvm.div.full(float %301, float %303), !dbg !26 + %306 = select i1 %304, float 0.000000e+00, float %305, !dbg !27 + %307 = fmul float %302, %306, !dbg !28 + %308 = fadd float %287, %307, !dbg !29 + %309 = fadd float %292, %298, !dbg !34 + %310 = fmul float %302, %302, !dbg !30 + %311 = fmul float %282, %310, !dbg !31 + %312 = fmul float %306, %311, !dbg !32 + %313 = fadd float %309, %312, !dbg !33 + %314 = bitcast float %308 to i32, !dbg !21 + %315 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %314, i32 1, i32 31), !dbg !21 + %316 = bitcast i32 %315 to float, !dbg !21 + %317 = bitcast float %313 to i32, !dbg !21 + %318 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %317, i32 1, i32 31), !dbg !21 + %319 = bitcast i32 %318 to float, !dbg !21 + %320 = bitcast float %303 to i32, !dbg !21 + %321 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %320, i32 1, i32 31), !dbg !21 + %322 = bitcast i32 %321 to float, !dbg !21 + %323 = fsub float %316, %308, !dbg !18 + %324 = fadd float %303, %322, !dbg !24 + %325 = fcmp oeq float %324, 0.000000e+00, !dbg !25 + %326 = tail call float @llvm.nvvm.div.full(float %322, float %324), !dbg !26 + %327 = select i1 %325, float 0.000000e+00, float %326, !dbg !27 + %328 = fmul float %323, %327, !dbg !28 + %329 = fadd float %308, %328, !dbg !29 + %330 = fadd float %313, %319, !dbg !34 + %331 = fmul float %323, %323, !dbg !30 + %332 = fmul float %303, %331, !dbg !31 + %333 = fmul float %327, %332, !dbg !32 + %334 = fadd float %330, %333, !dbg !33 + %335 = and i32 %10, 15, !dbg !21 + %336 = icmp eq i32 %335, 0, !dbg !21 + %337 = and i1 %244, %336, !dbg !21 + %338 = bitcast float %329 to <1 x i32>, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %245, <1 x i32> %338, i1 %337) #6, !dbg !21 + %339 = bitcast float %334 to <1 x i32>, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %248, <1 x i32> %339, i1 %337) #6, !dbg !21 + %340 = bitcast float %324 to <1 x i32>, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %251, <1 x i32> %340, i1 %337) #6, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %341 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !21 + %342 = load float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !21 + %343 = zext nneg i32 %15 to i64, !dbg !35 + %344 = getelementptr bfloat, ptr addrspace(1) %1, i64 %343, !dbg !35 + %345 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !36 + %346 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %344, i64 %345, i1 true) #6, !dbg !36 + %347 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !37 + %348 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %19, i64 %347, i1 %9) #6, !dbg !37 + %349 = getelementptr bfloat, ptr addrspace(1) %2, i64 %343, !dbg !38 + %350 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !39 + %351 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %349, i64 %350, i1 true) #6, !dbg !39 + %352 = tail call float @llvm.nvvm.div.full(float %342, float 4.096000e+03), !dbg !40 + %353 = fadd float %352, 0x3EB0C6F7A0000000, !dbg !41 + %354 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42 + %355 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42 + %356 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42 + %357 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42 + %358 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42 + %359 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42 + %360 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42 + %361 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42 + %.not.i19 = icmp eq i32 %361, 0, !dbg !42 + br i1 %.not.i19, label %364, label %362, !dbg !42 + +362: ; preds = %__nv_rsqrtf.exit + %363 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %353), !dbg !42 + br label %__nv_rsqrtf.exit21, !dbg !42 + +364: ; preds = %__nv_rsqrtf.exit + %365 = tail call float @llvm.nvvm.rsqrt.approx.f(float %353), !dbg !42 + br label %__nv_rsqrtf.exit21, !dbg !42 + +__nv_rsqrtf.exit21: ; preds = %362, %364 + %.0.i20 = phi float [ %363, %362 ], [ %365, %364 ], !dbg !42 + %366 = extractvalue { i32, i32, i32, i32 } %348, 3, !dbg !37 + %367 = bitcast i32 %366 to <2 x bfloat>, !dbg !37 + %368 = extractvalue { i32, i32, i32, i32 } %348, 2, !dbg !37 + %369 = bitcast i32 %368 to <2 x bfloat>, !dbg !37 + %370 = extractvalue { i32, i32, i32, i32 } %348, 1, !dbg !37 + %371 = bitcast i32 %370 to <2 x bfloat>, !dbg !37 + %372 = extractvalue { i32, i32, i32, i32 } %348, 0, !dbg !37 + %373 = bitcast i32 %372 to <2 x bfloat>, !dbg !37 + %374 = extractvalue { i32, i32, i32, i32 } %346, 3, !dbg !36 + %375 = bitcast i32 %374 to <2 x bfloat>, !dbg !36 + %376 = extractvalue { i32, i32, i32, i32 } %346, 2, !dbg !36 + %377 = bitcast i32 %376 to <2 x bfloat>, !dbg !36 + %378 = extractvalue { i32, i32, i32, i32 } %346, 1, !dbg !36 + %379 = bitcast i32 %378 to <2 x bfloat>, !dbg !36 + %380 = extractvalue { i32, i32, i32, i32 } %346, 0, !dbg !36 + %381 = bitcast i32 %380 to <2 x bfloat>, !dbg !36 + %382 = extractvalue { i32, i32, i32, i32 } %351, 3, !dbg !39 + %383 = bitcast i32 %382 to <2 x bfloat>, !dbg !39 + %384 = extractvalue { i32, i32, i32, i32 } %351, 2, !dbg !39 + %385 = bitcast i32 %384 to <2 x bfloat>, !dbg !39 + %386 = extractvalue { i32, i32, i32, i32 } %351, 1, !dbg !39 + %387 = bitcast i32 %386 to <2 x bfloat>, !dbg !39 + %388 = extractvalue { i32, i32, i32, i32 } %351, 0, !dbg !39 + %389 = bitcast i32 %388 to <2 x bfloat>, !dbg !39 + %390 = getelementptr bfloat, ptr addrspace(1) %3, i64 %18, !dbg !43 + %391 = fpext <2 x bfloat> %373 to <2 x float>, !dbg !44 + %392 = insertelement <2 x float> poison, float %341, i64 0, !dbg !45 + %393 = shufflevector <2 x float> %392, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !45 + %394 = fsub <2 x float> %391, %393, !dbg !45 + %395 = fpext <2 x bfloat> %381 to <2 x float>, !dbg !46 + %396 = fadd <2 x float> %395, splat (float 1.000000e+00), !dbg !47 + %397 = fpext <2 x bfloat> %389 to <2 x float>, !dbg !48 + %398 = insertelement <2 x float> poison, float %.0.i20, i64 0, !dbg !49 + %399 = shufflevector <2 x float> %398, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !49 + %400 = fmul <2 x float> %394, %399, !dbg !49 + %401 = fmul <2 x float> %396, %400, !dbg !50 + %402 = fadd <2 x float> %401, %397, !dbg !51 + %403 = fptrunc <2 x float> %402 to <2 x bfloat>, !dbg !52 + %404 = fpext <2 x bfloat> %371 to <2 x float>, !dbg !44 + %405 = fsub <2 x float> %404, %393, !dbg !45 + %406 = fpext <2 x bfloat> %379 to <2 x float>, !dbg !46 + %407 = fadd <2 x float> %406, splat (float 1.000000e+00), !dbg !47 + %408 = fpext <2 x bfloat> %387 to <2 x float>, !dbg !48 + %409 = fmul <2 x float> %405, %399, !dbg !49 + %410 = fmul <2 x float> %407, %409, !dbg !50 + %411 = fadd <2 x float> %410, %408, !dbg !51 + %412 = fptrunc <2 x float> %411 to <2 x bfloat>, !dbg !52 + %413 = fpext <2 x bfloat> %369 to <2 x float>, !dbg !44 + %414 = fsub <2 x float> %413, %393, !dbg !45 + %415 = fpext <2 x bfloat> %377 to <2 x float>, !dbg !46 + %416 = fadd <2 x float> %415, splat (float 1.000000e+00), !dbg !47 + %417 = fpext <2 x bfloat> %385 to <2 x float>, !dbg !48 + %418 = fmul <2 x float> %414, %399, !dbg !49 + %419 = fmul <2 x float> %416, %418, !dbg !50 + %420 = fadd <2 x float> %419, %417, !dbg !51 + %421 = fptrunc <2 x float> %420 to <2 x bfloat>, !dbg !52 + %422 = fpext <2 x bfloat> %367 to <2 x float>, !dbg !44 + %423 = fsub <2 x float> %422, %393, !dbg !45 + %424 = fpext <2 x bfloat> %375 to <2 x float>, !dbg !46 + %425 = fadd <2 x float> %424, splat (float 1.000000e+00), !dbg !47 + %426 = fpext <2 x bfloat> %383 to <2 x float>, !dbg !48 + %427 = fmul <2 x float> %423, %399, !dbg !49 + %428 = fmul <2 x float> %425, %427, !dbg !50 + %429 = fadd <2 x float> %428, %426, !dbg !51 + %430 = fptrunc <2 x float> %429 to <2 x bfloat>, !dbg !52 + %431 = bitcast <2 x bfloat> %403 to i32, !dbg !52 + %432 = bitcast <2 x bfloat> %412 to i32, !dbg !52 + %433 = bitcast <2 x bfloat> %421 to i32, !dbg !52 + %434 = bitcast <2 x bfloat> %430 to i32, !dbg !52 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %431, i32 %432, i32 %433, i32 %434, ptr addrspace(1) %390, i1 %9) #6, !dbg !52 + ret void, !dbg !53 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused_add_mul_native_layer_norm_0", linkageName: "triton_red_fused_add_mul_native_layer_norm_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 25, column: 21, scope: !5) +!10 = !DILocation(line: 26, column: 37, scope: !5) +!11 = !DILocation(line: 38, column: 46, scope: !5) +!12 = !DILocation(line: 38, column: 41, scope: !5) +!13 = !DILocation(line: 38, column: 34, scope: !5) +!14 = !DILocation(line: 38, column: 51, scope: !5) +!15 = !DILocation(line: 38, column: 112, scope: !5) +!16 = !DILocation(line: 44, column: 62, scope: !5) +!17 = !DILocation(line: 46, column: 66, scope: !5) +!18 = !DILocation(line: 231, column: 21, scope: !19, inlinedAt: !21) +!19 = distinct !DILexicalBlockFile(scope: !5, file: !20, discriminator: 0) +!20 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime") +!21 = !DILocation(line: 243, column: 46, scope: !19, inlinedAt: !22) +!22 = !DILocation(line: 47, column: 79, scope: !23) +!23 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!24 = !DILocation(line: 232, column: 28, scope: !19, inlinedAt: !21) +!25 = !DILocation(line: 233, column: 39, scope: !19, inlinedAt: !21) +!26 = !DILocation(line: 233, column: 60, scope: !19, inlinedAt: !21) +!27 = !DILocation(line: 233, column: 49, scope: !19, inlinedAt: !21) +!28 = !DILocation(line: 235, column: 25, scope: !19, inlinedAt: !21) +!29 = !DILocation(line: 235, column: 17, scope: !19, inlinedAt: !21) +!30 = !DILocation(line: 236, column: 30, scope: !19, inlinedAt: !21) +!31 = !DILocation(line: 236, column: 38, scope: !19, inlinedAt: !21) +!32 = !DILocation(line: 236, column: 49, scope: !19, inlinedAt: !21) +!33 = !DILocation(line: 236, column: 22, scope: !19, inlinedAt: !21) +!34 = !DILocation(line: 236, column: 15, scope: !19, inlinedAt: !21) +!35 = !DILocation(line: 57, column: 34, scope: !5) +!36 = !DILocation(line: 57, column: 41, scope: !5) +!37 = !DILocation(line: 58, column: 52, scope: !5) +!38 = !DILocation(line: 59, column: 35, scope: !5) +!39 = !DILocation(line: 59, column: 42, scope: !5) +!40 = !DILocation(line: 65, column: 24, scope: !5) +!41 = !DILocation(line: 67, column: 24, scope: !5) +!42 = !DILocation(line: 68, column: 32, scope: !5) +!43 = !DILocation(line: 73, column: 29, scope: !5) +!44 = !DILocation(line: 58, column: 114, scope: !5) +!45 = !DILocation(line: 63, column: 24, scope: !5) +!46 = !DILocation(line: 57, column: 94, scope: !5) +!47 = !DILocation(line: 61, column: 23, scope: !5) +!48 = !DILocation(line: 59, column: 95, scope: !5) +!49 = !DILocation(line: 69, column: 24, scope: !5) +!50 = !DILocation(line: 71, column: 24, scope: !5) +!51 = !DILocation(line: 72, column: 24, scope: !5) +!52 = !DILocation(line: 73, column: 53, scope: !5) +!53 = !DILocation(line: 51, column: 4, scope: !5) diff --git a/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.ptx b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..3b88e55748c4e5cea7c33f4ae223a07f59f126e9 --- /dev/null +++ b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.ptx @@ -0,0 +1,1032 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused_add_mul_native_layer_norm_0 // -- Begin function triton_red_fused_add_mul_native_layer_norm_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused_add_mul_native_layer_norm_0 +.visible .entry triton_red_fused_add_mul_native_layer_norm_0( + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_3, + .param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_4, + .param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_7 +) +.reqntid 512 +{ + .reg .pred %p<23>; + .reg .b16 %rs<33>; + .reg .b32 %r<287>; + .reg .b64 %rd<15>; + .loc 1 18 0 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:18:0 + +// %bb.0: // %__nv_rsqrtf.exit + ld.param.b64 %rd9, [triton_red_fused_add_mul_native_layer_norm_0_param_0]; + ld.param.b64 %rd10, [triton_red_fused_add_mul_native_layer_norm_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:23:28 + mov.u32 %r37, %ctaid.x; + .loc 1 25 21 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:25:21 + setp.lt.u32 %p1, %r37, 2304; + ld.param.b64 %rd11, [triton_red_fused_add_mul_native_layer_norm_0_param_2]; + ld.param.b64 %rd12, [triton_red_fused_add_mul_native_layer_norm_0_param_3]; + .loc 1 26 37 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:26:37 + mov.u32 %r38, %tid.x; + and.b32 %r39, %r38, 511; + and.b32 %r40, %r38, 31; + shl.b32 %r41, %r38, 3; + and.b32 %r42, %r41, 4088; + .loc 1 38 46 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:46 + shl.b32 %r43, %r37, 12; + .loc 1 38 41 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:41 + or.b32 %r44, %r42, %r43; + .loc 1 38 34 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:34 + mul.wide.s32 %rd13, %r44, 2; + add.s64 %rd1, %rd9, %rd13; + .loc 1 38 51 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:51 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r5, 0; + // begin inline asm + mov.u32 %r1, %r5; + mov.u32 %r2, %r5; + mov.u32 %r3, %r5; + mov.u32 %r4, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2; + // end inline asm + mov.b32 {%rs1, %rs2}, %r1; + mov.b32 {%rs3, %rs4}, %r2; + mov.b32 {%rs5, %rs6}, %r3; + mov.b32 {%rs7, %rs8}, %r4; + .loc 1 38 112 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:112 + cvt.f32.bf16 %r45, %rs1; + cvt.f32.bf16 %r46, %rs2; + cvt.f32.bf16 %r47, %rs3; + cvt.f32.bf16 %r48, %rs4; + cvt.f32.bf16 %r49, %rs5; + cvt.f32.bf16 %r50, %rs6; + cvt.f32.bf16 %r51, %rs7; + cvt.f32.bf16 %r52, %rs8; + .loc 1 44 62 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:44:62 + selp.f32 %r53, %r45, 0f00000000, %p1; + selp.f32 %r54, %r46, 0f00000000, %p1; + selp.f32 %r55, %r47, 0f00000000, %p1; + selp.f32 %r56, %r48, 0f00000000, %p1; + selp.f32 %r57, %r49, 0f00000000, %p1; + selp.f32 %r58, %r50, 0f00000000, %p1; + selp.f32 %r59, %r51, 0f00000000, %p1; + selp.f32 %r60, %r52, 0f00000000, %p1; + .loc 1 46 66 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:46:66 + selp.f32 %r61, 0f3F800000, 0f00000000, %p1; +$L__tmp1: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r62, %r54, %r53; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r63, 0f40000000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p6, %r63, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r64, %r61, %r63; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r65, 0f00000000, %r64, %p6; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r66, %r65, %r62, %r53; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r67, %r62, %r62; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r68, %r61, %r67; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r69, %r65, %r68, 0f00000000; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r70, %r55, %r66; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r71, 0f40400000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p7, %r71, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r72, %r61, %r71; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r73, 0f00000000, %r72, %p7; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r74, %r73, %r70, %r66; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r75, %r70, %r70; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r76, %r63, %r75; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r77, %r73, %r76, %r69; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r78, %r56, %r74; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r79, 0f40800000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p8, %r79, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r80, %r61, %r79; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r81, 0f00000000, %r80, %p8; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r82, %r81, %r78, %r74; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r83, %r78, %r78; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r84, %r71, %r83; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r85, %r81, %r84, %r77; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r86, %r57, %r82; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r87, 0f40A00000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p9, %r87, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r88, %r61, %r87; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r89, 0f00000000, %r88, %p9; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r90, %r89, %r86, %r82; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r91, %r86, %r86; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r92, %r79, %r91; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r93, %r89, %r92, %r85; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r94, %r58, %r90; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r95, 0f40C00000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p10, %r95, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r96, %r61, %r95; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r97, 0f00000000, %r96, %p10; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r98, %r97, %r94, %r90; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r99, %r94, %r94; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r100, %r87, %r99; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r101, %r97, %r100, %r93; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r102, %r59, %r98; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r103, 0f40E00000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p11, %r103, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r104, %r61, %r103; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r105, 0f00000000, %r104, %p11; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r106, %r105, %r102, %r98; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r107, %r102, %r102; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r108, %r95, %r107; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r109, %r105, %r108, %r101; + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r110, %r60, %r106; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r111, 0f41000000, 0f00000000, %p1; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p12, %r111, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r112, %r61, %r111; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r113, 0f00000000, %r112, %p12; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r114, %r113, %r110, %r106; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r115, %r110, %r110; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r116, %r103, %r115; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r117, %r113, %r116, %r109; +$L__tmp2: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] + shfl.sync.bfly.b32 %r118, %r114, 16, 31, -1; + shfl.sync.bfly.b32 %r119, %r117, 16, 31, -1; + shfl.sync.bfly.b32 %r120, %r111, 16, 31, -1; +$L__tmp3: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r121, %r118, %r114; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r122, %r111, %r120; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p13, %r122, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r123, %r120, %r122; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r124, 0f00000000, %r123, %p13; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r125, %r124, %r121, %r114; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r126, %r117, %r119; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r127, %r121, %r121; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r128, %r111, %r127; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r129, %r124, %r128, %r126; +$L__tmp4: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] + shfl.sync.bfly.b32 %r130, %r125, 8, 31, -1; + shfl.sync.bfly.b32 %r131, %r129, 8, 31, -1; + shfl.sync.bfly.b32 %r132, %r122, 8, 31, -1; +$L__tmp5: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r133, %r130, %r125; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r134, %r122, %r132; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p14, %r134, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r135, %r132, %r134; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r136, 0f00000000, %r135, %p14; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r137, %r136, %r133, %r125; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r138, %r129, %r131; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r139, %r133, %r133; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r140, %r122, %r139; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r141, %r136, %r140, %r138; +$L__tmp6: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] + shfl.sync.bfly.b32 %r142, %r137, 4, 31, -1; + shfl.sync.bfly.b32 %r143, %r141, 4, 31, -1; + shfl.sync.bfly.b32 %r144, %r134, 4, 31, -1; +$L__tmp7: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r145, %r142, %r137; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r146, %r134, %r144; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p15, %r146, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r147, %r144, %r146; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r148, 0f00000000, %r147, %p15; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r149, %r148, %r145, %r137; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r150, %r141, %r143; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r151, %r145, %r145; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r152, %r134, %r151; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r153, %r148, %r152, %r150; +$L__tmp8: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] + shfl.sync.bfly.b32 %r154, %r149, 2, 31, -1; + shfl.sync.bfly.b32 %r155, %r153, 2, 31, -1; + shfl.sync.bfly.b32 %r156, %r146, 2, 31, -1; +$L__tmp9: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r157, %r154, %r149; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r158, %r146, %r156; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p16, %r158, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r159, %r156, %r158; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r160, 0f00000000, %r159, %p16; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r161, %r160, %r157, %r149; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r162, %r153, %r155; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r163, %r157, %r157; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r164, %r146, %r163; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r165, %r160, %r164, %r162; +$L__tmp10: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] + shfl.sync.bfly.b32 %r166, %r161, 1, 31, -1; + shfl.sync.bfly.b32 %r167, %r165, 1, 31, -1; + shfl.sync.bfly.b32 %r168, %r158, 1, 31, -1; +$L__tmp11: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r169, %r166, %r161; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r11, %r158, %r168; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p17, %r11, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r170, %r168, %r11; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r171, 0f00000000, %r170, %p17; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r7, %r171, %r169, %r161; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r172, %r165, %r167; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r173, %r169, %r169; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r174, %r158, %r173; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r9, %r171, %r174, %r172; +$L__tmp12: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] + setp.eq.b32 %p2, %r40, 0; + shr.u32 %r175, %r38, 3; + and.b32 %r176, %r175, 60; + mov.b32 %r177, global_smem; + add.s32 %r6, %r177, %r176; + // begin inline asm + @%p2 st.shared.b32 [ %r6 + 0 ], %r7; + // end inline asm + add.s32 %r8, %r6, 64; + // begin inline asm + @%p2 st.shared.b32 [ %r8 + 0 ], %r9; + // end inline asm + add.s32 %r10, %r6, 128; + // begin inline asm + @%p2 st.shared.b32 [ %r10 + 0 ], %r11; + // end inline asm + bar.sync 0; + setp.lt.u32 %p3, %r39, 16; + shl.b32 %r178, %r39, 2; + add.s32 %r13, %r177, %r178; + // begin inline asm + @%p3 ld.shared.b32 %r12, [ %r13 + 0 ]; + // end inline asm + add.s32 %r15, %r13, 64; + // begin inline asm + @%p3 ld.shared.b32 %r14, [ %r15 + 0 ]; + // end inline asm + add.s32 %r17, %r13, 128; + // begin inline asm + @%p3 ld.shared.b32 %r16, [ %r17 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r179, %r12, 8, 31, -1; + shfl.sync.bfly.b32 %r180, %r14, 8, 31, -1; + shfl.sync.bfly.b32 %r181, %r16, 8, 31, -1; +$L__tmp13: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r182, %r179, %r12; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r183, %r16, %r181; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p18, %r183, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r184, %r181, %r183; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r185, 0f00000000, %r184, %p18; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r186, %r182, %r185, %r12; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r187, %r14, %r180; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r188, %r182, %r182; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r189, %r188, %r16; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r190, %r189, %r185, %r187; +$L__tmp14: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] + shfl.sync.bfly.b32 %r191, %r186, 4, 31, -1; + shfl.sync.bfly.b32 %r192, %r190, 4, 31, -1; + shfl.sync.bfly.b32 %r193, %r183, 4, 31, -1; +$L__tmp15: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r194, %r191, %r186; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r195, %r183, %r193; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p19, %r195, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r196, %r193, %r195; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r197, 0f00000000, %r196, %p19; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r198, %r194, %r197, %r186; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r199, %r190, %r192; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r200, %r194, %r194; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r201, %r183, %r200; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r202, %r197, %r201, %r199; +$L__tmp16: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] + shfl.sync.bfly.b32 %r203, %r198, 2, 31, -1; + shfl.sync.bfly.b32 %r204, %r202, 2, 31, -1; + shfl.sync.bfly.b32 %r205, %r195, 2, 31, -1; +$L__tmp17: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r206, %r203, %r198; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r207, %r195, %r205; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p20, %r207, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r208, %r205, %r207; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r209, 0f00000000, %r208, %p20; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r210, %r206, %r209, %r198; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r211, %r202, %r204; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r212, %r206, %r206; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r213, %r195, %r212; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r214, %r209, %r213, %r211; +$L__tmp18: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] + shfl.sync.bfly.b32 %r215, %r210, 1, 31, -1; + shfl.sync.bfly.b32 %r216, %r214, 1, 31, -1; + shfl.sync.bfly.b32 %r217, %r207, 1, 31, -1; +$L__tmp19: + .loc 2 231 21 // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + sub.f32 %r218, %r215, %r210; + .loc 2 232 28 // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r20, %r207, %r217; + .loc 2 233 39 // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + setp.eq.f32 %p21, %r20, 0f00000000; + .loc 2 233 60 // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + div.full.f32 %r219, %r217, %r20; + .loc 2 233 49 // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + selp.f32 %r220, 0f00000000, %r219, %p21; + .loc 2 235 17 // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r18, %r218, %r220, %r210; + .loc 2 236 15 // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + add.f32 %r221, %r214, %r216; + .loc 2 236 30 // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r222, %r218, %r218; + .loc 2 236 38 // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + mul.f32 %r223, %r207, %r222; + .loc 2 236 22 // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ] + fma.rn.f32 %r19, %r220, %r223, %r221; +$L__tmp20: + .loc 2 243 46 // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] + and.b32 %r224, %r38, 15; + setp.eq.b32 %p22, %r224, 0; + and.pred %p4, %p3, %p22; + // begin inline asm + @%p4 st.shared.b32 [ %r13 + 0 ], %r18; + // end inline asm + // begin inline asm + @%p4 st.shared.b32 [ %r15 + 0 ], %r19; + // end inline asm + // begin inline asm + @%p4 st.shared.b32 [ %r17 + 0 ], %r20; + // end inline asm + bar.sync 0; + ld.shared.b32 %r225, [global_smem]; + ld.shared.b32 %r226, [global_smem+64]; +$L__tmp21: + .loc 1 57 34 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:34 + mul.wide.u32 %rd14, %r42, 2; + add.s64 %rd3, %rd10, %rd14; + .loc 1 57 41 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:41 + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0; + // end inline asm + mov.pred %p5, -1; + // begin inline asm + mov.u32 %r21, %r5; + mov.u32 %r22, %r5; + mov.u32 %r23, %r5; + mov.u32 %r24, %r5; + @%p5 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r21, %r22, %r23, %r24 }, [ %rd3 + 0 ], %rd4; + // end inline asm + .loc 1 58 52 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:52 + // begin inline asm + mov.u64 %rd5, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd5, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r25, %r5; + mov.u32 %r26, %r5; + mov.u32 %r27, %r5; + mov.u32 %r28, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r25, %r26, %r27, %r28 }, [ %rd1 + 0 ], %rd5; + // end inline asm + .loc 1 59 35 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:35 + add.s64 %rd6, %rd11, %rd14; + .loc 1 59 42 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:42 + // begin inline asm + mov.u64 %rd7, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd7, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r29, %r5; + mov.u32 %r30, %r5; + mov.u32 %r31, %r5; + mov.u32 %r32, %r5; + @%p5 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r29, %r30, %r31, %r32 }, [ %rd6 + 0 ], %rd7; + // end inline asm + mov.b32 %r227, 0f45800000; + .loc 1 65 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:65:24 + div.full.f32 %r228, %r226, %r227; + .loc 1 67 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:67:24 + add.f32 %r229, %r228, 0f358637BD; + .loc 1 68 32 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:68:32 + rsqrt.approx.ftz.f32 %r230, %r229; + .loc 1 73 29 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:73:29 + add.s64 %rd8, %rd12, %rd13; + .loc 1 58 114 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:114 + mov.b32 {%rs9, %rs10}, %r25; + cvt.f32.bf16 %r231, %rs10; + cvt.f32.bf16 %r232, %rs9; + .loc 1 63 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:63:24 + sub.f32 %r233, %r232, %r225; + sub.f32 %r234, %r231, %r225; + .loc 1 57 94 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:94 + mov.b32 {%rs11, %rs12}, %r21; + cvt.f32.bf16 %r235, %rs11; + cvt.f32.bf16 %r236, %rs12; + .loc 1 61 23 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:61:23 + add.f32 %r237, %r236, 0f3F800000; + add.f32 %r238, %r235, 0f3F800000; + .loc 1 59 95 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:95 + mov.b32 {%rs13, %rs14}, %r29; + cvt.f32.bf16 %r239, %rs14; + cvt.f32.bf16 %r240, %rs13; + .loc 1 69 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:69:24 + mul.f32 %r241, %r234, %r230; + mul.f32 %r242, %r233, %r230; + .loc 1 72 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:72:24 + fma.rn.f32 %r243, %r238, %r242, %r240; + fma.rn.f32 %r244, %r237, %r241, %r239; + .loc 1 73 53 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:73:53 + cvt.rn.bf16x2.f32 %r33, %r244, %r243; + .loc 1 58 114 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:114 + mov.b32 {%rs15, %rs16}, %r26; + cvt.f32.bf16 %r245, %rs16; + cvt.f32.bf16 %r246, %rs15; + .loc 1 63 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:63:24 + sub.f32 %r247, %r246, %r225; + sub.f32 %r248, %r245, %r225; + .loc 1 57 94 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:94 + mov.b32 {%rs17, %rs18}, %r22; + cvt.f32.bf16 %r249, %rs17; + cvt.f32.bf16 %r250, %rs18; + .loc 1 61 23 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:61:23 + add.f32 %r251, %r250, 0f3F800000; + add.f32 %r252, %r249, 0f3F800000; + .loc 1 59 95 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:95 + mov.b32 {%rs19, %rs20}, %r30; + cvt.f32.bf16 %r253, %rs20; + cvt.f32.bf16 %r254, %rs19; + .loc 1 69 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:69:24 + mul.f32 %r255, %r248, %r230; + mul.f32 %r256, %r247, %r230; + .loc 1 72 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:72:24 + fma.rn.f32 %r257, %r252, %r256, %r254; + fma.rn.f32 %r258, %r251, %r255, %r253; + .loc 1 73 53 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:73:53 + cvt.rn.bf16x2.f32 %r34, %r258, %r257; + .loc 1 58 114 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:114 + mov.b32 {%rs21, %rs22}, %r27; + cvt.f32.bf16 %r259, %rs22; + cvt.f32.bf16 %r260, %rs21; + .loc 1 63 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:63:24 + sub.f32 %r261, %r260, %r225; + sub.f32 %r262, %r259, %r225; + .loc 1 57 94 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:94 + mov.b32 {%rs23, %rs24}, %r23; + cvt.f32.bf16 %r263, %rs23; + cvt.f32.bf16 %r264, %rs24; + .loc 1 61 23 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:61:23 + add.f32 %r265, %r264, 0f3F800000; + add.f32 %r266, %r263, 0f3F800000; + .loc 1 59 95 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:95 + mov.b32 {%rs25, %rs26}, %r31; + cvt.f32.bf16 %r267, %rs26; + cvt.f32.bf16 %r268, %rs25; + .loc 1 69 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:69:24 + mul.f32 %r269, %r262, %r230; + mul.f32 %r270, %r261, %r230; + .loc 1 72 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:72:24 + fma.rn.f32 %r271, %r266, %r270, %r268; + fma.rn.f32 %r272, %r265, %r269, %r267; + .loc 1 73 53 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:73:53 + cvt.rn.bf16x2.f32 %r35, %r272, %r271; + .loc 1 58 114 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:114 + mov.b32 {%rs27, %rs28}, %r28; + cvt.f32.bf16 %r273, %rs28; + cvt.f32.bf16 %r274, %rs27; + .loc 1 63 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:63:24 + sub.f32 %r275, %r274, %r225; + sub.f32 %r276, %r273, %r225; + .loc 1 57 94 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:94 + mov.b32 {%rs29, %rs30}, %r24; + cvt.f32.bf16 %r277, %rs29; + cvt.f32.bf16 %r278, %rs30; + .loc 1 61 23 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:61:23 + add.f32 %r279, %r278, 0f3F800000; + add.f32 %r280, %r277, 0f3F800000; + .loc 1 59 95 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:95 + mov.b32 {%rs31, %rs32}, %r32; + cvt.f32.bf16 %r281, %rs32; + cvt.f32.bf16 %r282, %rs31; + .loc 1 69 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:69:24 + mul.f32 %r283, %r276, %r230; + mul.f32 %r284, %r275, %r230; + .loc 1 72 24 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:72:24 + fma.rn.f32 %r285, %r280, %r284, %r282; + fma.rn.f32 %r286, %r279, %r283, %r281; + .loc 1 73 53 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:73:53 + cvt.rn.bf16x2.f32 %r36, %r286, %r285; + // begin inline asm + @%p1 st.global.v4.b32 [ %rd8 + 0 ], { %r33, %r34, %r35, %r36 }; + // end inline asm + .loc 1 51 4 // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:51:4 + ret; +$L__tmp22: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 343 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x150 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 112 +.b8 103 +.b8 101 +.b8 115 +.b8 122 +.b8 104 +.b8 52 +.b8 110 +.b8 112 +.b8 121 +.b8 110 +.b8 121 +.b8 55 +.b8 117 +.b8 50 +.b8 113 +.b8 120 +.b8 108 +.b8 107 +.b8 116 +.b8 112 +.b8 118 +.b8 50 +.b8 121 +.b8 50 +.b8 120 +.b8 100 +.b8 103 +.b8 103 +.b8 122 +.b8 121 +.b8 108 +.b8 53 +.b8 111 +.b8 112 +.b8 111 +.b8 121 +.b8 51 +.b8 111 +.b8 114 +.b8 117 +.b8 113 +.b8 115 +.b8 113 +.b8 101 +.b8 116 +.b8 52 +.b8 112 +.b8 53 +.b8 101 +.b8 107 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 112 +.b8 103 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x2f DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 100 +.b8 100 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 110 +.b8 97 +.b8 116 +.b8 105 +.b8 118 +.b8 101 +.b8 95 +.b8 108 +.b8 97 +.b8 121 +.b8 101 +.b8 114 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x113:0x47 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x128:0x31 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp21 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 47 // DW_AT_call_line +.b8 79 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x140:0x18 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp20 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 243 // DW_AT_call_line +.b8 46 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.source b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.source new file mode 100644 index 0000000000000000000000000000000000000000..5dace2e3fc4776bcfa7ab8e79ab933d32ddae36f --- /dev/null +++ b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.source @@ -0,0 +1,420 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":18:0) +#loc72 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":216:0) +#loc85 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":133:0) +#loc89 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":242:0) +#loc91 = loc(unknown) +#loc94 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":230:0) +#loc109 = loc("in_ptr0"(#loc)) +#loc110 = loc("in_ptr1"(#loc)) +#loc111 = loc("in_ptr2"(#loc)) +#loc112 = loc("out_ptr2"(#loc)) +#loc113 = loc("xnumel"(#loc)) +#loc114 = loc("r0_numel"(#loc)) +#loc171 = loc("value"(#loc72)) +#loc172 = loc("mean"(#loc72)) +#loc173 = loc("m2"(#loc72)) +#loc174 = loc("weight"(#loc72)) +#loc175 = loc("first_iteration"(#loc72)) +#loc185 = loc("input"(#loc85)) +#loc186 = loc("mean"(#loc89)) +#loc187 = loc("m2"(#loc89)) +#loc188 = loc("weight"(#loc89)) +#loc189 = loc("mean_1"(#loc94)) +#loc190 = loc("m2_1"(#loc94)) +#loc191 = loc("weight_1"(#loc94)) +#loc192 = loc("mean_2"(#loc94)) +#loc193 = loc("m2_2"(#loc94)) +#loc194 = loc("weight_2"(#loc94)) +#loc201 = loc("new_mean"(#loc171)) +module { + tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 2304 : i32 loc(#loc115) + %r0_numel_1 = arith.constant 4096 : i32 loc(#loc116) + %xoffset = tt.get_program_id x : i32 loc(#loc117) + %xoffset_2 = arith.constant 1 : i32 loc(#loc118) + %xoffset_3 = arith.constant 1 : i32 loc(#loc118) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc118) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc119) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc120) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc121) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc121) + %xmask = arith.constant dense<2304> : tensor<1x1xi32> loc(#loc122) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc122) + %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32> loc(#loc123) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32> -> tensor<1x4096xi32> loc(#loc124) + %tmp3_mean = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc125) + %tmp3_m2 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc126) + %tmp3_weight = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc127) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c4096_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %tmp3_weight_10:3 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%tmp3_mean_13 = %tmp3_mean, %tmp3_m2_14 = %tmp3_m2, %tmp3_weight_15 = %tmp3_weight) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4096xi32> loc(#loc129) + %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x4096xi32> loc(#loc129) + %r0_mask = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc130) + %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x4096xi32> loc(#loc130) + %tmp0 = arith.constant 4096 : i32 loc(#loc131) + %tmp0_18 = arith.constant 4096 : i32 loc(#loc131) + %tmp0_19 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc131) + %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc131) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc132) + %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x4096xi32> loc(#loc132) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc133) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc133) + %tmp0_25 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc134) + %tmp0_26 = arith.andi %r0_mask_17, %tmp0_25 : tensor<1x4096xi1> loc(#loc134) + %tmp0_27 = arith.constant 0.000000e+00 : f32 loc(#loc135) + %tmp0_28 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc135) + %tmp0_29 = arith.truncf %tmp0_28 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc135) + %tmp0_30 = tt.load %tmp0_24, %tmp0_26, %tmp0_29 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc135) + %tmp0_31 = arith.extf %tmp0_30 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc136) + %c0_i32_32 = arith.constant 0 : i32 loc(#loc23) + %9 = arith.cmpi eq, %r0_offset, %c0_i32_32 : i32 loc(#loc23) + %10:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_u1__(%tmp0_31, %tmp3_mean_13, %tmp3_m2_14, %tmp3_weight_15, %9) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>, i1) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) loc(#loc24) + %tmp3_mean_33 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc137) + %tmp3_mean_34 = arith.andi %r0_mask_17, %tmp3_mean_33 : tensor<1x4096xi1> loc(#loc137) + %tmp3_mean_35 = arith.select %tmp3_mean_34, %10#0, %tmp3_mean_13 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc138) + %tmp3_m2_36 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc139) + %tmp3_m2_37 = arith.andi %r0_mask_17, %tmp3_m2_36 : tensor<1x4096xi1> loc(#loc139) + %tmp3_m2_38 = arith.select %tmp3_m2_37, %10#1, %tmp3_m2_14 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc140) + %tmp3_weight_39 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc141) + %tmp3_weight_40 = arith.andi %r0_mask_17, %tmp3_weight_39 : tensor<1x4096xi1> loc(#loc141) + %tmp3_weight_41 = arith.select %tmp3_weight_40, %10#2, %tmp3_weight_15 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc142) + scf.yield %tmp3_mean_35, %tmp3_m2_38, %tmp3_weight_41 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc31) + } loc(#loc207) + %4:3 = tt.call @"torch._inductor.runtime.triton_helpers.welford__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S__(3,)cconstexpr_1_"(%tmp3_weight_10#0, %tmp3_weight_10#1, %tmp3_weight_10#2) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc32) + %tmp3 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc143) + %tmp7 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc144) + %tmp8 = tt.expand_dims %4#2 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc145) + %c0_i32_11 = arith.constant 0 : i32 loc(#loc36) + %c4096_i32_12 = arith.constant 4096 : i32 loc(#loc36) + %5 = arith.bitcast %c0_i32_11 : i32 to i32 loc(#loc36) + %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc36) + %7 = arith.bitcast %c4096_i32_12 : i32 to i32 loc(#loc36) + %8 = ub.poison : i32 loc(#loc36) + scf.for %r0_offset = %5 to %6 step %7 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4096xi32> loc(#loc146) + %r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x4096xi32> loc(#loc146) + %r0_mask = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc147) + %r0_mask_14 = arith.cmpi slt, %r0_index_13, %r0_mask : tensor<1x4096xi32> loc(#loc147) + %tmp9 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc148) + %tmp9_15 = tt.addptr %tmp9, %r0_index_13 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc148) + %tmp9_16 = arith.constant 0.000000e+00 : f32 loc(#loc149) + %tmp9_17 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc149) + %tmp9_18 = arith.truncf %tmp9_17 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc149) + %tmp9_19 = tt.load %tmp9_15, %r0_mask_14, %tmp9_18 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc149) + %tmp9_20 = arith.extf %tmp9_19 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc150) + %tmp12 = arith.constant 4096 : i32 loc(#loc151) + %tmp12_21 = arith.constant 4096 : i32 loc(#loc151) + %tmp12_22 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc151) + %tmp12_23 = arith.muli %tmp12_22, %xindex_7 : tensor<1x1xi32> loc(#loc151) + %tmp12_24 = tt.broadcast %tmp12_23 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc152) + %tmp12_25 = arith.addi %r0_index_13, %tmp12_24 : tensor<1x4096xi32> loc(#loc152) + %tmp12_26 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc153) + %tmp12_27 = tt.addptr %tmp12_26, %tmp12_25 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc153) + %tmp12_28 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc154) + %tmp12_29 = arith.andi %r0_mask_14, %tmp12_28 : tensor<1x4096xi1> loc(#loc154) + %tmp12_30 = arith.constant 0.000000e+00 : f32 loc(#loc155) + %tmp12_31 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc155) + %tmp12_32 = arith.truncf %tmp12_31 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc155) + %tmp12_33 = tt.load %tmp12_27, %tmp12_29, %tmp12_32 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr> loc(#loc155) + %tmp12_34 = arith.extf %tmp12_33 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc156) + %tmp23 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc157) + %tmp23_35 = tt.addptr %tmp23, %r0_index_13 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc157) + %tmp23_36 = arith.constant 0.000000e+00 : f32 loc(#loc158) + %tmp23_37 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc158) + %tmp23_38 = arith.truncf %tmp23_37 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc158) + %tmp23_39 = tt.load %tmp23_35, %r0_mask_14, %tmp23_38 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc158) + %tmp23_40 = arith.extf %tmp23_39 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc159) + %tmp10 = arith.constant 1.000000e+00 : f32 loc(#loc160) + %tmp11 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc161) + %tmp11_41 = arith.addf %tmp9_20, %tmp11 : tensor<1x4096xf32> loc(#loc161) + %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc162) + %tmp14_42 = arith.subf %tmp12_34, %tmp14 : tensor<1x4096xf32> loc(#loc162) + %tmp15 = arith.constant 4.096000e+03 : f32 loc(#loc163) + %tmp16 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc164) + %tmp16_43 = arith.divf %tmp7, %tmp16 : tensor<1x1xf32> loc(#loc164) + %tmp17 = arith.constant 9.99999997E-7 : f32 loc(#loc165) + %tmp18 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc166) + %tmp18_44 = arith.addf %tmp16_43, %tmp18 : tensor<1x1xf32> loc(#loc166) + %tmp19 = tt.extern_elementwise %tmp18_44 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc167) + %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc168) + %tmp20_45 = arith.mulf %tmp14_42, %tmp20 : tensor<1x4096xf32> loc(#loc168) + %tmp22 = arith.mulf %tmp11_41, %tmp20_45 : tensor<1x4096xf32> loc(#loc169) + %tmp24 = arith.addf %tmp22, %tmp23_40 : tensor<1x4096xf32> loc(#loc170) + %c4096_i32_46 = arith.constant 4096 : i32 loc(#loc62) + %c4096_i32_47 = arith.constant 4096 : i32 loc(#loc62) + %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc62) + %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc62) + %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc63) + %11 = arith.addi %r0_index_13, %10 : tensor<1x4096xi32> loc(#loc63) + %12 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc64) + %13 = tt.addptr %12, %11 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc64) + %14 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc65) + %15 = arith.andi %r0_mask_14, %14 : tensor<1x4096xi1> loc(#loc65) + %16 = arith.truncf %tmp24 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc66) + tt.store %13, %16, %15 : tensor<1x4096x!tt.ptr> loc(#loc66) + } loc(#loc36) + tt.return loc(#loc67) + } loc(#loc) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() -> tensor<1x4096xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc69) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc69) + tt.return %cst_0 : tensor<1x4096xf32> loc(#loc70) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x4096xf32> loc(#loc71) + tt.return %0 : tensor<1x4096xf32> loc(#loc71) + } loc(#loc68) + tt.func private @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_u1__(%new_mean: tensor<1x4096xf32> loc("new_mean"(#loc171)), %mean: tensor<1x4096xf32> loc("mean"(#loc72)), %m2: tensor<1x4096xf32> loc("m2"(#loc72)), %weight: tensor<1x4096xf32> loc("weight"(#loc72)), %first_iteration: i1 loc("first_iteration"(#loc72))) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) attributes {noinline = false} { + %0:3 = scf.if %first_iteration -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) { + %new_weight = arith.constant 1.000000e+00 : f32 loc(#loc176) + %new_weight_0 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc202) + %new_m2 = tt.call @triton.language.standard.zeros_like__fp32S1_4096S__(%m2) : (tensor<1x4096xf32>) -> tensor<1x4096xf32> loc(#loc203) + scf.yield %new_m2, %new_mean, %new_weight_0 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc203) + } else { + %delta = arith.subf %new_mean, %mean : tensor<1x4096xf32> loc(#loc178) + %new_weight = arith.constant 1 : i32 loc(#loc179) + %new_weight_0 = arith.constant 1.000000e+00 : f32 loc(#loc179) + %new_weight_1 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc179) + %new_weight_2 = arith.addf %weight, %new_weight_1 : tensor<1x4096xf32> loc(#loc204) + %new_mean_3 = arith.divf %delta, %new_weight_2 : tensor<1x4096xf32> loc(#loc180) + %new_mean_4 = arith.addf %mean, %new_mean_3 : tensor<1x4096xf32> loc(#loc205) + %new_m2 = arith.subf %new_mean, %new_mean_4 : tensor<1x4096xf32> loc(#loc182) + %new_m2_5 = arith.mulf %delta, %new_m2 : tensor<1x4096xf32> loc(#loc183) + %new_m2_6 = arith.addf %m2, %new_m2_5 : tensor<1x4096xf32> loc(#loc206) + scf.yield %new_m2_6, %new_mean_4, %new_weight_2 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc184) + } loc(#loc73) + tt.return %0#1, %0#0, %0#2 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc83) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x4096xf32> loc(#loc84) + %2 = ub.poison : tensor<1x4096xf32> loc(#loc84) + %3 = ub.poison : tensor<1x4096xf32> loc(#loc84) + tt.return %1, %2, %3 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc84) + } loc(#loc72) + tt.func private @triton.language.standard.zeros_like__fp32S1_4096S__(%input: tensor<1x4096xf32> loc("input"(#loc85))) -> tensor<1x4096xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc86) + tt.return %0 : tensor<1x4096xf32> loc(#loc87) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x4096xf32> loc(#loc88) + tt.return %1 : tensor<1x4096xf32> loc(#loc88) + } loc(#loc85) + tt.func private @"torch._inductor.runtime.triton_helpers.welford__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S__(3,)cconstexpr_1_"(%mean: tensor<1x4096xf32> loc("mean"(#loc89)), %m2: tensor<1x4096xf32> loc("m2"(#loc89)), %weight: tensor<1x4096xf32> loc("weight"(#loc89))) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} { + %0:3 = "tt.reduce"(%mean, %m2, %weight) <{axis = 1 : i32}> ({ + ^bb0(%arg3: f32 loc(unknown), %arg4: f32 loc(unknown), %arg5: f32 loc(unknown), %arg6: f32 loc(unknown), %arg7: f32 loc(unknown), %arg8: f32 loc(unknown)): + %4:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (f32, f32, f32, f32, f32, f32) -> (f32, f32, f32) loc(#loc90) + tt.reduce.return %4#0, %4#1, %4#2 : f32, f32, f32 loc(#loc90) + }) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc90) + tt.return %0#0, %0#1, %0#2 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc92) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc93) + %2 = ub.poison : tensor<1xf32> loc(#loc93) + %3 = ub.poison : tensor<1xf32> loc(#loc93) + tt.return %1, %2, %3 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc93) + } loc(#loc89) + tt.func private @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%mean_1: f32 loc("mean_1"(#loc94)), %m2_1: f32 loc("m2_1"(#loc94)), %weight_1: f32 loc("weight_1"(#loc94)), %mean_2: f32 loc("mean_2"(#loc94)), %m2_2: f32 loc("m2_2"(#loc94)), %weight_2: f32 loc("weight_2"(#loc94))) -> (f32, f32, f32) attributes {noinline = false} { + %delta = arith.subf %mean_2, %mean_1 : f32 loc(#loc195) + %new_weight = arith.addf %weight_1, %weight_2 : f32 loc(#loc196) + %w2_over_w = arith.constant 0.000000e+00 : f32 loc(#loc197) + %w2_over_w_0 = arith.cmpf oeq, %new_weight, %w2_over_w : f32 loc(#loc197) + %w2_over_w_1 = arith.divf %weight_2, %new_weight : f32 loc(#loc198) + %w2_over_w_2 = arith.constant 0.000000e+00 : f32 loc(#loc199) + %w2_over_w_3 = arith.constant 0.000000e+00 : f32 loc(#loc199) + %w2_over_w_4 = arith.select %w2_over_w_0, %w2_over_w_3, %w2_over_w_1 : f32 loc(#loc199) + %0 = arith.mulf %delta, %w2_over_w_4 : f32 loc(#loc100) + %1 = arith.addf %mean_1, %0 : f32 loc(#loc101) + %2 = arith.addf %m2_1, %m2_2 : f32 loc(#loc102) + %3 = arith.mulf %delta, %delta : f32 loc(#loc103) + %4 = arith.mulf %3, %weight_1 : f32 loc(#loc104) + %5 = arith.mulf %4, %w2_over_w_4 : f32 loc(#loc105) + %6 = arith.addf %2, %5 : f32 loc(#loc106) + tt.return %1, %6, %new_weight : f32, f32, f32 loc(#loc107) + ^bb1: // no predecessors + %7 = ub.poison : f32 loc(#loc108) + %8 = ub.poison : f32 loc(#loc108) + %9 = ub.poison : f32 loc(#loc108) + tt.return %7, %8, %9 : f32, f32, f32 loc(#loc108) + } loc(#loc94) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":25:21) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":29:45) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":30:43) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":31:47) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:46) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:34) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:61) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:51) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:112) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":42:62) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":42:51) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":44:39) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":44:62) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":45:37) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":45:58) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":46:41) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":46:66) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":46:8) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":47:79) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":48:16) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":49:16) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":50:16) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":51:43) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":52:31) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":53:29) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:34) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:41) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:94) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:47) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:42) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:35) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:62) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:52) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:114) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:35) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:42) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:95) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":60:16) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":61:23) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":63:24) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":64:16) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":65:24) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":66:16) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":67:24) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":68:32) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":69:24) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":71:24) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":72:24) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:41) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:36) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:29) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:63) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:53) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":51:4) +#loc68 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":120:0) +#loc69 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:31) +#loc70 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:11) +#loc71 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:4) +#loc73 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7) +#loc74 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":218:46) +#loc75 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31) +#loc76 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24) +#loc77 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30) +#loc78 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34) +#loc79 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26) +#loc80 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39) +#loc81 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31) +#loc82 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22) +#loc83 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:11) +#loc84 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:4) +#loc86 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:30) +#loc87 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:11) +#loc88 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:4) +#loc90 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc92 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:11) +#loc93 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:4) +#loc95 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc96 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc97 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc98 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc99 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc100 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc101 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc102 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc103 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc104 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc105 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc106 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc107 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:11) +#loc108 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:4) +#loc115 = loc("xnumel"(#loc1)) +#loc116 = loc("r0_numel"(#loc2)) +#loc117 = loc("xoffset"(#loc3)) +#loc118 = loc("xoffset"(#loc4)) +#loc119 = loc("xindex"(#loc5)) +#loc120 = loc("xindex"(#loc6)) +#loc121 = loc("xindex"(#loc7)) +#loc122 = loc("xmask"(#loc8)) +#loc123 = loc("r0_base"(#loc9)) +#loc124 = loc("r0_base"(#loc10)) +#loc125 = loc("tmp3_mean"(#loc11)) +#loc126 = loc("tmp3_m2"(#loc12)) +#loc127 = loc("tmp3_weight"(#loc13)) +#loc128 = loc("tmp3_mean"(#loc14)) +#loc129 = loc("r0_index"(#loc15)) +#loc130 = loc("r0_mask"(#loc16)) +#loc131 = loc("tmp0"(#loc17)) +#loc132 = loc("tmp0"(#loc18)) +#loc133 = loc("tmp0"(#loc19)) +#loc134 = loc("tmp0"(#loc20)) +#loc135 = loc("tmp0"(#loc21)) +#loc136 = loc("tmp0"(#loc22)) +#loc137 = loc("tmp3_mean"(#loc25)) +#loc138 = loc("tmp3_mean"(#loc26)) +#loc139 = loc("tmp3_m2"(#loc27)) +#loc140 = loc("tmp3_m2"(#loc28)) +#loc141 = loc("tmp3_weight"(#loc29)) +#loc142 = loc("tmp3_weight"(#loc30)) +#loc143 = loc("tmp3"(#loc33)) +#loc144 = loc("tmp7"(#loc34)) +#loc145 = loc("tmp8"(#loc35)) +#loc146 = loc("r0_index"(#loc37)) +#loc147 = loc("r0_mask"(#loc38)) +#loc148 = loc("tmp9"(#loc39)) +#loc149 = loc("tmp9"(#loc40)) +#loc150 = loc("tmp9"(#loc41)) +#loc151 = loc("tmp12"(#loc42)) +#loc152 = loc("tmp12"(#loc43)) +#loc153 = loc("tmp12"(#loc44)) +#loc154 = loc("tmp12"(#loc45)) +#loc155 = loc("tmp12"(#loc46)) +#loc156 = loc("tmp12"(#loc47)) +#loc157 = loc("tmp23"(#loc48)) +#loc158 = loc("tmp23"(#loc49)) +#loc159 = loc("tmp23"(#loc50)) +#loc160 = loc("tmp10"(#loc51)) +#loc161 = loc("tmp11"(#loc52)) +#loc162 = loc("tmp14"(#loc53)) +#loc163 = loc("tmp15"(#loc54)) +#loc164 = loc("tmp16"(#loc55)) +#loc165 = loc("tmp17"(#loc56)) +#loc166 = loc("tmp18"(#loc57)) +#loc167 = loc("tmp19"(#loc58)) +#loc168 = loc("tmp20"(#loc59)) +#loc169 = loc("tmp22"(#loc60)) +#loc170 = loc("tmp24"(#loc61)) +#loc176 = loc("new_weight"(#loc74)) +#loc177 = loc("new_m2"(#loc75)) +#loc178 = loc("delta"(#loc76)) +#loc179 = loc("new_weight"(#loc77)) +#loc180 = loc("new_mean"(#loc78)) +#loc181 = loc("new_mean"(#loc79)) +#loc182 = loc("new_m2"(#loc80)) +#loc183 = loc("new_m2"(#loc81)) +#loc184 = loc("new_m2"(#loc82)) +#loc195 = loc("delta"(#loc95)) +#loc196 = loc("new_weight"(#loc96)) +#loc197 = loc("w2_over_w"(#loc97)) +#loc198 = loc("w2_over_w"(#loc98)) +#loc199 = loc("w2_over_w"(#loc99)) +#loc200 = loc("tmp3_m2"(#loc128)) +#loc202 = loc("new_weight"(#loc176)) +#loc203 = loc("new_m2"(#loc177)) +#loc204 = loc("new_weight"(#loc179)) +#loc205 = loc("new_mean"(#loc181)) +#loc206 = loc("new_m2"(#loc184)) +#loc207 = loc("tmp3_weight"(#loc200)) diff --git a/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.ttgir b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..08b835b3f97fa08461779a0ccc9ff5ce21bc5b61 --- /dev/null +++ b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.ttgir @@ -0,0 +1,179 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":18:0) +#loc1 = loc(unknown) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":47:79) +#loc49 = loc("in_ptr0"(#loc)) +#loc50 = loc("in_ptr1"(#loc)) +#loc51 = loc("in_ptr2"(#loc)) +#loc52 = loc("out_ptr2"(#loc)) +#loc53 = loc("xnumel"(#loc)) +#loc54 = loc("r0_numel"(#loc)) +#loc68 = loc(callsite(#loc1 at #loc15)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4096> : tensor<1x4096xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xbf16, #blocked> loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c2304_i32 = arith.constant 2304 : i32 loc(#loc1) + %cst_1 = arith.constant 0.000000e+00 : f32 loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<4.096000e+03> : tensor<1x1xf32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc55) + %xmask = arith.cmpi slt, %xoffset, %c2304_i32 : i32 loc(#loc56) + %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc57) + %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4096xi32, #blocked> loc(#loc57) + %r0_mask = arith.cmpi slt, %r0_base_6, %cst : tensor<1x4096xi32, #blocked> loc(#loc58) + %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc59) + %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x4096xi32, #blocked> loc(#loc92) + %tmp0_8 = arith.addi %r0_base_6, %tmp0_7 : tensor<1x4096xi32, #blocked> loc(#loc60) + %tmp0_9 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc61) + %tmp0_10 = tt.addptr %tmp0_9, %tmp0_8 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc61) + %tmp0_11 = tt.splat %xmask : i1 -> tensor<1x4096xi1, #blocked> loc(#loc93) + %tmp0_12 = arith.andi %r0_mask, %tmp0_11 : tensor<1x4096xi1, #blocked> loc(#loc62) + %tmp0_13 = tt.load %tmp0_10, %tmp0_12, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr, #blocked> loc(#loc63) + %tmp0_14 = arith.extf %tmp0_13 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc64) + %tmp3_mean = arith.select %tmp0_12, %tmp0_14, %cst_2 : tensor<1x4096xi1, #blocked>, tensor<1x4096xf32, #blocked> loc(#loc65) + %tmp3_weight = arith.select %tmp0_12, %cst_5, %cst_2 : tensor<1x4096xi1, #blocked>, tensor<1x4096xf32, #blocked> loc(#loc66) + %0:3 = "tt.reduce"(%tmp3_mean, %cst_2, %tmp3_weight) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc15)), %arg7: f32 loc(callsite(#loc1 at #loc15)), %arg8: f32 loc(callsite(#loc1 at #loc15)), %arg9: f32 loc(callsite(#loc1 at #loc15)), %arg10: f32 loc(callsite(#loc1 at #loc15)), %arg11: f32 loc(callsite(#loc1 at #loc15))): + %delta = arith.subf %arg9, %arg6 : f32 loc(#loc94) + %new_weight = arith.addf %arg8, %arg11 : f32 loc(#loc95) + %w2_over_w = arith.cmpf oeq, %new_weight, %cst_1 : f32 loc(#loc96) + %w2_over_w_24 = arith.divf %arg11, %new_weight : f32 loc(#loc97) + %w2_over_w_25 = arith.select %w2_over_w, %cst_1, %w2_over_w_24 : f32 loc(#loc98) + %4 = arith.mulf %delta, %w2_over_w_25 : f32 loc(#loc99) + %5 = arith.addf %arg6, %4 : f32 loc(#loc100) + %6 = arith.addf %arg7, %arg10 : f32 loc(#loc101) + %7 = arith.mulf %delta, %delta : f32 loc(#loc102) + %8 = arith.mulf %7, %arg8 : f32 loc(#loc103) + %9 = arith.mulf %8, %w2_over_w_25 : f32 loc(#loc104) + %10 = arith.addf %6, %9 : f32 loc(#loc105) + tt.reduce.return %5, %10, %new_weight : f32, f32, f32 loc(#loc67) + }) : (tensor<1x4096xf32, #blocked>, tensor<1x4096xf32, #blocked>, tensor<1x4096xf32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc67) + %tmp3 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc74) + %tmp7 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc75) + %tmp9 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc76) + %tmp9_15 = tt.addptr %tmp9, %r0_base_6 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc76) + %tmp9_16 = tt.load %tmp9_15, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr, #blocked> loc(#loc77) + %tmp9_17 = arith.extf %tmp9_16 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc78) + %tmp12 = tt.load %tmp0_10, %tmp0_12, %cst_0 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr, #blocked> loc(#loc79) + %tmp12_18 = arith.extf %tmp12 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc80) + %tmp23 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc81) + %tmp23_19 = tt.addptr %tmp23, %r0_base_6 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc81) + %tmp23_20 = tt.load %tmp23_19, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr, #blocked> loc(#loc82) + %tmp23_21 = arith.extf %tmp23_20 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc83) + %tmp11 = arith.addf %tmp9_17, %cst_5 : tensor<1x4096xf32, #blocked> loc(#loc84) + %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32, #blocked> -> tensor<1x4096xf32, #blocked> loc(#loc85) + %tmp14_22 = arith.subf %tmp12_18, %tmp14 : tensor<1x4096xf32, #blocked> loc(#loc85) + %tmp16 = arith.divf %tmp7, %cst_4 : tensor<1x1xf32, #blocked> loc(#loc86) + %tmp18 = arith.addf %tmp16, %cst_3 : tensor<1x1xf32, #blocked> loc(#loc87) + %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked>) -> tensor<1x1xf32, #blocked> loc(#loc88) + %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32, #blocked> -> tensor<1x4096xf32, #blocked> loc(#loc89) + %tmp20_23 = arith.mulf %tmp14_22, %tmp20 : tensor<1x4096xf32, #blocked> loc(#loc89) + %tmp22 = arith.mulf %tmp11, %tmp20_23 : tensor<1x4096xf32, #blocked> loc(#loc90) + %tmp24 = arith.addf %tmp22, %tmp23_21 : tensor<1x4096xf32, #blocked> loc(#loc91) + %1 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x4096x!tt.ptr, #blocked> loc(#loc46) + %2 = tt.addptr %1, %tmp0_8 : tensor<1x4096x!tt.ptr, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc46) + %3 = arith.truncf %tmp24 : tensor<1x4096xf32, #blocked> to tensor<1x4096xbf16, #blocked> loc(#loc47) + tt.store %2, %3, %tmp0_12 : tensor<1x4096x!tt.ptr, #blocked> loc(#loc47) + tt.return loc(#loc48) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":25:21) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":26:37) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":34:29) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:46) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:41) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:34) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:61) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:51) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:112) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":44:62) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":46:66) +#loc14 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc16 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc17 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc18 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc19 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc21 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc23 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":48:16) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":49:16) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:34) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:41) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:94) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:52) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:114) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:35) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:42) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:95) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":61:23) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":63:24) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":65:24) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":67:24) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":68:32) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":69:24) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":71:24) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":72:24) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:29) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:53) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":51:4) +#loc55 = loc("xoffset"(#loc2)) +#loc56 = loc("xmask"(#loc3)) +#loc57 = loc("r0_base"(#loc4)) +#loc58 = loc("r0_mask"(#loc5)) +#loc59 = loc("tmp0"(#loc6)) +#loc60 = loc("tmp0"(#loc7)) +#loc61 = loc("tmp0"(#loc8)) +#loc62 = loc("tmp0"(#loc9)) +#loc63 = loc("tmp0"(#loc10)) +#loc64 = loc("tmp0"(#loc11)) +#loc65 = loc("tmp3_mean"(#loc12)) +#loc66 = loc("tmp3_weight"(#loc13)) +#loc67 = loc(callsite(#loc14 at #loc15)) +#loc69 = loc("delta"(#loc16)) +#loc70 = loc("new_weight"(#loc17)) +#loc71 = loc("w2_over_w"(#loc18)) +#loc72 = loc("w2_over_w"(#loc19)) +#loc73 = loc("w2_over_w"(#loc20)) +#loc74 = loc("tmp3"(#loc28)) +#loc75 = loc("tmp7"(#loc29)) +#loc76 = loc("tmp9"(#loc30)) +#loc77 = loc("tmp9"(#loc31)) +#loc78 = loc("tmp9"(#loc32)) +#loc79 = loc("tmp12"(#loc33)) +#loc80 = loc("tmp12"(#loc34)) +#loc81 = loc("tmp23"(#loc35)) +#loc82 = loc("tmp23"(#loc36)) +#loc83 = loc("tmp23"(#loc37)) +#loc84 = loc("tmp11"(#loc38)) +#loc85 = loc("tmp14"(#loc39)) +#loc86 = loc("tmp16"(#loc40)) +#loc87 = loc("tmp18"(#loc41)) +#loc88 = loc("tmp19"(#loc42)) +#loc89 = loc("tmp20"(#loc43)) +#loc90 = loc("tmp22"(#loc44)) +#loc91 = loc("tmp24"(#loc45)) +#loc92 = loc(fused[#loc60, #loc59]) +#loc93 = loc(fused[#loc62, #loc56]) +#loc94 = loc(callsite(#loc69 at #loc67)) +#loc95 = loc(callsite(#loc70 at #loc67)) +#loc96 = loc(callsite(#loc71 at #loc67)) +#loc97 = loc(callsite(#loc72 at #loc67)) +#loc98 = loc(callsite(#loc73 at #loc67)) +#loc99 = loc(callsite(#loc21 at #loc67)) +#loc100 = loc(callsite(#loc22 at #loc67)) +#loc101 = loc(callsite(#loc23 at #loc67)) +#loc102 = loc(callsite(#loc24 at #loc67)) +#loc103 = loc(callsite(#loc25 at #loc67)) +#loc104 = loc(callsite(#loc26 at #loc67)) +#loc105 = loc(callsite(#loc27 at #loc67)) diff --git a/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.ttir b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..32b961da3a7b69dd7829b992fb0b6950a2240263 --- /dev/null +++ b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.ttir @@ -0,0 +1,180 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":18:0) +#loc1 = loc(unknown) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":47:79) +#loc50 = loc("in_ptr0"(#loc)) +#loc51 = loc("in_ptr1"(#loc)) +#loc52 = loc("in_ptr2"(#loc)) +#loc53 = loc("out_ptr2"(#loc)) +#loc54 = loc("xnumel"(#loc)) +#loc55 = loc("r0_numel"(#loc)) +#loc57 = loc(callsite(#loc1 at #loc3)) +module { + tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %xmask = arith.constant 2304 : i32 loc(#loc56) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc57) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x4096xbf16> loc(#loc1) + %cst_2 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc1) + %cst_3 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc1) + %cst_4 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc1) + %cst_5 = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc58) + %xmask_6 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc56) + %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32> loc(#loc59) + %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32> -> tensor<1x4096xi32> loc(#loc60) + %r0_mask = arith.cmpi slt, %r0_base_7, %cst_5 : tensor<1x4096xi32> loc(#loc61) + %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc62) + %tmp0_8 = tt.splat %tmp0 : i32 -> tensor<1x4096xi32> loc(#loc94) + %tmp0_9 = arith.addi %r0_base_7, %tmp0_8 : tensor<1x4096xi32> loc(#loc63) + %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc64) + %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc64) + %tmp0_12 = tt.splat %xmask_6 : i1 -> tensor<1x4096xi1> loc(#loc95) + %tmp0_13 = arith.andi %r0_mask, %tmp0_12 : tensor<1x4096xi1> loc(#loc65) + %tmp0_14 = tt.load %tmp0_11, %tmp0_13, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc66) + %tmp0_15 = arith.extf %tmp0_14 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc67) + %tmp3_mean = arith.select %tmp0_13, %tmp0_15, %cst_0 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc68) + %tmp3_weight = arith.select %tmp0_13, %cst_4, %cst_0 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc69) + %0:3 = "tt.reduce"(%tmp3_mean, %cst_0, %tmp3_weight) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc3)), %arg7: f32 loc(callsite(#loc1 at #loc3)), %arg8: f32 loc(callsite(#loc1 at #loc3)), %arg9: f32 loc(callsite(#loc1 at #loc3)), %arg10: f32 loc(callsite(#loc1 at #loc3)), %arg11: f32 loc(callsite(#loc1 at #loc3))): + %delta = arith.subf %arg9, %arg6 : f32 loc(#loc96) + %new_weight = arith.addf %arg8, %arg11 : f32 loc(#loc97) + %w2_over_w = arith.cmpf oeq, %new_weight, %cst : f32 loc(#loc98) + %w2_over_w_25 = arith.divf %arg11, %new_weight : f32 loc(#loc99) + %w2_over_w_26 = arith.select %w2_over_w, %cst, %w2_over_w_25 : f32 loc(#loc100) + %4 = arith.mulf %delta, %w2_over_w_26 : f32 loc(#loc101) + %5 = arith.addf %arg6, %4 : f32 loc(#loc102) + %6 = arith.addf %arg7, %arg10 : f32 loc(#loc103) + %7 = arith.mulf %delta, %delta : f32 loc(#loc104) + %8 = arith.mulf %7, %arg8 : f32 loc(#loc105) + %9 = arith.mulf %8, %w2_over_w_26 : f32 loc(#loc106) + %10 = arith.addf %6, %9 : f32 loc(#loc107) + tt.reduce.return %5, %10, %new_weight : f32, f32, f32 loc(#loc70) + }) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc70) + %tmp3 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc76) + %tmp7 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc77) + %tmp9 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc78) + %tmp9_16 = tt.addptr %tmp9, %r0_base_7 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc78) + %tmp9_17 = tt.load %tmp9_16, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc79) + %tmp9_18 = arith.extf %tmp9_17 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc80) + %tmp12 = tt.load %tmp0_11, %tmp0_13, %cst_1 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr> loc(#loc81) + %tmp12_19 = arith.extf %tmp12 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc82) + %tmp23 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc83) + %tmp23_20 = tt.addptr %tmp23, %r0_base_7 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc83) + %tmp23_21 = tt.load %tmp23_20, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr> loc(#loc84) + %tmp23_22 = arith.extf %tmp23_21 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc85) + %tmp11 = arith.addf %tmp9_18, %cst_4 : tensor<1x4096xf32> loc(#loc86) + %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc87) + %tmp14_23 = arith.subf %tmp12_19, %tmp14 : tensor<1x4096xf32> loc(#loc87) + %tmp16 = arith.divf %tmp7, %cst_3 : tensor<1x1xf32> loc(#loc88) + %tmp18 = arith.addf %tmp16, %cst_2 : tensor<1x1xf32> loc(#loc89) + %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc90) + %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc91) + %tmp20_24 = arith.mulf %tmp14_23, %tmp20 : tensor<1x4096xf32> loc(#loc91) + %tmp22 = arith.mulf %tmp11, %tmp20_24 : tensor<1x4096xf32> loc(#loc92) + %tmp24 = arith.addf %tmp22, %tmp23_22 : tensor<1x4096xf32> loc(#loc93) + %1 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x4096x!tt.ptr> loc(#loc47) + %2 = tt.addptr %1, %tmp0_9 : tensor<1x4096x!tt.ptr>, tensor<1x4096xi32> loc(#loc47) + %3 = arith.truncf %tmp24 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc48) + tt.store %2, %3, %tmp0_13 : tensor<1x4096x!tt.ptr> loc(#loc48) + tt.return loc(#loc49) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":25:21) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":23:28) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":26:27) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":34:29) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:41) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:34) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:61) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:51) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:112) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":44:62) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":46:66) +#loc16 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46) +#loc17 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21) +#loc18 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28) +#loc19 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60) +#loc21 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25) +#loc23 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15) +#loc25 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30) +#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38) +#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49) +#loc28 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":48:16) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":49:16) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:34) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:41) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:94) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:52) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:114) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:35) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:42) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:95) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":61:23) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":63:24) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":65:24) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":67:24) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":68:32) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":69:24) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":71:24) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":72:24) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:29) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:53) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":51:4) +#loc56 = loc("xmask"(#loc2)) +#loc58 = loc("xoffset"(#loc4)) +#loc59 = loc("r0_base"(#loc5)) +#loc60 = loc("r0_base"(#loc6)) +#loc61 = loc("r0_mask"(#loc7)) +#loc62 = loc("tmp0"(#loc8)) +#loc63 = loc("tmp0"(#loc9)) +#loc64 = loc("tmp0"(#loc10)) +#loc65 = loc("tmp0"(#loc11)) +#loc66 = loc("tmp0"(#loc12)) +#loc67 = loc("tmp0"(#loc13)) +#loc68 = loc("tmp3_mean"(#loc14)) +#loc69 = loc("tmp3_weight"(#loc15)) +#loc70 = loc(callsite(#loc16 at #loc3)) +#loc71 = loc("delta"(#loc17)) +#loc72 = loc("new_weight"(#loc18)) +#loc73 = loc("w2_over_w"(#loc19)) +#loc74 = loc("w2_over_w"(#loc20)) +#loc75 = loc("w2_over_w"(#loc21)) +#loc76 = loc("tmp3"(#loc29)) +#loc77 = loc("tmp7"(#loc30)) +#loc78 = loc("tmp9"(#loc31)) +#loc79 = loc("tmp9"(#loc32)) +#loc80 = loc("tmp9"(#loc33)) +#loc81 = loc("tmp12"(#loc34)) +#loc82 = loc("tmp12"(#loc35)) +#loc83 = loc("tmp23"(#loc36)) +#loc84 = loc("tmp23"(#loc37)) +#loc85 = loc("tmp23"(#loc38)) +#loc86 = loc("tmp11"(#loc39)) +#loc87 = loc("tmp14"(#loc40)) +#loc88 = loc("tmp16"(#loc41)) +#loc89 = loc("tmp18"(#loc42)) +#loc90 = loc("tmp19"(#loc43)) +#loc91 = loc("tmp20"(#loc44)) +#loc92 = loc("tmp22"(#loc45)) +#loc93 = loc("tmp24"(#loc46)) +#loc94 = loc(fused[#loc63, #loc62]) +#loc95 = loc(fused[#loc65, #loc56]) +#loc96 = loc(callsite(#loc71 at #loc70)) +#loc97 = loc(callsite(#loc72 at #loc70)) +#loc98 = loc(callsite(#loc73 at #loc70)) +#loc99 = loc(callsite(#loc74 at #loc70)) +#loc100 = loc(callsite(#loc75 at #loc70)) +#loc101 = loc(callsite(#loc22 at #loc70)) +#loc102 = loc(callsite(#loc23 at #loc70)) +#loc103 = loc(callsite(#loc24 at #loc70)) +#loc104 = loc(callsite(#loc25 at #loc70)) +#loc105 = loc(callsite(#loc26 at #loc70)) +#loc106 = loc(callsite(#loc27 at #loc70)) +#loc107 = loc(callsite(#loc28 at #loc70)) diff --git a/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/__grp__triton_poi_fused_clone_0.json b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/__grp__triton_poi_fused_clone_0.json new file mode 100644 index 0000000000000000000000000000000000000000..00f5f4ae3b359a6f1e1a5caeeb65863755d07e03 --- /dev/null +++ b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/__grp__triton_poi_fused_clone_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_clone_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.source", "triton_poi_fused_clone_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.ttir", "triton_poi_fused_clone_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.ttgir", "triton_poi_fused_clone_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.llir", "triton_poi_fused_clone_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.ptx", "triton_poi_fused_clone_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.cubin", "triton_poi_fused_clone_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.json"}} \ No newline at end of file diff --git a/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.cubin b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..87b37bbcc3394d686d041ad40b534f5d75aeb67b Binary files /dev/null and b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.cubin differ diff --git a/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.json b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3acd55742ff19652520a7886010c6d718f73b442 --- /dev/null +++ b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.json @@ -0,0 +1 @@ +{"hash": "a4acc9fcc6d6e9284a78e05a9a5734fb2f107375a86d28af0550fe3534e9f721", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_clone_0"} \ No newline at end of file diff --git a/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.llir b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..9037fc8452a615107e5eabb80b658b1305d18512 --- /dev/null +++ b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.llir @@ -0,0 +1,53 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_clone_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %7 = shl i32 %6, 10, !dbg !8 + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %9 = shl nuw nsw i32 %8, 3, !dbg !9 + %10 = and i32 %9, 1016, !dbg !9 + %11 = or disjoint i32 %10, %7, !dbg !10 + %12 = sext i32 %11 to i64, !dbg !11 + %13 = getelementptr bfloat, ptr addrspace(1) %0, i64 %12, !dbg !11 + %14 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %13) #2, !dbg !12 + %15 = extractvalue { i32, i32, i32, i32 } %14, 0, !dbg !12 + %16 = extractvalue { i32, i32, i32, i32 } %14, 1, !dbg !12 + %17 = extractvalue { i32, i32, i32, i32 } %14, 2, !dbg !12 + %18 = extractvalue { i32, i32, i32, i32 } %14, 3, !dbg !12 + %19 = getelementptr bfloat, ptr addrspace(1) %1, i64 %12, !dbg !13 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %15, i32 %16, i32 %17, i32 %18, ptr addrspace(1) %19) #2, !dbg !14 + ret void, !dbg !15 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_clone_0", linkageName: "triton_poi_fused_clone_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 30, scope: !4) +!12 = !DILocation(line: 24, column: 35, scope: !4) +!13 = !DILocation(line: 25, column: 25, scope: !4) +!14 = !DILocation(line: 25, column: 36, scope: !4) +!15 = !DILocation(line: 25, column: 4, scope: !4) diff --git a/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.ptx b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..93822f4cf096bcd6c804817fa461da610da5f574 --- /dev/null +++ b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.ptx @@ -0,0 +1,305 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused_clone_0 // -- Begin function triton_poi_fused_clone_0 + // @triton_poi_fused_clone_0 +.visible .entry triton_poi_fused_clone_0( + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_0_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_0_param_1, + .param .u32 triton_poi_fused_clone_0_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_0_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_0_param_4 +) +.reqntid 128 +{ + .reg .b32 %r<11>; + .reg .b64 %rd<6>; + .loc 1 18 0 // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:18:0 + +// %bb.0: + ld.param.b64 %rd3, [triton_poi_fused_clone_0_param_0]; + ld.param.b64 %rd4, [triton_poi_fused_clone_0_param_1]; +$L__tmp0: + .loc 1 20 28 // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:20:28 + mov.u32 %r5, %ctaid.x; + .loc 1 20 33 // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:20:33 + shl.b32 %r6, %r5, 10; + .loc 1 21 36 // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:21:36 + mov.u32 %r7, %tid.x; + shl.b32 %r8, %r7, 3; + and.b32 %r9, %r8, 1016; + .loc 1 21 23 // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:21:23 + or.b32 %r10, %r9, %r6; + .loc 1 24 30 // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:24:30 + mul.wide.s32 %rd5, %r10, 2; + add.s64 %rd1, %rd3, %rd5; + .loc 1 24 35 // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:24:35 + // begin inline asm + mov.u32 %r1, 0x0; + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + ld.global.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ]; + // end inline asm + .loc 1 25 25 // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:25:25 + add.s64 %rd2, %rd4, %rd5; + .loc 1 25 36 // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:25:36 + // begin inline asm + st.global.v4.b32 [ %rd2 + 0 ], { %r1, %r2, %r3, %r4 }; + // end inline asm + .loc 1 25 4 // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:25:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 99 +.b8 122 +.b8 103 +.b8 55 +.b8 116 +.b8 112 +.b8 105 +.b8 116 +.b8 117 +.b8 112 +.b8 114 +.b8 119 +.b8 103 +.b8 113 +.b8 112 +.b8 117 +.b8 97 +.b8 106 +.b8 122 +.b8 121 +.b8 50 +.b8 110 +.b8 121 +.b8 108 +.b8 102 +.b8 107 +.b8 52 +.b8 51 +.b8 109 +.b8 100 +.b8 111 +.b8 122 +.b8 100 +.b8 53 +.b8 118 +.b8 119 +.b8 111 +.b8 55 +.b8 55 +.b8 109 +.b8 117 +.b8 113 +.b8 51 +.b8 107 +.b8 111 +.b8 115 +.b8 112 +.b8 110 +.b8 102 +.b8 55 +.b8 98 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 99 +.b8 122 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.source b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.source new file mode 100644 index 0000000000000000000000000000000000000000..ad631583408510c4e259542b648d1722880a2a21 --- /dev/null +++ b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.source @@ -0,0 +1,48 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":18:0) +#loc13 = loc("in_ptr0"(#loc)) +#loc14 = loc("out_ptr0"(#loc)) +#loc15 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_clone_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 8388608 : i32 loc(#loc16) + %xoffset = tt.get_program_id x : i32 loc(#loc17) + %xoffset_1 = arith.constant 1024 : i32 loc(#loc18) + %xoffset_2 = arith.constant 1024 : i32 loc(#loc18) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc18) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc19) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc20) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc20) + %xmask = arith.constant true loc(#loc21) + %xmask_6 = arith.constant dense : tensor<1024xi1> loc(#loc21) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc22) + %tmp0_7 = tt.addptr %tmp0, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc22) + %tmp0_8 = tt.load %tmp0_7 : tensor<1024x!tt.ptr> loc(#loc23) + %tmp0_9 = arith.extf %tmp0_8 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc24) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc10) + %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc10) + %2 = arith.truncf %tmp0_9 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc11) + tt.store %1, %2 : tensor<1024x!tt.ptr> loc(#loc11) + tt.return loc(#loc12) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":22:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:30) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:35) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:44) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:25) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:36) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:4) +#loc16 = loc("xnumel"(#loc1)) +#loc17 = loc("xoffset"(#loc2)) +#loc18 = loc("xoffset"(#loc3)) +#loc19 = loc("xindex"(#loc4)) +#loc20 = loc("xindex"(#loc5)) +#loc21 = loc("xmask"(#loc6)) +#loc22 = loc("tmp0"(#loc7)) +#loc23 = loc("tmp0"(#loc8)) +#loc24 = loc("tmp0"(#loc9)) diff --git a/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.ttgir b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..9b49825992999fe0c46d246301ba8e08cfc28f9a --- /dev/null +++ b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.ttgir @@ -0,0 +1,38 @@ +#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":18:0) +#loc11 = loc("in_ptr0"(#loc)) +#loc12 = loc("out_ptr0"(#loc)) +#loc13 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_clone_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc14) + %xoffset_0 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc15) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc16) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<1024xi32, #blocked> loc(#loc17) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<1024xi32, #blocked> loc(#loc17) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc18) + %tmp0_3 = tt.addptr %tmp0, %xindex_2 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc18) + %tmp0_4 = tt.load %tmp0_3 : tensor<1024x!tt.ptr, #blocked> loc(#loc19) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc8) + %1 = tt.addptr %0, %xindex_2 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc8) + tt.store %1, %tmp0_4 : tensor<1024x!tt.ptr, #blocked> loc(#loc9) + tt.return loc(#loc10) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:30) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:35) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:25) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:36) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:4) +#loc14 = loc("xoffset"(#loc2)) +#loc15 = loc("xoffset"(#loc3)) +#loc16 = loc("xindex"(#loc4)) +#loc17 = loc("xindex"(#loc5)) +#loc18 = loc("tmp0"(#loc6)) +#loc19 = loc("tmp0"(#loc7)) diff --git a/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.ttir b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..813c40eec78ac1de9d3c21ec4f88f82d75fbcb60 --- /dev/null +++ b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.ttir @@ -0,0 +1,37 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":18:0) +#loc11 = loc("in_ptr0"(#loc)) +#loc12 = loc("out_ptr0"(#loc)) +#loc13 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_clone_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc14) + %xoffset_0 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc15) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc16) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<1024xi32> loc(#loc17) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<1024xi32> loc(#loc17) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc18) + %tmp0_3 = tt.addptr %tmp0, %xindex_2 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc18) + %tmp0_4 = tt.load %tmp0_3 : tensor<1024x!tt.ptr> loc(#loc19) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc8) + %1 = tt.addptr %0, %xindex_2 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc8) + tt.store %1, %tmp0_4 : tensor<1024x!tt.ptr> loc(#loc9) + tt.return loc(#loc10) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:30) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:35) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:25) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:36) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:4) +#loc14 = loc("xoffset"(#loc2)) +#loc15 = loc("xoffset"(#loc3)) +#loc16 = loc("xindex"(#loc4)) +#loc17 = loc("xindex"(#loc5)) +#loc18 = loc("tmp0"(#loc6)) +#loc19 = loc("tmp0"(#loc7)) diff --git a/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/__grp__triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/__grp__triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json new file mode 100644 index 0000000000000000000000000000000000000000..969cd04c5f2eaee47bca305c6d2987d3598325a9 --- /dev/null +++ b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/__grp__triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.source", "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttir", "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttgir", "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.llir", "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ptx", "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.cubin", "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json"}} \ No newline at end of file diff --git a/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.cubin b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.cubin new file mode 100644 index 0000000000000000000000000000000000000000..66513dc3b9f64fab9f197ca2a7a014d12c89ef89 Binary files /dev/null and b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.cubin differ diff --git a/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a26038c12f363fbfc49a4a3aebd2d2203fbab244 --- /dev/null +++ b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json @@ -0,0 +1 @@ +{"hash": "aea8899f15de857c634775fd5e3edb6dfa29e1e98838d4cd56a55fb157dcc247", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3"} \ No newline at end of file diff --git a/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.llir b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.llir new file mode 100644 index 0000000000000000000000000000000000000000..3ded8f8747fbb2bba62503c8536985d82d4ef70d --- /dev/null +++ b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.llir @@ -0,0 +1,201 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, ptr addrspace(1) readnone captures(none) %7, ptr addrspace(1) readnone captures(none) %8) local_unnamed_addr #0 !dbg !4 { + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %11 = shl i32 %10, 9, !dbg !8 + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %13 = shl nuw nsw i32 %12, 1, !dbg !9 + %14 = and i32 %13, 510, !dbg !9 + %15 = or disjoint i32 %14, %11, !dbg !10 + %16 = or disjoint i32 %15, 1, !dbg !10 + %17 = sdiv i32 %15, 128, !dbg !11 + %18 = mul i32 %17, 128, !dbg !12 + %.decomposed = sub i32 %15, %18, !dbg !12 + %19 = srem i32 %16, 128, !dbg !12 + %20 = sdiv i32 %15, 4096, !dbg !13 + %21 = sext i32 %15 to i64, !dbg !14 + %22 = getelementptr bfloat, ptr addrspace(1) %0, i64 %21, !dbg !14 + %23 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %22) #2, !dbg !15 + %24 = bitcast i32 %23 to <2 x bfloat>, !dbg !15 + %25 = shl nsw i32 %20, 7, !dbg !16 + %26 = add nsw i32 %25, %.decomposed, !dbg !17 + %27 = sext i32 %26 to i64, !dbg !18 + %28 = getelementptr float, ptr addrspace(1) %1, i64 %27, !dbg !18 + %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !19 + %30 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $2 + 0 ], $3;", "=r,=r,l,l"(ptr addrspace(1) %28, i64 %29) #2, !dbg !19 + %31 = extractvalue { i32, i32 } %30, 0, !dbg !19 + %32 = extractvalue { i32, i32 } %30, 1, !dbg !19 + %33 = getelementptr float, ptr addrspace(1) %2, i64 %27, !dbg !20 + %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !21 + %35 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $2 + 0 ], $3;", "=r,=r,l,l"(ptr addrspace(1) %33, i64 %34) #2, !dbg !21 + %36 = extractvalue { i32, i32 } %35, 0, !dbg !21 + %37 = extractvalue { i32, i32 } %35, 1, !dbg !21 + %38 = getelementptr bfloat, ptr addrspace(1) %3, i64 %21, !dbg !22 + %39 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %38) #2, !dbg !23 + %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !23 + %41 = srem i32 %16, 2, !dbg !24 + %42 = icmp slt i32 %41, 1, !dbg !25 + %.lhs.trunc = trunc nsw i32 %19 to i8, !dbg !26 + %43 = sdiv i8 %.lhs.trunc, 2, !dbg !26 + %.sext = sext i8 %43 to i32, !dbg !26 + %44 = shl nsw i32 %.sext, 1, !dbg !27 + %45 = or disjoint i32 %.decomposed, 1, !dbg !28 + %46 = shl nsw i32 %17, 7, !dbg !29 + %47 = add i32 %45, %46, !dbg !30 + %48 = or disjoint i32 %46, 1, !dbg !28 + %49 = add i32 %48, %44, !dbg !30 + %50 = sext i32 %47 to i64, !dbg !31 + %51 = getelementptr bfloat, ptr addrspace(1) %0, i64 %50, !dbg !31 + %52 = sext i32 %49 to i64, !dbg !31 + %53 = getelementptr bfloat, ptr addrspace(1) %0, i64 %52, !dbg !31 + %54 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !32 + %55 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %51, i64 %54, i1 true) #2, !dbg !32 + %56 = bitcast i16 %55 to bfloat, !dbg !32 + %57 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !32 + %58 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %53, i64 %57, i1 %42) #2, !dbg !32 + %59 = bitcast i16 %58 to bfloat, !dbg !32 + %60 = fpext bfloat %56 to float, !dbg !33 + %61 = fpext bfloat %59 to float, !dbg !33 + %62 = fsub float 0.000000e+00, %60, !dbg !34 + %63 = fsub float 0.000000e+00, %61, !dbg !34 + %64 = icmp sgt i32 %41, 0, !dbg !35 + %65 = add i32 %46, %.decomposed, !dbg !36 + %66 = add i32 %44, %46, !dbg !36 + %67 = sext i32 %65 to i64, !dbg !37 + %68 = getelementptr bfloat, ptr addrspace(1) %0, i64 %67, !dbg !37 + %69 = sext i32 %66 to i64, !dbg !37 + %70 = getelementptr bfloat, ptr addrspace(1) %0, i64 %69, !dbg !37 + %71 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !38 + %72 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %68, i64 %71, i1 false) #2, !dbg !38 + %73 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !38 + %74 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %70, i64 %73, i1 %64) #2, !dbg !38 + %75 = bitcast i16 %74 to bfloat, !dbg !38 + %76 = fpext bfloat %75 to float, !dbg !39 + %77 = select i1 %42, float %63, float %76, !dbg !40 + %78 = getelementptr bfloat, ptr addrspace(1) %3, i64 %50, !dbg !41 + %79 = getelementptr bfloat, ptr addrspace(1) %3, i64 %52, !dbg !41 + %80 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !42 + %81 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %78, i64 %80, i1 true) #2, !dbg !42 + %82 = bitcast i16 %81 to bfloat, !dbg !42 + %83 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !42 + %84 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %79, i64 %83, i1 %42) #2, !dbg !42 + %85 = bitcast i16 %84 to bfloat, !dbg !42 + %86 = fpext bfloat %82 to float, !dbg !43 + %87 = fpext bfloat %85 to float, !dbg !43 + %88 = fsub float 0.000000e+00, %86, !dbg !44 + %89 = fsub float 0.000000e+00, %87, !dbg !44 + %90 = getelementptr bfloat, ptr addrspace(1) %3, i64 %67, !dbg !45 + %91 = getelementptr bfloat, ptr addrspace(1) %3, i64 %69, !dbg !45 + %92 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !46 + %93 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %90, i64 %92, i1 false) #2, !dbg !46 + %94 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !46 + %95 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %91, i64 %94, i1 %64) #2, !dbg !46 + %96 = bitcast i16 %95 to bfloat, !dbg !46 + %97 = fpext bfloat %96 to float, !dbg !47 + %98 = select i1 %42, float %89, float %97, !dbg !40 + %99 = getelementptr bfloat, ptr addrspace(1) %4, i64 %21, !dbg !48 + %100 = fpext <2 x bfloat> %24 to <2 x float>, !dbg !49 + %101 = insertelement <2 x i32> poison, i32 %31, i64 0, !dbg !19 + %102 = insertelement <2 x i32> %101, i32 %32, i64 1, !dbg !19 + %103 = bitcast <2 x i32> %102 to <2 x float>, !dbg !19 + %104 = insertelement <2 x i32> poison, i32 %36, i64 0, !dbg !21 + %105 = insertelement <2 x i32> %104, i32 %37, i64 1, !dbg !21 + %106 = bitcast <2 x i32> %105 to <2 x float>, !dbg !21 + %107 = fmul <2 x float> %100, %103, !dbg !50 + %108 = insertelement <2 x float> poison, float %62, i64 0, !dbg !51 + %109 = insertelement <2 x float> %108, float %77, i64 1, !dbg !51 + %110 = fmul <2 x float> %109, %106, !dbg !51 + %111 = fadd <2 x float> %107, %110, !dbg !52 + %112 = fptrunc <2 x float> %111 to <2 x bfloat>, !dbg !53 + %113 = bitcast <2 x bfloat> %112 to i32, !dbg !53 + tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %113, ptr addrspace(1) %99) #2, !dbg !53 + %114 = getelementptr bfloat, ptr addrspace(1) %5, i64 %21, !dbg !54 + %115 = fpext <2 x bfloat> %40 to <2 x float>, !dbg !55 + %116 = fmul <2 x float> %103, %115, !dbg !56 + %117 = insertelement <2 x float> poison, float %88, i64 0, !dbg !57 + %118 = insertelement <2 x float> %117, float %98, i64 1, !dbg !57 + %119 = fmul <2 x float> %118, %106, !dbg !57 + %120 = fadd <2 x float> %116, %119, !dbg !58 + %121 = fptrunc <2 x float> %120 to <2 x bfloat>, !dbg !59 + %122 = bitcast <2 x bfloat> %121 to i32, !dbg !59 + tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %122, ptr addrspace(1) %114) #2, !dbg !59 + ret void, !dbg !60 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3", linkageName: "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 19, scope: !4) +!12 = !DILocation(line: 24, column: 19, scope: !4) +!13 = !DILocation(line: 25, column: 19, scope: !4) +!14 = !DILocation(line: 27, column: 30, scope: !4) +!15 = !DILocation(line: 27, column: 35, scope: !4) +!16 = !DILocation(line: 28, column: 39, scope: !4) +!17 = !DILocation(line: 28, column: 35, scope: !4) +!18 = !DILocation(line: 28, column: 30, scope: !4) +!19 = !DILocation(line: 28, column: 44, scope: !4) +!20 = !DILocation(line: 29, column: 31, scope: !4) +!21 = !DILocation(line: 29, column: 45, scope: !4) +!22 = !DILocation(line: 30, column: 31, scope: !4) +!23 = !DILocation(line: 30, column: 36, scope: !4) +!24 = !DILocation(line: 33, column: 17, scope: !4) +!25 = !DILocation(line: 37, column: 18, scope: !4) +!26 = !DILocation(line: 38, column: 43, scope: !4) +!27 = !DILocation(line: 38, column: 37, scope: !4) +!28 = !DILocation(line: 38, column: 34, scope: !4) +!29 = !DILocation(line: 38, column: 52, scope: !4) +!30 = !DILocation(line: 38, column: 48, scope: !4) +!31 = !DILocation(line: 38, column: 30, scope: !4) +!32 = !DILocation(line: 38, column: 57, scope: !4) +!33 = !DILocation(line: 38, column: 107, scope: !4) +!34 = !DILocation(line: 39, column: 13, scope: !4) +!35 = !DILocation(line: 42, column: 20, scope: !4) +!36 = !DILocation(line: 45, column: 45, scope: !4) +!37 = !DILocation(line: 45, column: 31, scope: !4) +!38 = !DILocation(line: 45, column: 54, scope: !4) +!39 = !DILocation(line: 45, column: 105, scope: !4) +!40 = !DILocation(line: 0, scope: !4) +!41 = !DILocation(line: 53, column: 31, scope: !4) +!42 = !DILocation(line: 53, column: 58, scope: !4) +!43 = !DILocation(line: 53, column: 108, scope: !4) +!44 = !DILocation(line: 54, column: 13, scope: !4) +!45 = !DILocation(line: 57, column: 31, scope: !4) +!46 = !DILocation(line: 57, column: 54, scope: !4) +!47 = !DILocation(line: 57, column: 105, scope: !4) +!48 = !DILocation(line: 63, column: 25, scope: !4) +!49 = !DILocation(line: 27, column: 44, scope: !4) +!50 = !DILocation(line: 32, column: 18, scope: !4) +!51 = !DILocation(line: 48, column: 20, scope: !4) +!52 = !DILocation(line: 49, column: 19, scope: !4) +!53 = !DILocation(line: 63, column: 37, scope: !4) +!54 = !DILocation(line: 64, column: 25, scope: !4) +!55 = !DILocation(line: 30, column: 45, scope: !4) +!56 = !DILocation(line: 52, column: 20, scope: !4) +!57 = !DILocation(line: 60, column: 20, scope: !4) +!58 = !DILocation(line: 61, column: 20, scope: !4) +!59 = !DILocation(line: 64, column: 37, scope: !4) +!60 = !DILocation(line: 64, column: 4, scope: !4) diff --git a/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ptx b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ptx new file mode 100644 index 0000000000000000000000000000000000000000..caee9edec3e37d8739695db2fe64c2511513e0de --- /dev/null +++ b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ptx @@ -0,0 +1,517 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3 // -- Begin function triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3 + // @triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3 +.visible .entry triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3( + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_4, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_5, + .param .u32 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_6, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_7, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_8 +) +.reqntid 256 +{ + .reg .pred %p<5>; + .reg .b16 %rs<20>; + .reg .b32 %r<60>; + .reg .b64 %rd<34>; + .loc 1 18 0 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:18:0 + +// %bb.0: + ld.param.b64 %rd23, [triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_0]; + ld.param.b64 %rd24, [triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_1]; +$L__tmp0: + .loc 1 20 28 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:20:28 + mov.u32 %r9, %ctaid.x; + .loc 1 20 33 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:20:33 + shl.b32 %r10, %r9, 9; + ld.param.b64 %rd25, [triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_2]; + ld.param.b64 %rd26, [triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_3]; + .loc 1 21 36 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:21:36 + mov.u32 %r11, %tid.x; + shl.b32 %r12, %r11, 1; + ld.param.b64 %rd27, [triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_4]; + and.b32 %r13, %r12, 510; + ld.param.b64 %rd28, [triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_5]; + .loc 1 21 23 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:21:23 + or.b32 %r14, %r13, %r10; + or.b32 %r15, %r14, 1; + .loc 1 26 19 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:26:19 + bfe.s32 %r16, %r9, 22, 1; + .loc 1 24 19 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:24:19 + shr.u32 %r17, %r16, 25; + .loc 1 26 19 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:26:19 + add.s32 %r18, %r14, %r17; + .loc 1 24 19 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:24:19 + and.b32 %r19, %r18, -128; + sub.s32 %r20, %r14, %r19; + add.s32 %r21, %r15, %r17; + and.b32 %r22, %r21, 65408; + sub.s32 %r23, %r15, %r22; + .loc 1 25 19 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:25:19 + shr.u32 %r24, %r16, 20; + add.s32 %r25, %r14, %r24; + shr.s32 %r26, %r25, 12; + .loc 1 27 30 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:27:30 + mul.wide.s32 %rd29, %r14, 2; + add.s64 %rd1, %rd23, %rd29; + .loc 1 27 35 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:27:35 + // begin inline asm + mov.u32 %r1, 0x0; + ld.global.b32 { %r1 }, [ %rd1 + 0 ]; + // end inline asm + .loc 1 28 39 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:28:39 + shl.b32 %r27, %r26, 7; + .loc 1 28 35 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:28:35 + add.s32 %r28, %r27, %r20; + .loc 1 28 30 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:28:30 + mul.wide.s32 %rd30, %r28, 4; + add.s64 %rd2, %rd24, %rd30; + .loc 1 28 44 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:28:44 + // begin inline asm + mov.u64 %rd3, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd3, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r2, %r3 }, [ %rd2 + 0 ], %rd3; + // end inline asm + .loc 1 29 31 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:29:31 + add.s64 %rd4, %rd25, %rd30; + .loc 1 29 45 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:29:45 + // begin inline asm + mov.u64 %rd5, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd5, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r4, 0x0; + mov.u32 %r5, 0x0; + ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r4, %r5 }, [ %rd4 + 0 ], %rd5; + // end inline asm + .loc 1 30 31 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:30:31 + add.s64 %rd6, %rd26, %rd29; + .loc 1 30 36 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:30:36 + // begin inline asm + mov.u32 %r6, 0x0; + ld.global.b32 { %r6 }, [ %rd6 + 0 ]; + // end inline asm + .loc 1 33 17 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:33:17 + bfe.u32 %r29, %r9, 22, 1; + add.s32 %r30, %r15, %r29; + and.b32 %r31, %r30, -2; + sub.s32 %r32, %r15, %r31; + .loc 1 37 18 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:37:18 + setp.lt.s32 %p2, %r32, 1; + .loc 1 38 43 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:38:43 + cvt.u16.u32 %rs10, %r23; + and.b16 %rs11, %rs10, 128; + shr.u16 %rs12, %rs11, 7; + add.s16 %rs13, %rs10, %rs12; + cvt.s16.s8 %rs14, %rs13; + shr.s16 %rs15, %rs14, 1; + .loc 1 38 48 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:38:48 + mad.wide.s16 %r33, %rs15, 2, %r19; + or.b32 %r34, %r33, 1; + .loc 1 38 30 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:38:30 + mul.wide.s32 %rd31, %r15, 2; + add.s64 %rd7, %rd23, %rd31; + mul.wide.s32 %rd32, %r34, 2; + add.s64 %rd9, %rd23, %rd32; + .loc 1 38 57 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:38:57 + // begin inline asm + mov.u64 %rd8, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd8, 1.0; + // end inline asm + mov.b16 %rs2, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u16 %rs1, %rs2; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs1 }, [ %rd7 + 0 ], %rd8; + // end inline asm + // begin inline asm + mov.u64 %rd10, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd10, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs3, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs3 }, [ %rd9 + 0 ], %rd10; + // end inline asm + .loc 1 38 107 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:38:107 + cvt.f32.bf16 %r35, %rs1; + cvt.f32.bf16 %r36, %rs3; + mov.b32 %r37, 0f00000000; + .loc 1 39 13 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:39:13 + sub.f32 %r38, %r37, %r35; + sub.f32 %r39, %r37, %r36; + .loc 1 42 20 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:42:20 + setp.gt.s32 %p4, %r32, 0; + .loc 1 45 31 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:45:31 + mul.wide.s32 %rd33, %r33, 2; + add.s64 %rd12, %rd23, %rd33; + .loc 1 45 54 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:45:54 + // begin inline asm + mov.u64 %rd11, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd11, 1.0; + // end inline asm + mov.pred %p3, 0; + // begin inline asm + mov.u16 %rs4, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs4 }, [ %rd1 + 0 ], %rd11; + // end inline asm + // begin inline asm + mov.u64 %rd13, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd13, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs5, %rs2; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs5 }, [ %rd12 + 0 ], %rd13; + // end inline asm + .loc 1 45 105 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:45:105 + cvt.f32.bf16 %r40, %rs5; + .loc 1 0 0 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:0 + selp.f32 %r41, %r39, %r40, %p2; + .loc 1 53 31 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:53:31 + add.s64 %rd14, %rd26, %rd31; + add.s64 %rd16, %rd26, %rd32; + .loc 1 53 58 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:53:58 + // begin inline asm + mov.u64 %rd15, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd15, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs6, %rs2; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs6 }, [ %rd14 + 0 ], %rd15; + // end inline asm + // begin inline asm + mov.u64 %rd17, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs7, %rs2; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs7 }, [ %rd16 + 0 ], %rd17; + // end inline asm + .loc 1 53 108 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:53:108 + cvt.f32.bf16 %r42, %rs6; + cvt.f32.bf16 %r43, %rs7; + .loc 1 54 13 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:54:13 + sub.f32 %r44, %r37, %r42; + sub.f32 %r45, %r37, %r43; + .loc 1 57 31 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:57:31 + add.s64 %rd19, %rd26, %rd33; + .loc 1 57 54 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:57:54 + // begin inline asm + mov.u64 %rd18, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd18, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs8, %rs2; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs8 }, [ %rd6 + 0 ], %rd18; + // end inline asm + // begin inline asm + mov.u64 %rd20, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd20, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs9, %rs2; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd19 + 0 ], %rd20; + // end inline asm + .loc 1 57 105 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:57:105 + cvt.f32.bf16 %r46, %rs9; + .loc 1 0 0 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:0 + selp.f32 %r47, %r45, %r46, %p2; + .loc 1 63 25 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:63:25 + add.s64 %rd21, %rd27, %rd29; + .loc 1 27 44 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:27:44 + mov.b32 {%rs16, %rs17}, %r1; + cvt.f32.bf16 %r48, %rs16; + cvt.f32.bf16 %r49, %rs17; + .loc 1 48 20 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:48:20 + mul.f32 %r50, %r41, %r5; + mul.f32 %r51, %r38, %r4; + .loc 1 49 19 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:49:19 + fma.rn.f32 %r52, %r49, %r3, %r50; + fma.rn.f32 %r53, %r48, %r2, %r51; + .loc 1 63 37 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:63:37 + cvt.rn.bf16x2.f32 %r7, %r52, %r53; + // begin inline asm + st.global.b32 [ %rd21 + 0 ], { %r7 }; + // end inline asm + .loc 1 64 25 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:64:25 + add.s64 %rd22, %rd28, %rd29; + .loc 1 30 45 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:30:45 + mov.b32 {%rs18, %rs19}, %r6; + cvt.f32.bf16 %r54, %rs18; + cvt.f32.bf16 %r55, %rs19; + .loc 1 60 20 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:60:20 + mul.f32 %r56, %r47, %r5; + mul.f32 %r57, %r44, %r4; + .loc 1 61 20 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:61:20 + fma.rn.f32 %r58, %r3, %r55, %r56; + fma.rn.f32 %r59, %r2, %r54, %r57; + .loc 1 64 37 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:64:37 + cvt.rn.bf16x2.f32 %r8, %r58, %r59; + // begin inline asm + st.global.b32 [ %rd22 + 0 ], { %r8 }; + // end inline asm + .loc 1 64 4 // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:64:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 106 +.b8 54 +.b8 54 +.b8 116 +.b8 103 +.b8 98 +.b8 102 +.b8 113 +.b8 120 +.b8 55 +.b8 114 +.b8 104 +.b8 121 +.b8 116 +.b8 99 +.b8 121 +.b8 119 +.b8 109 +.b8 106 +.b8 100 +.b8 99 +.b8 105 +.b8 109 +.b8 110 +.b8 119 +.b8 119 +.b8 116 +.b8 113 +.b8 54 +.b8 120 +.b8 106 +.b8 103 +.b8 98 +.b8 50 +.b8 113 +.b8 98 +.b8 113 +.b8 98 +.b8 120 +.b8 120 +.b8 111 +.b8 110 +.b8 97 +.b8 108 +.b8 100 +.b8 111 +.b8 116 +.b8 120 +.b8 54 +.b8 51 +.b8 118 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 106 +.b8 54 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.source b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.source new file mode 100644 index 0000000000000000000000000000000000000000..eed402848ce10edbf4fc6802fde8521e8b6422e4 --- /dev/null +++ b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.source @@ -0,0 +1,352 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":18:0) +#loc81 = loc("in_ptr0"(#loc)) +#loc82 = loc("in_ptr1"(#loc)) +#loc83 = loc("in_ptr2"(#loc)) +#loc84 = loc("in_ptr3"(#loc)) +#loc85 = loc("out_ptr0"(#loc)) +#loc86 = loc("out_ptr1"(#loc)) +#loc87 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 9437184 : i32 loc(#loc88) + %xoffset = tt.get_program_id x : i32 loc(#loc89) + %xoffset_1 = arith.constant 512 : i32 loc(#loc90) + %xoffset_2 = arith.constant 512 : i32 loc(#loc90) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc90) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc91) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc92) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc92) + %xmask = arith.constant true loc(#loc93) + %xmask_6 = arith.constant dense : tensor<512xi1> loc(#loc93) + %x0 = arith.constant 128 : i32 loc(#loc94) + %x0_7 = arith.constant 128 : i32 loc(#loc94) + %x0_8 = arith.constant dense<128> : tensor<512xi32> loc(#loc94) + %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<512xi32> loc(#loc94) + %x2 = arith.constant 4096 : i32 loc(#loc95) + %x2_10 = arith.constant 4096 : i32 loc(#loc95) + %x2_11 = arith.constant dense<4096> : tensor<512xi32> loc(#loc95) + %x2_12 = arith.divsi %xindex_5, %x2_11 : tensor<512xi32> loc(#loc95) + %x4 = arith.constant 128 : i32 loc(#loc96) + %x4_13 = arith.constant 128 : i32 loc(#loc96) + %x4_14 = arith.constant dense<128> : tensor<512xi32> loc(#loc96) + %x4_15 = arith.divsi %xindex_5, %x4_14 : tensor<512xi32> loc(#loc96) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc97) + %tmp0_16 = tt.addptr %tmp0, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc97) + %tmp0_17 = tt.load %tmp0_16 : tensor<512x!tt.ptr> loc(#loc98) + %tmp0_18 = arith.extf %tmp0_17 : tensor<512xbf16> to tensor<512xf32> loc(#loc99) + %tmp2 = arith.constant 128 : i32 loc(#loc100) + %tmp2_19 = arith.constant 128 : i32 loc(#loc100) + %tmp2_20 = arith.constant dense<128> : tensor<512xi32> loc(#loc100) + %tmp2_21 = arith.muli %tmp2_20, %x2_12 : tensor<512xi32> loc(#loc100) + %tmp2_22 = arith.addi %x0_9, %tmp2_21 : tensor<512xi32> loc(#loc101) + %tmp2_23 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc102) + %tmp2_24 = tt.addptr %tmp2_23, %tmp2_22 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc102) + %tmp2_25 = tt.load %tmp2_24 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc103) + %tmp19 = arith.constant 128 : i32 loc(#loc104) + %tmp19_26 = arith.constant 128 : i32 loc(#loc104) + %tmp19_27 = arith.constant dense<128> : tensor<512xi32> loc(#loc104) + %tmp19_28 = arith.muli %tmp19_27, %x2_12 : tensor<512xi32> loc(#loc104) + %tmp19_29 = arith.addi %x0_9, %tmp19_28 : tensor<512xi32> loc(#loc105) + %tmp19_30 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc106) + %tmp19_31 = tt.addptr %tmp19_30, %tmp19_29 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc106) + %tmp19_32 = tt.load %tmp19_31 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc107) + %tmp23 = tt.splat %in_ptr3 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc108) + %tmp23_33 = tt.addptr %tmp23, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc108) + %tmp23_34 = tt.load %tmp23_33 : tensor<512x!tt.ptr> loc(#loc109) + %tmp23_35 = arith.extf %tmp23_34 : tensor<512xbf16> to tensor<512xf32> loc(#loc110) + %tmp3 = arith.mulf %tmp0_18, %tmp2_25 : tensor<512xf32> loc(#loc111) + %tmp4 = arith.constant 2 : i32 loc(#loc112) + %tmp4_36 = arith.constant 2 : i32 loc(#loc112) + %tmp4_37 = arith.constant dense<2> : tensor<512xi32> loc(#loc112) + %tmp4_38 = arith.remsi %xindex_5, %tmp4_37 : tensor<512xi32> loc(#loc112) + %tmp5 = arith.constant 0 : i64 loc(#loc113) + %tmp5_39 = arith.constant dense<0> : tensor<1xi64> loc(#loc113) + %tmp6 = arith.extsi %tmp4_38 : tensor<512xi32> to tensor<512xi64> loc(#loc114) + %tmp6_40 = arith.constant dense<0> : tensor<512xi64> loc(#loc114) + %tmp6_41 = arith.cmpi sge, %tmp6, %tmp6_40 : tensor<512xi64> loc(#loc114) + %tmp7 = arith.constant 1 : i64 loc(#loc115) + %tmp7_42 = arith.constant dense<1> : tensor<1xi64> loc(#loc115) + %tmp8 = arith.extsi %tmp4_38 : tensor<512xi32> to tensor<512xi64> loc(#loc116) + %tmp8_43 = arith.constant dense<1> : tensor<512xi64> loc(#loc116) + %tmp8_44 = arith.cmpi slt, %tmp8, %tmp8_43 : tensor<512xi64> loc(#loc116) + %tmp9 = arith.constant 2 : i32 loc(#loc117) + %tmp9_45 = arith.constant 2 : i32 loc(#loc117) + %tmp9_46 = arith.constant dense<2> : tensor<512xi32> loc(#loc117) + %tmp9_47 = arith.divsi %x0_9, %tmp9_46 : tensor<512xi32> loc(#loc117) + %tmp9_48 = arith.constant 2 : i32 loc(#loc118) + %tmp9_49 = arith.constant 2 : i32 loc(#loc118) + %tmp9_50 = arith.constant dense<2> : tensor<512xi32> loc(#loc118) + %tmp9_51 = arith.muli %tmp9_50, %tmp9_47 : tensor<512xi32> loc(#loc118) + %tmp9_52 = arith.constant 1 : i32 loc(#loc119) + %tmp9_53 = arith.constant 1 : i32 loc(#loc119) + %tmp9_54 = arith.constant dense<1> : tensor<512xi32> loc(#loc119) + %tmp9_55 = arith.addi %tmp9_54, %tmp9_51 : tensor<512xi32> loc(#loc119) + %tmp9_56 = arith.constant 128 : i32 loc(#loc120) + %tmp9_57 = arith.constant 128 : i32 loc(#loc120) + %tmp9_58 = arith.constant dense<128> : tensor<512xi32> loc(#loc120) + %tmp9_59 = arith.muli %tmp9_58, %x4_15 : tensor<512xi32> loc(#loc120) + %tmp9_60 = arith.addi %tmp9_55, %tmp9_59 : tensor<512xi32> loc(#loc121) + %tmp9_61 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc122) + %tmp9_62 = tt.addptr %tmp9_61, %tmp9_60 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc122) + %tmp9_63 = arith.constant 0.000000e+00 : f32 loc(#loc123) + %tmp9_64 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc123) + %tmp9_65 = arith.truncf %tmp9_64 : tensor<512xf32> to tensor<512xbf16> loc(#loc123) + %tmp9_66 = tt.load %tmp9_62, %tmp8_44, %tmp9_65 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc123) + %tmp9_67 = arith.extf %tmp9_66 : tensor<512xbf16> to tensor<512xf32> loc(#loc124) + %tmp10 = arith.constant 0.000000e+00 : f32 loc(#loc125) + %tmp10_68 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc125) + %tmp10_69 = arith.subf %tmp10_68, %tmp9_67 : tensor<512xf32> loc(#loc125) + %tmp11 = arith.constant 0.000000e+00 : f32 loc(#loc126) + %tmp11_70 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc126) + %tmp12 = arith.select %tmp8_44, %tmp10_69, %tmp11_70 : tensor<512xi1>, tensor<512xf32> loc(#loc127) + %tmp13 = arith.extsi %tmp4_38 : tensor<512xi32> to tensor<512xi64> loc(#loc128) + %tmp13_71 = arith.constant dense<1> : tensor<512xi64> loc(#loc128) + %tmp13_72 = arith.cmpi sge, %tmp13, %tmp13_71 : tensor<512xi64> loc(#loc128) + %tmp14 = arith.constant 2 : i64 loc(#loc129) + %tmp14_73 = arith.constant dense<2> : tensor<1xi64> loc(#loc129) + %tmp15 = arith.extsi %tmp4_38 : tensor<512xi32> to tensor<512xi64> loc(#loc130) + %tmp15_74 = arith.constant dense<2> : tensor<512xi64> loc(#loc130) + %tmp15_75 = arith.cmpi slt, %tmp15, %tmp15_74 : tensor<512xi64> loc(#loc130) + %tmp16 = arith.constant 2 : i32 loc(#loc131) + %tmp16_76 = arith.constant 2 : i32 loc(#loc131) + %tmp16_77 = arith.constant dense<2> : tensor<512xi32> loc(#loc131) + %tmp16_78 = arith.divsi %x0_9, %tmp16_77 : tensor<512xi32> loc(#loc131) + %tmp16_79 = arith.constant 2 : i32 loc(#loc132) + %tmp16_80 = arith.constant 2 : i32 loc(#loc132) + %tmp16_81 = arith.constant dense<2> : tensor<512xi32> loc(#loc132) + %tmp16_82 = arith.muli %tmp16_81, %tmp16_78 : tensor<512xi32> loc(#loc132) + %tmp16_83 = arith.constant 128 : i32 loc(#loc133) + %tmp16_84 = arith.constant 128 : i32 loc(#loc133) + %tmp16_85 = arith.constant dense<128> : tensor<512xi32> loc(#loc133) + %tmp16_86 = arith.muli %tmp16_85, %x4_15 : tensor<512xi32> loc(#loc133) + %tmp16_87 = arith.addi %tmp16_82, %tmp16_86 : tensor<512xi32> loc(#loc134) + %tmp16_88 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc135) + %tmp16_89 = tt.addptr %tmp16_88, %tmp16_87 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc135) + %tmp16_90 = arith.constant 0.000000e+00 : f32 loc(#loc136) + %tmp16_91 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc136) + %tmp16_92 = arith.truncf %tmp16_91 : tensor<512xf32> to tensor<512xbf16> loc(#loc136) + %tmp16_93 = tt.load %tmp16_89, %tmp13_72, %tmp16_92 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc136) + %tmp16_94 = arith.extf %tmp16_93 : tensor<512xbf16> to tensor<512xf32> loc(#loc137) + %tmp17 = arith.select %tmp8_44, %tmp12, %tmp16_94 : tensor<512xi1>, tensor<512xf32> loc(#loc138) + %tmp20 = arith.mulf %tmp17, %tmp19_32 : tensor<512xf32> loc(#loc139) + %tmp21 = arith.addf %tmp3, %tmp20 : tensor<512xf32> loc(#loc140) + %tmp25 = arith.mulf %tmp23_35, %tmp2_25 : tensor<512xf32> loc(#loc141) + %tmp26 = arith.constant 2 : i32 loc(#loc142) + %tmp26_95 = arith.constant 2 : i32 loc(#loc142) + %tmp26_96 = arith.constant dense<2> : tensor<512xi32> loc(#loc142) + %tmp26_97 = arith.divsi %x0_9, %tmp26_96 : tensor<512xi32> loc(#loc142) + %tmp26_98 = arith.constant 2 : i32 loc(#loc143) + %tmp26_99 = arith.constant 2 : i32 loc(#loc143) + %tmp26_100 = arith.constant dense<2> : tensor<512xi32> loc(#loc143) + %tmp26_101 = arith.muli %tmp26_100, %tmp26_97 : tensor<512xi32> loc(#loc143) + %tmp26_102 = arith.constant 1 : i32 loc(#loc144) + %tmp26_103 = arith.constant 1 : i32 loc(#loc144) + %tmp26_104 = arith.constant dense<1> : tensor<512xi32> loc(#loc144) + %tmp26_105 = arith.addi %tmp26_104, %tmp26_101 : tensor<512xi32> loc(#loc144) + %tmp26_106 = arith.constant 128 : i32 loc(#loc145) + %tmp26_107 = arith.constant 128 : i32 loc(#loc145) + %tmp26_108 = arith.constant dense<128> : tensor<512xi32> loc(#loc145) + %tmp26_109 = arith.muli %tmp26_108, %x4_15 : tensor<512xi32> loc(#loc145) + %tmp26_110 = arith.addi %tmp26_105, %tmp26_109 : tensor<512xi32> loc(#loc146) + %tmp26_111 = tt.splat %in_ptr3 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc147) + %tmp26_112 = tt.addptr %tmp26_111, %tmp26_110 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc147) + %tmp26_113 = arith.constant 0.000000e+00 : f32 loc(#loc148) + %tmp26_114 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc148) + %tmp26_115 = arith.truncf %tmp26_114 : tensor<512xf32> to tensor<512xbf16> loc(#loc148) + %tmp26_116 = tt.load %tmp26_112, %tmp8_44, %tmp26_115 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc148) + %tmp26_117 = arith.extf %tmp26_116 : tensor<512xbf16> to tensor<512xf32> loc(#loc149) + %tmp27 = arith.constant 0.000000e+00 : f32 loc(#loc150) + %tmp27_118 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc150) + %tmp27_119 = arith.subf %tmp27_118, %tmp26_117 : tensor<512xf32> loc(#loc150) + %tmp28 = arith.constant 0.000000e+00 : f32 loc(#loc151) + %tmp28_120 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc151) + %tmp29 = arith.select %tmp8_44, %tmp27_119, %tmp28_120 : tensor<512xi1>, tensor<512xf32> loc(#loc152) + %tmp30 = arith.constant 2 : i32 loc(#loc153) + %tmp30_121 = arith.constant 2 : i32 loc(#loc153) + %tmp30_122 = arith.constant dense<2> : tensor<512xi32> loc(#loc153) + %tmp30_123 = arith.divsi %x0_9, %tmp30_122 : tensor<512xi32> loc(#loc153) + %tmp30_124 = arith.constant 2 : i32 loc(#loc154) + %tmp30_125 = arith.constant 2 : i32 loc(#loc154) + %tmp30_126 = arith.constant dense<2> : tensor<512xi32> loc(#loc154) + %tmp30_127 = arith.muli %tmp30_126, %tmp30_123 : tensor<512xi32> loc(#loc154) + %tmp30_128 = arith.constant 128 : i32 loc(#loc155) + %tmp30_129 = arith.constant 128 : i32 loc(#loc155) + %tmp30_130 = arith.constant dense<128> : tensor<512xi32> loc(#loc155) + %tmp30_131 = arith.muli %tmp30_130, %x4_15 : tensor<512xi32> loc(#loc155) + %tmp30_132 = arith.addi %tmp30_127, %tmp30_131 : tensor<512xi32> loc(#loc156) + %tmp30_133 = tt.splat %in_ptr3 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc157) + %tmp30_134 = tt.addptr %tmp30_133, %tmp30_132 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc157) + %tmp30_135 = arith.constant 0.000000e+00 : f32 loc(#loc158) + %tmp30_136 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc158) + %tmp30_137 = arith.truncf %tmp30_136 : tensor<512xf32> to tensor<512xbf16> loc(#loc158) + %tmp30_138 = tt.load %tmp30_134, %tmp13_72, %tmp30_137 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc158) + %tmp30_139 = arith.extf %tmp30_138 : tensor<512xbf16> to tensor<512xf32> loc(#loc159) + %tmp31 = arith.select %tmp8_44, %tmp29, %tmp30_139 : tensor<512xi1>, tensor<512xf32> loc(#loc160) + %tmp33 = arith.mulf %tmp31, %tmp19_32 : tensor<512xf32> loc(#loc161) + %tmp34 = arith.addf %tmp25, %tmp33 : tensor<512xf32> loc(#loc162) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc76) + %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc76) + %2 = arith.truncf %tmp21 : tensor<512xf32> to tensor<512xbf16> loc(#loc77) + tt.store %1, %2 : tensor<512x!tt.ptr> loc(#loc77) + %3 = tt.splat %out_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc78) + %4 = tt.addptr %3, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc78) + %5 = arith.truncf %tmp34 : tensor<512xf32> to tensor<512xbf16> loc(#loc79) + tt.store %4, %5 : tensor<512x!tt.ptr> loc(#loc79) + tt.return loc(#loc80) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":22:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":24:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":25:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":26:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:30) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:44) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:39) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:35) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:30) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:44) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:40) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:36) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:31) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:45) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:31) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:36) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:45) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":32:18) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":33:17) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":34:27) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":35:19) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":36:27) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":37:18) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:43) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:37) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:34) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:52) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:48) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:30) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:57) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:107) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":39:13) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":40:38) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":41:34) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":42:20) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":43:28) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":44:19) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:40) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:34) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:49) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:45) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:31) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:54) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:105) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":46:34) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":48:20) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":49:19) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":52:20) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:44) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:38) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:35) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:53) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:49) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:31) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:58) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:108) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":54:13) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":55:38) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":56:34) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:40) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:34) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:49) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:45) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:31) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:54) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:105) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":58:34) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":60:20) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":61:20) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":63:25) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":63:37) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:25) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:37) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:4) +#loc88 = loc("xnumel"(#loc1)) +#loc89 = loc("xoffset"(#loc2)) +#loc90 = loc("xoffset"(#loc3)) +#loc91 = loc("xindex"(#loc4)) +#loc92 = loc("xindex"(#loc5)) +#loc93 = loc("xmask"(#loc6)) +#loc94 = loc("x0"(#loc7)) +#loc95 = loc("x2"(#loc8)) +#loc96 = loc("x4"(#loc9)) +#loc97 = loc("tmp0"(#loc10)) +#loc98 = loc("tmp0"(#loc11)) +#loc99 = loc("tmp0"(#loc12)) +#loc100 = loc("tmp2"(#loc13)) +#loc101 = loc("tmp2"(#loc14)) +#loc102 = loc("tmp2"(#loc15)) +#loc103 = loc("tmp2"(#loc16)) +#loc104 = loc("tmp19"(#loc17)) +#loc105 = loc("tmp19"(#loc18)) +#loc106 = loc("tmp19"(#loc19)) +#loc107 = loc("tmp19"(#loc20)) +#loc108 = loc("tmp23"(#loc21)) +#loc109 = loc("tmp23"(#loc22)) +#loc110 = loc("tmp23"(#loc23)) +#loc111 = loc("tmp3"(#loc24)) +#loc112 = loc("tmp4"(#loc25)) +#loc113 = loc("tmp5"(#loc26)) +#loc114 = loc("tmp6"(#loc27)) +#loc115 = loc("tmp7"(#loc28)) +#loc116 = loc("tmp8"(#loc29)) +#loc117 = loc("tmp9"(#loc30)) +#loc118 = loc("tmp9"(#loc31)) +#loc119 = loc("tmp9"(#loc32)) +#loc120 = loc("tmp9"(#loc33)) +#loc121 = loc("tmp9"(#loc34)) +#loc122 = loc("tmp9"(#loc35)) +#loc123 = loc("tmp9"(#loc36)) +#loc124 = loc("tmp9"(#loc37)) +#loc125 = loc("tmp10"(#loc38)) +#loc126 = loc("tmp11"(#loc39)) +#loc127 = loc("tmp12"(#loc40)) +#loc128 = loc("tmp13"(#loc41)) +#loc129 = loc("tmp14"(#loc42)) +#loc130 = loc("tmp15"(#loc43)) +#loc131 = loc("tmp16"(#loc44)) +#loc132 = loc("tmp16"(#loc45)) +#loc133 = loc("tmp16"(#loc46)) +#loc134 = loc("tmp16"(#loc47)) +#loc135 = loc("tmp16"(#loc48)) +#loc136 = loc("tmp16"(#loc49)) +#loc137 = loc("tmp16"(#loc50)) +#loc138 = loc("tmp17"(#loc51)) +#loc139 = loc("tmp20"(#loc52)) +#loc140 = loc("tmp21"(#loc53)) +#loc141 = loc("tmp25"(#loc54)) +#loc142 = loc("tmp26"(#loc55)) +#loc143 = loc("tmp26"(#loc56)) +#loc144 = loc("tmp26"(#loc57)) +#loc145 = loc("tmp26"(#loc58)) +#loc146 = loc("tmp26"(#loc59)) +#loc147 = loc("tmp26"(#loc60)) +#loc148 = loc("tmp26"(#loc61)) +#loc149 = loc("tmp26"(#loc62)) +#loc150 = loc("tmp27"(#loc63)) +#loc151 = loc("tmp28"(#loc64)) +#loc152 = loc("tmp29"(#loc65)) +#loc153 = loc("tmp30"(#loc66)) +#loc154 = loc("tmp30"(#loc67)) +#loc155 = loc("tmp30"(#loc68)) +#loc156 = loc("tmp30"(#loc69)) +#loc157 = loc("tmp30"(#loc70)) +#loc158 = loc("tmp30"(#loc71)) +#loc159 = loc("tmp30"(#loc72)) +#loc160 = loc("tmp31"(#loc73)) +#loc161 = loc("tmp33"(#loc74)) +#loc162 = loc("tmp34"(#loc75)) diff --git a/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttgir b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..d24e57683476b87db609db1f92bd32e40635f906 --- /dev/null +++ b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttgir @@ -0,0 +1,198 @@ +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":18:0) +#loc59 = loc("in_ptr0"(#loc)) +#loc60 = loc("in_ptr1"(#loc)) +#loc61 = loc("in_ptr2"(#loc)) +#loc62 = loc("in_ptr3"(#loc)) +#loc63 = loc("out_ptr0"(#loc)) +#loc64 = loc("out_ptr1"(#loc)) +#loc65 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<1> : tensor<512xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<1> : tensor<512xi64, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<2> : tensor<512xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<4096> : tensor<512xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<128> : tensor<512xi32, #blocked> loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<512xbf16, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc66) + %xoffset_6 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc67) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc68) + %xindex_7 = tt.splat %xoffset_6 : i32 -> tensor<512xi32, #blocked> loc(#loc69) + %xindex_8 = arith.addi %xindex_7, %xindex : tensor<512xi32, #blocked> loc(#loc69) + %x0 = arith.remsi %xindex_8, %cst_3 : tensor<512xi32, #blocked> loc(#loc70) + %x2 = arith.divsi %xindex_8, %cst_2 : tensor<512xi32, #blocked> loc(#loc71) + %x4 = arith.divsi %xindex_8, %cst_3 : tensor<512xi32, #blocked> loc(#loc72) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc73) + %tmp0_9 = tt.addptr %tmp0, %xindex_8 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc73) + %tmp0_10 = tt.load %tmp0_9 : tensor<512x!tt.ptr, #blocked> loc(#loc74) + %tmp0_11 = arith.extf %tmp0_10 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc75) + %tmp2 = arith.muli %x2, %cst_3 : tensor<512xi32, #blocked> loc(#loc76) + %tmp2_12 = arith.addi %x0, %tmp2 : tensor<512xi32, #blocked> loc(#loc77) + %tmp2_13 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc78) + %tmp2_14 = tt.addptr %tmp2_13, %tmp2_12 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc78) + %tmp2_15 = tt.load %tmp2_14 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc79) + %tmp19 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc80) + %tmp19_16 = tt.addptr %tmp19, %tmp2_12 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc80) + %tmp19_17 = tt.load %tmp19_16 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc81) + %tmp23 = tt.splat %in_ptr3 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc82) + %tmp23_18 = tt.addptr %tmp23, %xindex_8 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc82) + %tmp23_19 = tt.load %tmp23_18 : tensor<512x!tt.ptr, #blocked> loc(#loc83) + %tmp23_20 = arith.extf %tmp23_19 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc84) + %tmp3 = arith.mulf %tmp0_11, %tmp2_15 : tensor<512xf32, #blocked> loc(#loc85) + %tmp4 = arith.remsi %xindex_8, %cst_1 : tensor<512xi32, #blocked> loc(#loc86) + %tmp8 = arith.extsi %tmp4 : tensor<512xi32, #blocked> to tensor<512xi64, #blocked> loc(#loc87) + %tmp8_21 = arith.cmpi slt, %tmp8, %cst_0 : tensor<512xi64, #blocked> loc(#loc87) + %tmp9 = arith.divsi %x0, %cst_1 : tensor<512xi32, #blocked> loc(#loc88) + %tmp9_22 = arith.muli %tmp9, %cst_1 : tensor<512xi32, #blocked> loc(#loc89) + %tmp9_23 = arith.addi %tmp9_22, %cst : tensor<512xi32, #blocked> loc(#loc90) + %tmp9_24 = arith.muli %x4, %cst_3 : tensor<512xi32, #blocked> loc(#loc91) + %tmp9_25 = arith.addi %tmp9_23, %tmp9_24 : tensor<512xi32, #blocked> loc(#loc92) + %tmp9_26 = tt.addptr %tmp0, %tmp9_25 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc93) + %tmp9_27 = tt.load %tmp9_26, %tmp8_21, %cst_4 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc94) + %tmp9_28 = arith.extf %tmp9_27 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc95) + %tmp10 = arith.subf %cst_5, %tmp9_28 : tensor<512xf32, #blocked> loc(#loc96) + %tmp13 = arith.cmpi sge, %tmp8, %cst_0 : tensor<512xi64, #blocked> loc(#loc97) + %tmp16 = arith.addi %tmp9_22, %tmp9_24 : tensor<512xi32, #blocked> loc(#loc98) + %tmp16_29 = tt.addptr %tmp0, %tmp16 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc99) + %tmp16_30 = tt.load %tmp16_29, %tmp13, %cst_4 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc100) + %tmp16_31 = arith.extf %tmp16_30 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc101) + %tmp17 = arith.select %tmp8_21, %tmp10, %tmp16_31 : tensor<512xi1, #blocked>, tensor<512xf32, #blocked> loc(#loc118) + %tmp20 = arith.mulf %tmp17, %tmp19_17 : tensor<512xf32, #blocked> loc(#loc104) + %tmp21 = arith.addf %tmp3, %tmp20 : tensor<512xf32, #blocked> loc(#loc105) + %tmp25 = arith.mulf %tmp23_20, %tmp2_15 : tensor<512xf32, #blocked> loc(#loc106) + %tmp26 = tt.addptr %tmp23, %tmp9_25 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc107) + %tmp26_32 = tt.load %tmp26, %tmp8_21, %cst_4 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc108) + %tmp26_33 = arith.extf %tmp26_32 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc109) + %tmp27 = arith.subf %cst_5, %tmp26_33 : tensor<512xf32, #blocked> loc(#loc110) + %tmp30 = tt.addptr %tmp23, %tmp16 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc111) + %tmp30_34 = tt.load %tmp30, %tmp13, %cst_4 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc112) + %tmp30_35 = arith.extf %tmp30_34 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc113) + %tmp31 = arith.select %tmp8_21, %tmp27, %tmp30_35 : tensor<512xi1, #blocked>, tensor<512xf32, #blocked> loc(#loc119) + %tmp33 = arith.mulf %tmp31, %tmp19_17 : tensor<512xf32, #blocked> loc(#loc116) + %tmp34 = arith.addf %tmp25, %tmp33 : tensor<512xf32, #blocked> loc(#loc117) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc54) + %1 = tt.addptr %0, %xindex_8 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc54) + %2 = arith.truncf %tmp21 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> loc(#loc55) + tt.store %1, %2 : tensor<512x!tt.ptr, #blocked> loc(#loc55) + %3 = tt.splat %out_ptr1 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc56) + %4 = tt.addptr %3, %xindex_8 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc56) + %5 = arith.truncf %tmp34 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> loc(#loc57) + tt.store %4, %5 : tensor<512x!tt.ptr, #blocked> loc(#loc57) + tt.return loc(#loc58) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":24:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":25:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":26:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:30) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:35) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:44) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:39) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:35) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:44) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:31) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:45) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:31) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:36) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:45) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":32:18) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":33:17) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":37:18) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:43) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:37) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:34) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:52) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:48) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:30) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:57) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:107) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":39:13) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":42:20) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:45) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:31) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:54) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:105) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":46:34) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":41:34) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":48:20) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":49:19) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":52:20) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:31) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:58) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:108) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":54:13) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:31) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:54) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:105) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":58:34) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":56:34) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":60:20) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":61:20) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":63:25) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":63:37) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:25) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:37) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:4) +#loc66 = loc("xoffset"(#loc2)) +#loc67 = loc("xoffset"(#loc3)) +#loc68 = loc("xindex"(#loc4)) +#loc69 = loc("xindex"(#loc5)) +#loc70 = loc("x0"(#loc6)) +#loc71 = loc("x2"(#loc7)) +#loc72 = loc("x4"(#loc8)) +#loc73 = loc("tmp0"(#loc9)) +#loc74 = loc("tmp0"(#loc10)) +#loc75 = loc("tmp0"(#loc11)) +#loc76 = loc("tmp2"(#loc12)) +#loc77 = loc("tmp2"(#loc13)) +#loc78 = loc("tmp2"(#loc14)) +#loc79 = loc("tmp2"(#loc15)) +#loc80 = loc("tmp19"(#loc16)) +#loc81 = loc("tmp19"(#loc17)) +#loc82 = loc("tmp23"(#loc18)) +#loc83 = loc("tmp23"(#loc19)) +#loc84 = loc("tmp23"(#loc20)) +#loc85 = loc("tmp3"(#loc21)) +#loc86 = loc("tmp4"(#loc22)) +#loc87 = loc("tmp8"(#loc23)) +#loc88 = loc("tmp9"(#loc24)) +#loc89 = loc("tmp9"(#loc25)) +#loc90 = loc("tmp9"(#loc26)) +#loc91 = loc("tmp9"(#loc27)) +#loc92 = loc("tmp9"(#loc28)) +#loc93 = loc("tmp9"(#loc29)) +#loc94 = loc("tmp9"(#loc30)) +#loc95 = loc("tmp9"(#loc31)) +#loc96 = loc("tmp10"(#loc32)) +#loc97 = loc("tmp13"(#loc33)) +#loc98 = loc("tmp16"(#loc34)) +#loc99 = loc("tmp16"(#loc35)) +#loc100 = loc("tmp16"(#loc36)) +#loc101 = loc("tmp16"(#loc37)) +#loc102 = loc("tmp17"(#loc38)) +#loc103 = loc("tmp12"(#loc39)) +#loc104 = loc("tmp20"(#loc40)) +#loc105 = loc("tmp21"(#loc41)) +#loc106 = loc("tmp25"(#loc42)) +#loc107 = loc("tmp26"(#loc43)) +#loc108 = loc("tmp26"(#loc44)) +#loc109 = loc("tmp26"(#loc45)) +#loc110 = loc("tmp27"(#loc46)) +#loc111 = loc("tmp30"(#loc47)) +#loc112 = loc("tmp30"(#loc48)) +#loc113 = loc("tmp30"(#loc49)) +#loc114 = loc("tmp31"(#loc50)) +#loc115 = loc("tmp29"(#loc51)) +#loc116 = loc("tmp33"(#loc52)) +#loc117 = loc("tmp34"(#loc53)) +#loc118 = loc(fused[#loc102, #loc103]) +#loc119 = loc(fused[#loc114, #loc115]) diff --git a/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttir b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttir new file mode 100644 index 0000000000000000000000000000000000000000..bc0af5f2caf34db5e98fb28c624b801c691e76c0 --- /dev/null +++ b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttir @@ -0,0 +1,197 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":18:0) +#loc59 = loc("in_ptr0"(#loc)) +#loc60 = loc("in_ptr1"(#loc)) +#loc61 = loc("in_ptr2"(#loc)) +#loc62 = loc("in_ptr3"(#loc)) +#loc63 = loc("out_ptr0"(#loc)) +#loc64 = loc("out_ptr1"(#loc)) +#loc65 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<512xbf16> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<512xi32> loc(#loc1) + %cst_2 = arith.constant dense<1> : tensor<512xi64> loc(#loc1) + %cst_3 = arith.constant dense<2> : tensor<512xi32> loc(#loc1) + %x2 = arith.constant dense<4096> : tensor<512xi32> loc(#loc66) + %cst_4 = arith.constant dense<128> : tensor<512xi32> loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc67) + %xoffset_5 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc68) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc69) + %xindex_6 = tt.splat %xoffset_5 : i32 -> tensor<512xi32> loc(#loc70) + %xindex_7 = arith.addi %xindex_6, %xindex : tensor<512xi32> loc(#loc70) + %x0 = arith.remsi %xindex_7, %cst_4 : tensor<512xi32> loc(#loc71) + %x2_8 = arith.divsi %xindex_7, %x2 : tensor<512xi32> loc(#loc66) + %x4 = arith.divsi %xindex_7, %cst_4 : tensor<512xi32> loc(#loc72) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc73) + %tmp0_9 = tt.addptr %tmp0, %xindex_7 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc73) + %tmp0_10 = tt.load %tmp0_9 : tensor<512x!tt.ptr> loc(#loc74) + %tmp0_11 = arith.extf %tmp0_10 : tensor<512xbf16> to tensor<512xf32> loc(#loc75) + %tmp2 = arith.muli %x2_8, %cst_4 : tensor<512xi32> loc(#loc76) + %tmp2_12 = arith.addi %x0, %tmp2 : tensor<512xi32> loc(#loc77) + %tmp2_13 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc78) + %tmp2_14 = tt.addptr %tmp2_13, %tmp2_12 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc78) + %tmp2_15 = tt.load %tmp2_14 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc79) + %tmp19 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc80) + %tmp19_16 = tt.addptr %tmp19, %tmp2_12 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc80) + %tmp19_17 = tt.load %tmp19_16 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc81) + %tmp23 = tt.splat %in_ptr3 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc82) + %tmp23_18 = tt.addptr %tmp23, %xindex_7 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc82) + %tmp23_19 = tt.load %tmp23_18 : tensor<512x!tt.ptr> loc(#loc83) + %tmp23_20 = arith.extf %tmp23_19 : tensor<512xbf16> to tensor<512xf32> loc(#loc84) + %tmp3 = arith.mulf %tmp0_11, %tmp2_15 : tensor<512xf32> loc(#loc85) + %tmp4 = arith.remsi %xindex_7, %cst_3 : tensor<512xi32> loc(#loc86) + %tmp8 = arith.extsi %tmp4 : tensor<512xi32> to tensor<512xi64> loc(#loc87) + %tmp8_21 = arith.cmpi slt, %tmp8, %cst_2 : tensor<512xi64> loc(#loc87) + %tmp9 = arith.divsi %x0, %cst_3 : tensor<512xi32> loc(#loc88) + %tmp9_22 = arith.muli %tmp9, %cst_3 : tensor<512xi32> loc(#loc89) + %tmp9_23 = arith.addi %tmp9_22, %cst_1 : tensor<512xi32> loc(#loc90) + %tmp9_24 = arith.muli %x4, %cst_4 : tensor<512xi32> loc(#loc91) + %tmp9_25 = arith.addi %tmp9_23, %tmp9_24 : tensor<512xi32> loc(#loc92) + %tmp9_26 = tt.addptr %tmp0, %tmp9_25 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc93) + %tmp9_27 = tt.load %tmp9_26, %tmp8_21, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc94) + %tmp9_28 = arith.extf %tmp9_27 : tensor<512xbf16> to tensor<512xf32> loc(#loc95) + %tmp10 = arith.subf %cst_0, %tmp9_28 : tensor<512xf32> loc(#loc96) + %tmp13 = arith.cmpi sge, %tmp8, %cst_2 : tensor<512xi64> loc(#loc97) + %tmp16 = arith.addi %tmp9_22, %tmp9_24 : tensor<512xi32> loc(#loc98) + %tmp16_29 = tt.addptr %tmp0, %tmp16 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc99) + %tmp16_30 = tt.load %tmp16_29, %tmp13, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc100) + %tmp16_31 = arith.extf %tmp16_30 : tensor<512xbf16> to tensor<512xf32> loc(#loc101) + %tmp17 = arith.select %tmp8_21, %tmp10, %tmp16_31 : tensor<512xi1>, tensor<512xf32> loc(#loc118) + %tmp20 = arith.mulf %tmp17, %tmp19_17 : tensor<512xf32> loc(#loc104) + %tmp21 = arith.addf %tmp3, %tmp20 : tensor<512xf32> loc(#loc105) + %tmp25 = arith.mulf %tmp23_20, %tmp2_15 : tensor<512xf32> loc(#loc106) + %tmp26 = tt.addptr %tmp23, %tmp9_25 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc107) + %tmp26_32 = tt.load %tmp26, %tmp8_21, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc108) + %tmp26_33 = arith.extf %tmp26_32 : tensor<512xbf16> to tensor<512xf32> loc(#loc109) + %tmp27 = arith.subf %cst_0, %tmp26_33 : tensor<512xf32> loc(#loc110) + %tmp30 = tt.addptr %tmp23, %tmp16 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc111) + %tmp30_34 = tt.load %tmp30, %tmp13, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc112) + %tmp30_35 = arith.extf %tmp30_34 : tensor<512xbf16> to tensor<512xf32> loc(#loc113) + %tmp31 = arith.select %tmp8_21, %tmp27, %tmp30_35 : tensor<512xi1>, tensor<512xf32> loc(#loc119) + %tmp33 = arith.mulf %tmp31, %tmp19_17 : tensor<512xf32> loc(#loc116) + %tmp34 = arith.addf %tmp25, %tmp33 : tensor<512xf32> loc(#loc117) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc54) + %1 = tt.addptr %0, %xindex_7 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc54) + %2 = arith.truncf %tmp21 : tensor<512xf32> to tensor<512xbf16> loc(#loc55) + tt.store %1, %2 : tensor<512x!tt.ptr> loc(#loc55) + %3 = tt.splat %out_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc56) + %4 = tt.addptr %3, %xindex_7 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc56) + %5 = arith.truncf %tmp34 : tensor<512xf32> to tensor<512xbf16> loc(#loc57) + tt.store %4, %5 : tensor<512x!tt.ptr> loc(#loc57) + tt.return loc(#loc58) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":25:19) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":20:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":20:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":21:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":21:23) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":24:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":26:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:30) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:35) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:44) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:39) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:35) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:44) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:31) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:45) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:31) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:36) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:45) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":32:18) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":33:17) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":37:18) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:43) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:37) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:34) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:52) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:48) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:30) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:57) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:107) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":39:13) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":42:20) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:45) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:31) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:54) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:105) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":46:34) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":41:34) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":48:20) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":49:19) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":52:20) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:31) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:58) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:108) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":54:13) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:31) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:54) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:105) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":58:34) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":56:34) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":60:20) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":61:20) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":63:25) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":63:37) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:25) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:37) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:4) +#loc66 = loc("x2"(#loc2)) +#loc67 = loc("xoffset"(#loc3)) +#loc68 = loc("xoffset"(#loc4)) +#loc69 = loc("xindex"(#loc5)) +#loc70 = loc("xindex"(#loc6)) +#loc71 = loc("x0"(#loc7)) +#loc72 = loc("x4"(#loc8)) +#loc73 = loc("tmp0"(#loc9)) +#loc74 = loc("tmp0"(#loc10)) +#loc75 = loc("tmp0"(#loc11)) +#loc76 = loc("tmp2"(#loc12)) +#loc77 = loc("tmp2"(#loc13)) +#loc78 = loc("tmp2"(#loc14)) +#loc79 = loc("tmp2"(#loc15)) +#loc80 = loc("tmp19"(#loc16)) +#loc81 = loc("tmp19"(#loc17)) +#loc82 = loc("tmp23"(#loc18)) +#loc83 = loc("tmp23"(#loc19)) +#loc84 = loc("tmp23"(#loc20)) +#loc85 = loc("tmp3"(#loc21)) +#loc86 = loc("tmp4"(#loc22)) +#loc87 = loc("tmp8"(#loc23)) +#loc88 = loc("tmp9"(#loc24)) +#loc89 = loc("tmp9"(#loc25)) +#loc90 = loc("tmp9"(#loc26)) +#loc91 = loc("tmp9"(#loc27)) +#loc92 = loc("tmp9"(#loc28)) +#loc93 = loc("tmp9"(#loc29)) +#loc94 = loc("tmp9"(#loc30)) +#loc95 = loc("tmp9"(#loc31)) +#loc96 = loc("tmp10"(#loc32)) +#loc97 = loc("tmp13"(#loc33)) +#loc98 = loc("tmp16"(#loc34)) +#loc99 = loc("tmp16"(#loc35)) +#loc100 = loc("tmp16"(#loc36)) +#loc101 = loc("tmp16"(#loc37)) +#loc102 = loc("tmp17"(#loc38)) +#loc103 = loc("tmp12"(#loc39)) +#loc104 = loc("tmp20"(#loc40)) +#loc105 = loc("tmp21"(#loc41)) +#loc106 = loc("tmp25"(#loc42)) +#loc107 = loc("tmp26"(#loc43)) +#loc108 = loc("tmp26"(#loc44)) +#loc109 = loc("tmp26"(#loc45)) +#loc110 = loc("tmp27"(#loc46)) +#loc111 = loc("tmp30"(#loc47)) +#loc112 = loc("tmp30"(#loc48)) +#loc113 = loc("tmp30"(#loc49)) +#loc114 = loc("tmp31"(#loc50)) +#loc115 = loc("tmp29"(#loc51)) +#loc116 = loc("tmp33"(#loc52)) +#loc117 = loc("tmp34"(#loc53)) +#loc118 = loc(fused[#loc102, #loc103]) +#loc119 = loc(fused[#loc114, #loc115]) diff --git a/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json new file mode 100644 index 0000000000000000000000000000000000000000..df301e83da3c7ab0e5771f4536b8fd9e8e6a31c7 --- /dev/null +++ b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused__fused_rms_norm_cat_view_2.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.source", "triton_poi_fused__fused_rms_norm_cat_view_2.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.ttir", "triton_poi_fused__fused_rms_norm_cat_view_2.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir", "triton_poi_fused__fused_rms_norm_cat_view_2.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.llir", "triton_poi_fused__fused_rms_norm_cat_view_2.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.ptx", "triton_poi_fused__fused_rms_norm_cat_view_2.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.cubin", "triton_poi_fused__fused_rms_norm_cat_view_2.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.json"}} \ No newline at end of file diff --git a/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.cubin b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..91e74ff6ccfa30663cead80d6f84b745df26f67d Binary files /dev/null and b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.cubin differ diff --git a/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.json b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1073740dfa50f56ccce4944c12a3e75b5d2e4b7b --- /dev/null +++ b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.json @@ -0,0 +1 @@ +{"hash": "af7b1ea6fafc8fbcbf5956ef7ea84751b6e19f3f5441503720e7ca7b616ddf6c", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 8192, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__fused_rms_norm_cat_view_2"} \ No newline at end of file diff --git a/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.llir b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..e6bfbc847213ca0c458db6207e2bc45cd6afabdc --- /dev/null +++ b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.llir @@ -0,0 +1,1284 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused__fused_rms_norm_cat_view_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !dbg !8 + %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z(), !dbg !9 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y(), !dbg !10 + %15 = mul nuw i32 %13, %14, !dbg !11 + %16 = add nuw i32 %15, %12, !dbg !12 + %17 = shl i32 %16, 8, !dbg !13 + %18 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !14 + %19 = lshr i32 %18, 1, !dbg !14 + %20 = and i32 %19, 127, !dbg !14 + %21 = shl nuw nsw i32 %18, 2, !dbg !14 + %22 = and i32 %21, 252, !dbg !14 + %23 = or disjoint i32 %17, %20, !dbg !15 + %24 = or disjoint i32 %23, 128, !dbg !15 + %25 = or disjoint i32 %17, %22, !dbg !15 + %26 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !16 + %27 = shl i32 %26, 4, !dbg !17 + %28 = and i32 %18, 1, !dbg !18 + %29 = icmp eq i32 %28, 0, !dbg !18 + %30 = shl nuw nsw i32 %28, 3, !dbg !18 + %31 = lshr i32 %18, 6, !dbg !18 + %32 = and i32 %31, 3, !dbg !18 + %33 = or disjoint i32 %30, %27, !dbg !19 + %34 = or disjoint i32 %32, %27, !dbg !19 + %35 = icmp slt i32 %33, 128, !dbg !20 + %36 = icmp slt i32 %34, 128, !dbg !20 + %37 = sdiv i32 %23, 32, !dbg !21 + %38 = sdiv i32 %24, 32, !dbg !21 + %39 = sdiv i32 %25, 32, !dbg !21 + %40 = mul i32 %37, 32, !dbg !22 + %.decomposed = sub i32 %23, %40, !dbg !22 + %41 = mul i32 %39, 32, !dbg !22 + %.decomposed148 = sub i32 %25, %41, !dbg !22 + %42 = icmp slt i32 %23, 8192, !dbg !23 + %43 = icmp slt i32 %25, 8192, !dbg !23 + %44 = shl nsw i32 %.decomposed, 7, !dbg !24 + %45 = add i32 %44, %33, !dbg !25 + %46 = mul i32 %37, 12288, !dbg !26 + %47 = mul i32 %38, 12288, !dbg !26 + %48 = add i32 %45, %46, !dbg !27 + %49 = add i32 %45, %47, !dbg !27 + %50 = sext i32 %48 to i64, !dbg !28 + %51 = getelementptr bfloat, ptr addrspace(1) %0, i64 %50, !dbg !28 + %52 = sext i32 %49 to i64, !dbg !28 + %53 = getelementptr bfloat, ptr addrspace(1) %0, i64 %52, !dbg !28 + %54 = and i1 %35, %42, !dbg !29 + %55 = and i1 %36, %43, !dbg !29 + %56 = icmp slt i32 %23, 8064, !dbg !30 + %57 = and i1 %35, %56, !dbg !30 + %58 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !31 + %59 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %51, i64 %58, i1 %54) #6, !dbg !31 + %60 = extractvalue { i32, i32, i32, i32 } %59, 0, !dbg !31 + %61 = extractvalue { i32, i32, i32, i32 } %59, 1, !dbg !31 + %62 = extractvalue { i32, i32, i32, i32 } %59, 2, !dbg !31 + %63 = extractvalue { i32, i32, i32, i32 } %59, 3, !dbg !31 + %64 = insertelement <2 x i32> poison, i32 %60, i64 0, !dbg !31 + %65 = insertelement <2 x i32> %64, i32 %62, i64 1, !dbg !31 + %66 = lshr <2 x i32> %65, splat (i32 16), !dbg !31 + %67 = trunc nuw <2 x i32> %66 to <2 x i16>, !dbg !31 + %68 = insertelement <2 x i32> poison, i32 %61, i64 0, !dbg !31 + %69 = insertelement <2 x i32> %68, i32 %63, i64 1, !dbg !31 + %70 = lshr <2 x i32> %69, splat (i32 16), !dbg !31 + %71 = trunc nuw <2 x i32> %70 to <2 x i16>, !dbg !31 + %72 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !31 + %73 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %53, i64 %72, i1 %57) #6, !dbg !31 + %74 = extractvalue { i32, i32, i32, i32 } %73, 0, !dbg !31 + %75 = extractvalue { i32, i32, i32, i32 } %73, 1, !dbg !31 + %76 = extractvalue { i32, i32, i32, i32 } %73, 2, !dbg !31 + %77 = extractvalue { i32, i32, i32, i32 } %73, 3, !dbg !31 + %78 = insertelement <2 x i32> poison, i32 %74, i64 0, !dbg !31 + %79 = insertelement <2 x i32> %78, i32 %76, i64 1, !dbg !31 + %80 = lshr <2 x i32> %79, splat (i32 16), !dbg !31 + %81 = trunc nuw <2 x i32> %80 to <2 x i16>, !dbg !31 + %82 = insertelement <2 x i32> poison, i32 %75, i64 0, !dbg !31 + %83 = insertelement <2 x i32> %82, i32 %77, i64 1, !dbg !31 + %84 = lshr <2 x i32> %83, splat (i32 16), !dbg !31 + %85 = trunc nuw <2 x i32> %84 to <2 x i16>, !dbg !31 + %86 = and i32 %18, 6, !dbg !32 + %87 = and i32 %18, 120, !dbg !32 + %88 = shl nuw nsw i32 %28, 2, !dbg !32 + %89 = and i32 %18, 128, !dbg !32 + %90 = icmp eq i32 %89, 0, !dbg !32 + %91 = select i1 %90, i32 0, i32 4100, !dbg !32 + %92 = mul nuw nsw i32 %86, 528, !dbg !32 + %93 = or disjoint i32 %88, %87, !dbg !32 + %94 = xor i32 %92, %93, !dbg !32 + %95 = xor i32 %94, %91, !dbg !32 + %96 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %95, !dbg !32 + %97 = trunc i32 %60 to i16, !dbg !32 + %98 = trunc i32 %62 to i16, !dbg !32 + %99 = insertelement <2 x i16> poison, i16 %97, i64 0, !dbg !32 + %100 = insertelement <2 x i16> %99, i16 %98, i64 1, !dbg !32 + store <2 x i16> %100, ptr addrspace(3) %96, align 4, !dbg !32 + %101 = getelementptr inbounds nuw i8, ptr addrspace(3) %96, i32 256, !dbg !32 + store <2 x i16> %67, ptr addrspace(3) %101, align 4, !dbg !32 + %102 = getelementptr inbounds nuw i8, ptr addrspace(3) %96, i32 512, !dbg !32 + %103 = trunc i32 %61 to i16, !dbg !32 + %104 = trunc i32 %63 to i16, !dbg !32 + %105 = insertelement <2 x i16> poison, i16 %103, i64 0, !dbg !32 + %106 = insertelement <2 x i16> %105, i16 %104, i64 1, !dbg !32 + store <2 x i16> %106, ptr addrspace(3) %102, align 4, !dbg !32 + %107 = getelementptr inbounds nuw i8, ptr addrspace(3) %96, i32 768, !dbg !32 + store <2 x i16> %71, ptr addrspace(3) %107, align 4, !dbg !32 + %108 = getelementptr inbounds nuw i8, ptr addrspace(3) %96, i32 128, !dbg !32 + %109 = trunc i32 %74 to i16, !dbg !32 + %110 = trunc i32 %76 to i16, !dbg !32 + %111 = insertelement <2 x i16> poison, i16 %109, i64 0, !dbg !32 + %112 = insertelement <2 x i16> %111, i16 %110, i64 1, !dbg !32 + store <2 x i16> %112, ptr addrspace(3) %108, align 4, !dbg !32 + %113 = getelementptr inbounds nuw i8, ptr addrspace(3) %96, i32 384, !dbg !32 + store <2 x i16> %81, ptr addrspace(3) %113, align 4, !dbg !32 + %114 = getelementptr inbounds nuw i8, ptr addrspace(3) %96, i32 640, !dbg !32 + %115 = trunc i32 %75 to i16, !dbg !32 + %116 = trunc i32 %77 to i16, !dbg !32 + %117 = insertelement <2 x i16> poison, i16 %115, i64 0, !dbg !32 + %118 = insertelement <2 x i16> %117, i16 %116, i64 1, !dbg !32 + store <2 x i16> %118, ptr addrspace(3) %114, align 4, !dbg !32 + %119 = getelementptr inbounds nuw i8, ptr addrspace(3) %96, i32 896, !dbg !32 + store <2 x i16> %85, ptr addrspace(3) %119, align 4, !dbg !32 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !32 + %120 = shl nuw nsw i32 %18, 3, !dbg !32 + %121 = and i32 %120, 120, !dbg !32 + %122 = and i32 %18, 224, !dbg !32 + %123 = shl nuw nsw i32 %122, 2, !dbg !32 + %124 = and i32 %18, 16, !dbg !32 + %125 = icmp eq i32 %124, 0, !dbg !32 + %126 = select i1 %125, i32 0, i32 4100, !dbg !32 + %127 = or disjoint i32 %126, %123, !dbg !32 + %128 = or disjoint i32 %127, %121, !dbg !32 + %129 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %128, !dbg !32 + %130 = load bfloat, ptr addrspace(3) %129, align 4, !dbg !32 + %131 = getelementptr inbounds nuw i8, ptr addrspace(3) %129, i32 2, !dbg !32 + %132 = load bfloat, ptr addrspace(3) %131, align 2, !dbg !32 + %133 = xor i32 %128, 1056, !dbg !32 + %134 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %133, !dbg !32 + %135 = load bfloat, ptr addrspace(3) %134, align 4, !dbg !32 + %136 = getelementptr inbounds nuw i8, ptr addrspace(3) %134, i32 2, !dbg !32 + %137 = load bfloat, ptr addrspace(3) %136, align 2, !dbg !32 + %138 = xor i32 %128, 2112, !dbg !32 + %139 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %138, !dbg !32 + %140 = load bfloat, ptr addrspace(3) %139, align 4, !dbg !32 + %141 = getelementptr inbounds nuw i8, ptr addrspace(3) %139, i32 2, !dbg !32 + %142 = load bfloat, ptr addrspace(3) %141, align 2, !dbg !32 + %143 = xor i32 %128, 3168, !dbg !32 + %144 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %143, !dbg !32 + %145 = load bfloat, ptr addrspace(3) %144, align 4, !dbg !32 + %146 = getelementptr inbounds nuw i8, ptr addrspace(3) %144, i32 2, !dbg !32 + %147 = load bfloat, ptr addrspace(3) %146, align 2, !dbg !32 + %148 = xor i32 %128, 4, !dbg !32 + %149 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %148, !dbg !32 + %150 = load bfloat, ptr addrspace(3) %149, align 4, !dbg !32 + %151 = getelementptr inbounds nuw i8, ptr addrspace(3) %149, i32 2, !dbg !32 + %152 = load bfloat, ptr addrspace(3) %151, align 2, !dbg !32 + %153 = xor i32 %128, 1060, !dbg !32 + %154 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %153, !dbg !32 + %155 = load bfloat, ptr addrspace(3) %154, align 4, !dbg !32 + %156 = getelementptr inbounds nuw i8, ptr addrspace(3) %154, i32 2, !dbg !32 + %157 = load bfloat, ptr addrspace(3) %156, align 2, !dbg !32 + %158 = xor i32 %128, 2116, !dbg !32 + %159 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %158, !dbg !32 + %160 = load bfloat, ptr addrspace(3) %159, align 4, !dbg !32 + %161 = getelementptr inbounds nuw i8, ptr addrspace(3) %159, i32 2, !dbg !32 + %162 = load bfloat, ptr addrspace(3) %161, align 2, !dbg !32 + %163 = xor i32 %128, 3172, !dbg !32 + %164 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %163, !dbg !32 + %165 = load bfloat, ptr addrspace(3) %164, align 4, !dbg !32 + %166 = getelementptr inbounds nuw i8, ptr addrspace(3) %164, i32 2, !dbg !32 + %167 = load bfloat, ptr addrspace(3) %166, align 2, !dbg !32 + %168 = insertelement <4 x bfloat> poison, bfloat %130, i64 0, !dbg !32 + %169 = insertelement <4 x bfloat> %168, bfloat %150, i64 1, !dbg !32 + %170 = insertelement <4 x bfloat> %169, bfloat %135, i64 2, !dbg !32 + %171 = insertelement <4 x bfloat> %170, bfloat %155, i64 3, !dbg !32 + %172 = fpext <4 x bfloat> %171 to <4 x float>, !dbg !32 + %173 = insertelement <4 x bfloat> poison, bfloat %140, i64 0, !dbg !32 + %174 = insertelement <4 x bfloat> %173, bfloat %160, i64 1, !dbg !32 + %175 = insertelement <4 x bfloat> %174, bfloat %145, i64 2, !dbg !32 + %176 = insertelement <4 x bfloat> %175, bfloat %165, i64 3, !dbg !32 + %177 = fpext <4 x bfloat> %176 to <4 x float>, !dbg !32 + %178 = insertelement <4 x bfloat> poison, bfloat %132, i64 0, !dbg !32 + %179 = insertelement <4 x bfloat> %178, bfloat %152, i64 1, !dbg !32 + %180 = insertelement <4 x bfloat> %179, bfloat %137, i64 2, !dbg !32 + %181 = insertelement <4 x bfloat> %180, bfloat %157, i64 3, !dbg !32 + %182 = fpext <4 x bfloat> %181 to <4 x float>, !dbg !32 + %183 = insertelement <4 x bfloat> poison, bfloat %142, i64 0, !dbg !32 + %184 = insertelement <4 x bfloat> %183, bfloat %162, i64 1, !dbg !32 + %185 = insertelement <4 x bfloat> %184, bfloat %147, i64 2, !dbg !32 + %186 = insertelement <4 x bfloat> %185, bfloat %167, i64 3, !dbg !32 + %187 = fpext <4 x bfloat> %186 to <4 x float>, !dbg !32 + %188 = sext i32 %25 to i64, !dbg !33 + %189 = getelementptr float, ptr addrspace(1) %1, i64 %188, !dbg !33 + %190 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !34 + %191 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %189, i64 %190, i1 %55) #6, !dbg !34 + %192 = extractvalue { i32, i32, i32, i32 } %191, 0, !dbg !34 + %193 = extractvalue { i32, i32, i32, i32 } %191, 1, !dbg !34 + %194 = extractvalue { i32, i32, i32, i32 } %191, 2, !dbg !34 + %195 = extractvalue { i32, i32, i32, i32 } %191, 3, !dbg !34 + %196 = bitcast i32 %192 to float, !dbg !34 + %197 = bitcast i32 %193 to float, !dbg !34 + %198 = bitcast i32 %194 to float, !dbg !34 + %199 = bitcast i32 %195 to float, !dbg !34 + %200 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !34 + %201 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %189, i64 %200, i1 %55) #6, !dbg !34 + %202 = extractvalue { i32, i32, i32, i32 } %201, 0, !dbg !34 + %203 = extractvalue { i32, i32, i32, i32 } %201, 1, !dbg !34 + %204 = extractvalue { i32, i32, i32, i32 } %201, 2, !dbg !34 + %205 = extractvalue { i32, i32, i32, i32 } %201, 3, !dbg !34 + %206 = bitcast i32 %202 to float, !dbg !34 + %207 = bitcast i32 %203 to float, !dbg !34 + %208 = bitcast i32 %204 to float, !dbg !34 + %209 = bitcast i32 %205 to float, !dbg !34 + %210 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !34 + %211 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %189, i64 %210, i1 %55) #6, !dbg !34 + %212 = extractvalue { i32, i32, i32, i32 } %211, 0, !dbg !34 + %213 = extractvalue { i32, i32, i32, i32 } %211, 1, !dbg !34 + %214 = extractvalue { i32, i32, i32, i32 } %211, 2, !dbg !34 + %215 = extractvalue { i32, i32, i32, i32 } %211, 3, !dbg !34 + %216 = bitcast i32 %212 to float, !dbg !34 + %217 = bitcast i32 %213 to float, !dbg !34 + %218 = bitcast i32 %214 to float, !dbg !34 + %219 = bitcast i32 %215 to float, !dbg !34 + %220 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !34 + %221 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %189, i64 %220, i1 %55) #6, !dbg !34 + %222 = extractvalue { i32, i32, i32, i32 } %221, 0, !dbg !34 + %223 = extractvalue { i32, i32, i32, i32 } %221, 1, !dbg !34 + %224 = extractvalue { i32, i32, i32, i32 } %221, 2, !dbg !34 + %225 = extractvalue { i32, i32, i32, i32 } %221, 3, !dbg !34 + %226 = bitcast i32 %222 to float, !dbg !34 + %227 = bitcast i32 %223 to float, !dbg !34 + %228 = bitcast i32 %224 to float, !dbg !34 + %229 = bitcast i32 %225 to float, !dbg !34 + %230 = tail call float @llvm.nvvm.div.full(float %196, float 1.280000e+02), !dbg !35 + %231 = tail call float @llvm.nvvm.div.full(float %197, float 1.280000e+02), !dbg !35 + %232 = tail call float @llvm.nvvm.div.full(float %198, float 1.280000e+02), !dbg !35 + %233 = tail call float @llvm.nvvm.div.full(float %199, float 1.280000e+02), !dbg !35 + %234 = tail call float @llvm.nvvm.div.full(float %206, float 1.280000e+02), !dbg !35 + %235 = tail call float @llvm.nvvm.div.full(float %207, float 1.280000e+02), !dbg !35 + %236 = tail call float @llvm.nvvm.div.full(float %208, float 1.280000e+02), !dbg !35 + %237 = tail call float @llvm.nvvm.div.full(float %209, float 1.280000e+02), !dbg !35 + %238 = tail call float @llvm.nvvm.div.full(float %216, float 1.280000e+02), !dbg !35 + %239 = tail call float @llvm.nvvm.div.full(float %217, float 1.280000e+02), !dbg !35 + %240 = tail call float @llvm.nvvm.div.full(float %218, float 1.280000e+02), !dbg !35 + %241 = tail call float @llvm.nvvm.div.full(float %219, float 1.280000e+02), !dbg !35 + %242 = tail call float @llvm.nvvm.div.full(float %226, float 1.280000e+02), !dbg !35 + %243 = tail call float @llvm.nvvm.div.full(float %227, float 1.280000e+02), !dbg !35 + %244 = tail call float @llvm.nvvm.div.full(float %228, float 1.280000e+02), !dbg !35 + %245 = tail call float @llvm.nvvm.div.full(float %229, float 1.280000e+02), !dbg !35 + %246 = fadd float %230, 0x3EB0C6F7A0000000, !dbg !36 + %247 = fadd float %231, 0x3EB0C6F7A0000000, !dbg !36 + %248 = fadd float %232, 0x3EB0C6F7A0000000, !dbg !36 + %249 = fadd float %233, 0x3EB0C6F7A0000000, !dbg !36 + %250 = fadd float %234, 0x3EB0C6F7A0000000, !dbg !36 + %251 = fadd float %235, 0x3EB0C6F7A0000000, !dbg !36 + %252 = fadd float %236, 0x3EB0C6F7A0000000, !dbg !36 + %253 = fadd float %237, 0x3EB0C6F7A0000000, !dbg !36 + %254 = fadd float %238, 0x3EB0C6F7A0000000, !dbg !36 + %255 = fadd float %239, 0x3EB0C6F7A0000000, !dbg !36 + %256 = fadd float %240, 0x3EB0C6F7A0000000, !dbg !36 + %257 = fadd float %241, 0x3EB0C6F7A0000000, !dbg !36 + %258 = fadd float %242, 0x3EB0C6F7A0000000, !dbg !36 + %259 = fadd float %243, 0x3EB0C6F7A0000000, !dbg !36 + %260 = fadd float %244, 0x3EB0C6F7A0000000, !dbg !36 + %261 = fadd float %245, 0x3EB0C6F7A0000000, !dbg !36 + %262 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37 + %.not.i = icmp eq i32 %262, 0, !dbg !37 + br i1 %.not.i, label %265, label %263, !dbg !37 + +263: ; preds = %11 + %264 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %246), !dbg !37 + br label %__nv_rsqrtf.exit, !dbg !37 + +265: ; preds = %11 + %266 = tail call float @llvm.nvvm.rsqrt.approx.f(float %246), !dbg !37 + br label %__nv_rsqrtf.exit, !dbg !37 + +__nv_rsqrtf.exit: ; preds = %263, %265 + %.0.i = phi float [ %264, %263 ], [ %266, %265 ], !dbg !37 + %267 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37 + %.not.i55 = icmp eq i32 %267, 0, !dbg !37 + br i1 %.not.i55, label %270, label %268, !dbg !37 + +268: ; preds = %__nv_rsqrtf.exit + %269 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %247), !dbg !37 + br label %__nv_rsqrtf.exit57, !dbg !37 + +270: ; preds = %__nv_rsqrtf.exit + %271 = tail call float @llvm.nvvm.rsqrt.approx.f(float %247), !dbg !37 + br label %__nv_rsqrtf.exit57, !dbg !37 + +__nv_rsqrtf.exit57: ; preds = %268, %270 + %.0.i56 = phi float [ %269, %268 ], [ %271, %270 ], !dbg !37 + %272 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37 + %.not.i58 = icmp eq i32 %272, 0, !dbg !37 + br i1 %.not.i58, label %275, label %273, !dbg !37 + +273: ; preds = %__nv_rsqrtf.exit57 + %274 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %248), !dbg !37 + br label %__nv_rsqrtf.exit60, !dbg !37 + +275: ; preds = %__nv_rsqrtf.exit57 + %276 = tail call float @llvm.nvvm.rsqrt.approx.f(float %248), !dbg !37 + br label %__nv_rsqrtf.exit60, !dbg !37 + +__nv_rsqrtf.exit60: ; preds = %273, %275 + %.0.i59 = phi float [ %274, %273 ], [ %276, %275 ], !dbg !37 + %277 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37 + %.not.i61 = icmp eq i32 %277, 0, !dbg !37 + br i1 %.not.i61, label %280, label %278, !dbg !37 + +278: ; preds = %__nv_rsqrtf.exit60 + %279 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %249), !dbg !37 + br label %__nv_rsqrtf.exit63, !dbg !37 + +280: ; preds = %__nv_rsqrtf.exit60 + %281 = tail call float @llvm.nvvm.rsqrt.approx.f(float %249), !dbg !37 + br label %__nv_rsqrtf.exit63, !dbg !37 + +__nv_rsqrtf.exit63: ; preds = %278, %280 + %.0.i62 = phi float [ %279, %278 ], [ %281, %280 ], !dbg !37 + %282 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37 + %.not.i64 = icmp eq i32 %282, 0, !dbg !37 + br i1 %.not.i64, label %285, label %283, !dbg !37 + +283: ; preds = %__nv_rsqrtf.exit63 + %284 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %250), !dbg !37 + br label %__nv_rsqrtf.exit66, !dbg !37 + +285: ; preds = %__nv_rsqrtf.exit63 + %286 = tail call float @llvm.nvvm.rsqrt.approx.f(float %250), !dbg !37 + br label %__nv_rsqrtf.exit66, !dbg !37 + +__nv_rsqrtf.exit66: ; preds = %283, %285 + %.0.i65 = phi float [ %284, %283 ], [ %286, %285 ], !dbg !37 + %287 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37 + %.not.i67 = icmp eq i32 %287, 0, !dbg !37 + br i1 %.not.i67, label %290, label %288, !dbg !37 + +288: ; preds = %__nv_rsqrtf.exit66 + %289 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %251), !dbg !37 + br label %__nv_rsqrtf.exit69, !dbg !37 + +290: ; preds = %__nv_rsqrtf.exit66 + %291 = tail call float @llvm.nvvm.rsqrt.approx.f(float %251), !dbg !37 + br label %__nv_rsqrtf.exit69, !dbg !37 + +__nv_rsqrtf.exit69: ; preds = %288, %290 + %.0.i68 = phi float [ %289, %288 ], [ %291, %290 ], !dbg !37 + %292 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37 + %.not.i70 = icmp eq i32 %292, 0, !dbg !37 + br i1 %.not.i70, label %295, label %293, !dbg !37 + +293: ; preds = %__nv_rsqrtf.exit69 + %294 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %252), !dbg !37 + br label %__nv_rsqrtf.exit72, !dbg !37 + +295: ; preds = %__nv_rsqrtf.exit69 + %296 = tail call float @llvm.nvvm.rsqrt.approx.f(float %252), !dbg !37 + br label %__nv_rsqrtf.exit72, !dbg !37 + +__nv_rsqrtf.exit72: ; preds = %293, %295 + %.0.i71 = phi float [ %294, %293 ], [ %296, %295 ], !dbg !37 + %297 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37 + %.not.i73 = icmp eq i32 %297, 0, !dbg !37 + br i1 %.not.i73, label %300, label %298, !dbg !37 + +298: ; preds = %__nv_rsqrtf.exit72 + %299 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %253), !dbg !37 + br label %__nv_rsqrtf.exit75, !dbg !37 + +300: ; preds = %__nv_rsqrtf.exit72 + %301 = tail call float @llvm.nvvm.rsqrt.approx.f(float %253), !dbg !37 + br label %__nv_rsqrtf.exit75, !dbg !37 + +__nv_rsqrtf.exit75: ; preds = %298, %300 + %.0.i74 = phi float [ %299, %298 ], [ %301, %300 ], !dbg !37 + %302 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37 + %.not.i76 = icmp eq i32 %302, 0, !dbg !37 + br i1 %.not.i76, label %305, label %303, !dbg !37 + +303: ; preds = %__nv_rsqrtf.exit75 + %304 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %254), !dbg !37 + br label %__nv_rsqrtf.exit78, !dbg !37 + +305: ; preds = %__nv_rsqrtf.exit75 + %306 = tail call float @llvm.nvvm.rsqrt.approx.f(float %254), !dbg !37 + br label %__nv_rsqrtf.exit78, !dbg !37 + +__nv_rsqrtf.exit78: ; preds = %303, %305 + %.0.i77 = phi float [ %304, %303 ], [ %306, %305 ], !dbg !37 + %307 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37 + %.not.i79 = icmp eq i32 %307, 0, !dbg !37 + br i1 %.not.i79, label %310, label %308, !dbg !37 + +308: ; preds = %__nv_rsqrtf.exit78 + %309 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %255), !dbg !37 + br label %__nv_rsqrtf.exit81, !dbg !37 + +310: ; preds = %__nv_rsqrtf.exit78 + %311 = tail call float @llvm.nvvm.rsqrt.approx.f(float %255), !dbg !37 + br label %__nv_rsqrtf.exit81, !dbg !37 + +__nv_rsqrtf.exit81: ; preds = %308, %310 + %.0.i80 = phi float [ %309, %308 ], [ %311, %310 ], !dbg !37 + %312 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37 + %.not.i82 = icmp eq i32 %312, 0, !dbg !37 + br i1 %.not.i82, label %315, label %313, !dbg !37 + +313: ; preds = %__nv_rsqrtf.exit81 + %314 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %256), !dbg !37 + br label %__nv_rsqrtf.exit84, !dbg !37 + +315: ; preds = %__nv_rsqrtf.exit81 + %316 = tail call float @llvm.nvvm.rsqrt.approx.f(float %256), !dbg !37 + br label %__nv_rsqrtf.exit84, !dbg !37 + +__nv_rsqrtf.exit84: ; preds = %313, %315 + %.0.i83 = phi float [ %314, %313 ], [ %316, %315 ], !dbg !37 + %317 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37 + %.not.i85 = icmp eq i32 %317, 0, !dbg !37 + br i1 %.not.i85, label %320, label %318, !dbg !37 + +318: ; preds = %__nv_rsqrtf.exit84 + %319 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %257), !dbg !37 + br label %__nv_rsqrtf.exit87, !dbg !37 + +320: ; preds = %__nv_rsqrtf.exit84 + %321 = tail call float @llvm.nvvm.rsqrt.approx.f(float %257), !dbg !37 + br label %__nv_rsqrtf.exit87, !dbg !37 + +__nv_rsqrtf.exit87: ; preds = %318, %320 + %.0.i86 = phi float [ %319, %318 ], [ %321, %320 ], !dbg !37 + %322 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37 + %.not.i88 = icmp eq i32 %322, 0, !dbg !37 + br i1 %.not.i88, label %325, label %323, !dbg !37 + +323: ; preds = %__nv_rsqrtf.exit87 + %324 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %258), !dbg !37 + br label %__nv_rsqrtf.exit90, !dbg !37 + +325: ; preds = %__nv_rsqrtf.exit87 + %326 = tail call float @llvm.nvvm.rsqrt.approx.f(float %258), !dbg !37 + br label %__nv_rsqrtf.exit90, !dbg !37 + +__nv_rsqrtf.exit90: ; preds = %323, %325 + %.0.i89 = phi float [ %324, %323 ], [ %326, %325 ], !dbg !37 + %327 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37 + %.not.i91 = icmp eq i32 %327, 0, !dbg !37 + br i1 %.not.i91, label %330, label %328, !dbg !37 + +328: ; preds = %__nv_rsqrtf.exit90 + %329 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %259), !dbg !37 + br label %__nv_rsqrtf.exit93, !dbg !37 + +330: ; preds = %__nv_rsqrtf.exit90 + %331 = tail call float @llvm.nvvm.rsqrt.approx.f(float %259), !dbg !37 + br label %__nv_rsqrtf.exit93, !dbg !37 + +__nv_rsqrtf.exit93: ; preds = %328, %330 + %.0.i92 = phi float [ %329, %328 ], [ %331, %330 ], !dbg !37 + %332 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37 + %.not.i94 = icmp eq i32 %332, 0, !dbg !37 + br i1 %.not.i94, label %335, label %333, !dbg !37 + +333: ; preds = %__nv_rsqrtf.exit93 + %334 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %260), !dbg !37 + br label %__nv_rsqrtf.exit96, !dbg !37 + +335: ; preds = %__nv_rsqrtf.exit93 + %336 = tail call float @llvm.nvvm.rsqrt.approx.f(float %260), !dbg !37 + br label %__nv_rsqrtf.exit96, !dbg !37 + +__nv_rsqrtf.exit96: ; preds = %333, %335 + %.0.i95 = phi float [ %334, %333 ], [ %336, %335 ], !dbg !37 + %337 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37 + %.not.i97 = icmp eq i32 %337, 0, !dbg !37 + br i1 %.not.i97, label %340, label %338, !dbg !37 + +338: ; preds = %__nv_rsqrtf.exit96 + %339 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %261), !dbg !37 + br label %__nv_rsqrtf.exit99, !dbg !37 + +340: ; preds = %__nv_rsqrtf.exit96 + %341 = tail call float @llvm.nvvm.rsqrt.approx.f(float %261), !dbg !37 + br label %__nv_rsqrtf.exit99, !dbg !37 + +__nv_rsqrtf.exit99: ; preds = %338, %340 + %.0.i98 = phi float [ %339, %338 ], [ %341, %340 ], !dbg !37 + %342 = insertelement <4 x float> poison, float %.0.i, i64 0, !dbg !38 + %343 = insertelement <4 x float> %342, float %.0.i77, i64 1, !dbg !38 + %344 = insertelement <4 x float> %343, float %.0.i56, i64 2, !dbg !38 + %345 = insertelement <4 x float> %344, float %.0.i80, i64 3, !dbg !38 + %346 = fmul <4 x float> %345, %172, !dbg !38 + %347 = insertelement <4 x float> poison, float %.0.i59, i64 0, !dbg !38 + %348 = insertelement <4 x float> %347, float %.0.i83, i64 1, !dbg !38 + %349 = insertelement <4 x float> %348, float %.0.i62, i64 2, !dbg !38 + %350 = insertelement <4 x float> %349, float %.0.i86, i64 3, !dbg !38 + %351 = fmul <4 x float> %350, %177, !dbg !38 + %352 = insertelement <4 x float> poison, float %.0.i65, i64 0, !dbg !38 + %353 = insertelement <4 x float> %352, float %.0.i89, i64 1, !dbg !38 + %354 = insertelement <4 x float> %353, float %.0.i68, i64 2, !dbg !38 + %355 = insertelement <4 x float> %354, float %.0.i92, i64 3, !dbg !38 + %356 = fmul <4 x float> %355, %182, !dbg !38 + %357 = insertelement <4 x float> poison, float %.0.i71, i64 0, !dbg !38 + %358 = insertelement <4 x float> %357, float %.0.i95, i64 1, !dbg !38 + %359 = insertelement <4 x float> %358, float %.0.i74, i64 2, !dbg !38 + %360 = insertelement <4 x float> %359, float %.0.i98, i64 3, !dbg !38 + %361 = fmul <4 x float> %360, %187, !dbg !38 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !38 + %362 = shl nuw nsw i32 %18, 4, !dbg !38 + %363 = and i32 %362, 4080, !dbg !38 + %364 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %363, !dbg !38 + store <4 x float> %346, ptr addrspace(3) %364, align 16, !dbg !38 + %365 = xor i32 %363, 4160, !dbg !38 + %366 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %365, !dbg !38 + store <4 x float> %351, ptr addrspace(3) %366, align 16, !dbg !38 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !38 + %367 = shl nuw nsw i32 %18, 7, !dbg !38 + %368 = and i32 %367, 3072, !dbg !38 + %369 = shl nuw nsw i32 %86, 3, !dbg !38 + %370 = shl nuw nsw i32 %122, 1, !dbg !38 + %371 = select i1 %29, i32 0, i32 4160, !dbg !38 + %372 = xor i32 %371, %370, !dbg !38 + %373 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %368, !dbg !38 + %374 = getelementptr inbounds nuw i8, ptr addrspace(3) %373, i32 %369, !dbg !38 + %375 = getelementptr inbounds nuw i8, ptr addrspace(3) %374, i32 %372, !dbg !38 + %376 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) %375), !dbg !38 + %377 = getelementptr inbounds nuw i8, ptr addrspace(3) %375, i32 512, !dbg !38 + %378 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) nonnull %377), !dbg !38 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !38 + store <4 x float> %356, ptr addrspace(3) %364, align 16, !dbg !38 + store <4 x float> %361, ptr addrspace(3) %366, align 16, !dbg !38 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !38 + %379 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) %375), !dbg !38 + %380 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) nonnull %377), !dbg !38 + %381 = sext i32 %33 to i64, !dbg !39 + %382 = getelementptr bfloat, ptr addrspace(1) %2, i64 %381, !dbg !39 + %383 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !40 + %384 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %382, i64 %383, i1 %54) #6, !dbg !40 + %385 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !40 + %386 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %382, i64 %385, i1 %57) #6, !dbg !40 + %387 = add i32 %48, -3145728, !dbg !41 + %388 = add i32 %49, -3145728, !dbg !41 + %389 = sext i32 %387 to i64, !dbg !42 + %390 = getelementptr bfloat, ptr addrspace(1) %3, i64 %389, !dbg !42 + %391 = sext i32 %388 to i64, !dbg !42 + %392 = getelementptr bfloat, ptr addrspace(1) %3, i64 %391, !dbg !42 + %393 = add i32 %17, -8192, !dbg !43 + %394 = icmp ult i32 %393, 65536, !dbg !43 + %395 = and i1 %35, %394, !dbg !43 + %396 = add i32 %17, -8064, !dbg !43 + %397 = icmp ult i32 %396, 65664, !dbg !43 + %398 = and i1 %35, %397, !dbg !43 + %399 = and i1 %36, %394, !dbg !43 + %400 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !44 + %401 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %390, i64 %400, i1 %395) #6, !dbg !44 + %402 = extractvalue { i32, i32, i32, i32 } %401, 0, !dbg !44 + %403 = extractvalue { i32, i32, i32, i32 } %401, 1, !dbg !44 + %404 = extractvalue { i32, i32, i32, i32 } %401, 2, !dbg !44 + %405 = extractvalue { i32, i32, i32, i32 } %401, 3, !dbg !44 + %406 = insertelement <2 x i32> poison, i32 %402, i64 0, !dbg !44 + %407 = insertelement <2 x i32> %406, i32 %404, i64 1, !dbg !44 + %408 = lshr <2 x i32> %407, splat (i32 16), !dbg !44 + %409 = trunc nuw <2 x i32> %408 to <2 x i16>, !dbg !44 + %410 = insertelement <2 x i32> poison, i32 %403, i64 0, !dbg !44 + %411 = insertelement <2 x i32> %410, i32 %405, i64 1, !dbg !44 + %412 = lshr <2 x i32> %411, splat (i32 16), !dbg !44 + %413 = trunc nuw <2 x i32> %412 to <2 x i16>, !dbg !44 + %414 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !44 + %415 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %392, i64 %414, i1 %398) #6, !dbg !44 + %416 = extractvalue { i32, i32, i32, i32 } %415, 0, !dbg !44 + %417 = extractvalue { i32, i32, i32, i32 } %415, 1, !dbg !44 + %418 = extractvalue { i32, i32, i32, i32 } %415, 2, !dbg !44 + %419 = extractvalue { i32, i32, i32, i32 } %415, 3, !dbg !44 + %420 = insertelement <2 x i32> poison, i32 %416, i64 0, !dbg !44 + %421 = insertelement <2 x i32> %420, i32 %418, i64 1, !dbg !44 + %422 = lshr <2 x i32> %421, splat (i32 16), !dbg !44 + %423 = trunc nuw <2 x i32> %422 to <2 x i16>, !dbg !44 + %424 = insertelement <2 x i32> poison, i32 %417, i64 0, !dbg !44 + %425 = insertelement <2 x i32> %424, i32 %419, i64 1, !dbg !44 + %426 = lshr <2 x i32> %425, splat (i32 16), !dbg !44 + %427 = trunc nuw <2 x i32> %426 to <2 x i16>, !dbg !44 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45 + %428 = trunc i32 %402 to i16, !dbg !45 + %429 = trunc i32 %404 to i16, !dbg !45 + %430 = insertelement <2 x i16> poison, i16 %428, i64 0, !dbg !45 + %431 = insertelement <2 x i16> %430, i16 %429, i64 1, !dbg !45 + store <2 x i16> %431, ptr addrspace(3) %96, align 4, !dbg !45 + store <2 x i16> %409, ptr addrspace(3) %101, align 4, !dbg !45 + %432 = trunc i32 %403 to i16, !dbg !45 + %433 = trunc i32 %405 to i16, !dbg !45 + %434 = insertelement <2 x i16> poison, i16 %432, i64 0, !dbg !45 + %435 = insertelement <2 x i16> %434, i16 %433, i64 1, !dbg !45 + store <2 x i16> %435, ptr addrspace(3) %102, align 4, !dbg !45 + store <2 x i16> %413, ptr addrspace(3) %107, align 4, !dbg !45 + %436 = trunc i32 %416 to i16, !dbg !45 + %437 = trunc i32 %418 to i16, !dbg !45 + %438 = insertelement <2 x i16> poison, i16 %436, i64 0, !dbg !45 + %439 = insertelement <2 x i16> %438, i16 %437, i64 1, !dbg !45 + store <2 x i16> %439, ptr addrspace(3) %108, align 4, !dbg !45 + store <2 x i16> %423, ptr addrspace(3) %113, align 4, !dbg !45 + %440 = trunc i32 %417 to i16, !dbg !45 + %441 = trunc i32 %419 to i16, !dbg !45 + %442 = insertelement <2 x i16> poison, i16 %440, i64 0, !dbg !45 + %443 = insertelement <2 x i16> %442, i16 %441, i64 1, !dbg !45 + store <2 x i16> %443, ptr addrspace(3) %114, align 4, !dbg !45 + store <2 x i16> %427, ptr addrspace(3) %119, align 4, !dbg !45 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45 + %444 = load <2 x bfloat>, ptr addrspace(3) %129, align 4, !dbg !45 + %445 = load <2 x bfloat>, ptr addrspace(3) %134, align 4, !dbg !45 + %446 = load <2 x bfloat>, ptr addrspace(3) %139, align 4, !dbg !45 + %447 = load <2 x bfloat>, ptr addrspace(3) %144, align 4, !dbg !45 + %448 = load <2 x bfloat>, ptr addrspace(3) %149, align 4, !dbg !45 + %449 = load <2 x bfloat>, ptr addrspace(3) %154, align 4, !dbg !45 + %450 = load <2 x bfloat>, ptr addrspace(3) %159, align 4, !dbg !45 + %451 = load <2 x bfloat>, ptr addrspace(3) %164, align 4, !dbg !45 + %452 = shl nsw i32 %39, 5, !dbg !46 + %453 = add nsw i32 %.decomposed148, -8192, !dbg !46 + %454 = add i32 %453, %452, !dbg !47 + %455 = sext i32 %454 to i64, !dbg !48 + %456 = getelementptr float, ptr addrspace(1) %4, i64 %455, !dbg !48 + %457 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !49 + %458 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %456, i64 %457, i1 %399) #6, !dbg !49 + %459 = extractvalue { i32, i32, i32, i32 } %458, 0, !dbg !49 + %460 = extractvalue { i32, i32, i32, i32 } %458, 1, !dbg !49 + %461 = extractvalue { i32, i32, i32, i32 } %458, 2, !dbg !49 + %462 = extractvalue { i32, i32, i32, i32 } %458, 3, !dbg !49 + %463 = bitcast i32 %459 to float, !dbg !49 + %464 = bitcast i32 %460 to float, !dbg !49 + %465 = bitcast i32 %461 to float, !dbg !49 + %466 = bitcast i32 %462 to float, !dbg !49 + %467 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !49 + %468 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %456, i64 %467, i1 %399) #6, !dbg !49 + %469 = extractvalue { i32, i32, i32, i32 } %468, 0, !dbg !49 + %470 = extractvalue { i32, i32, i32, i32 } %468, 1, !dbg !49 + %471 = extractvalue { i32, i32, i32, i32 } %468, 2, !dbg !49 + %472 = extractvalue { i32, i32, i32, i32 } %468, 3, !dbg !49 + %473 = bitcast i32 %469 to float, !dbg !49 + %474 = bitcast i32 %470 to float, !dbg !49 + %475 = bitcast i32 %471 to float, !dbg !49 + %476 = bitcast i32 %472 to float, !dbg !49 + %477 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !49 + %478 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %456, i64 %477, i1 %399) #6, !dbg !49 + %479 = extractvalue { i32, i32, i32, i32 } %478, 0, !dbg !49 + %480 = extractvalue { i32, i32, i32, i32 } %478, 1, !dbg !49 + %481 = extractvalue { i32, i32, i32, i32 } %478, 2, !dbg !49 + %482 = extractvalue { i32, i32, i32, i32 } %478, 3, !dbg !49 + %483 = bitcast i32 %479 to float, !dbg !49 + %484 = bitcast i32 %480 to float, !dbg !49 + %485 = bitcast i32 %481 to float, !dbg !49 + %486 = bitcast i32 %482 to float, !dbg !49 + %487 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !49 + %488 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %456, i64 %487, i1 %399) #6, !dbg !49 + %489 = extractvalue { i32, i32, i32, i32 } %488, 0, !dbg !49 + %490 = extractvalue { i32, i32, i32, i32 } %488, 1, !dbg !49 + %491 = extractvalue { i32, i32, i32, i32 } %488, 2, !dbg !49 + %492 = extractvalue { i32, i32, i32, i32 } %488, 3, !dbg !49 + %493 = bitcast i32 %489 to float, !dbg !49 + %494 = bitcast i32 %490 to float, !dbg !49 + %495 = bitcast i32 %491 to float, !dbg !49 + %496 = bitcast i32 %492 to float, !dbg !49 + %497 = tail call float @llvm.nvvm.div.full(float %463, float 1.280000e+02), !dbg !50 + %498 = tail call float @llvm.nvvm.div.full(float %464, float 1.280000e+02), !dbg !50 + %499 = tail call float @llvm.nvvm.div.full(float %465, float 1.280000e+02), !dbg !50 + %500 = tail call float @llvm.nvvm.div.full(float %466, float 1.280000e+02), !dbg !50 + %501 = tail call float @llvm.nvvm.div.full(float %473, float 1.280000e+02), !dbg !50 + %502 = tail call float @llvm.nvvm.div.full(float %474, float 1.280000e+02), !dbg !50 + %503 = tail call float @llvm.nvvm.div.full(float %475, float 1.280000e+02), !dbg !50 + %504 = tail call float @llvm.nvvm.div.full(float %476, float 1.280000e+02), !dbg !50 + %505 = tail call float @llvm.nvvm.div.full(float %483, float 1.280000e+02), !dbg !50 + %506 = tail call float @llvm.nvvm.div.full(float %484, float 1.280000e+02), !dbg !50 + %507 = tail call float @llvm.nvvm.div.full(float %485, float 1.280000e+02), !dbg !50 + %508 = tail call float @llvm.nvvm.div.full(float %486, float 1.280000e+02), !dbg !50 + %509 = tail call float @llvm.nvvm.div.full(float %493, float 1.280000e+02), !dbg !50 + %510 = tail call float @llvm.nvvm.div.full(float %494, float 1.280000e+02), !dbg !50 + %511 = tail call float @llvm.nvvm.div.full(float %495, float 1.280000e+02), !dbg !50 + %512 = tail call float @llvm.nvvm.div.full(float %496, float 1.280000e+02), !dbg !50 + %513 = fadd float %497, 0x3EB0C6F7A0000000, !dbg !51 + %514 = fadd float %498, 0x3EB0C6F7A0000000, !dbg !51 + %515 = fadd float %499, 0x3EB0C6F7A0000000, !dbg !51 + %516 = fadd float %500, 0x3EB0C6F7A0000000, !dbg !51 + %517 = fadd float %501, 0x3EB0C6F7A0000000, !dbg !51 + %518 = fadd float %502, 0x3EB0C6F7A0000000, !dbg !51 + %519 = fadd float %503, 0x3EB0C6F7A0000000, !dbg !51 + %520 = fadd float %504, 0x3EB0C6F7A0000000, !dbg !51 + %521 = fadd float %505, 0x3EB0C6F7A0000000, !dbg !51 + %522 = fadd float %506, 0x3EB0C6F7A0000000, !dbg !51 + %523 = fadd float %507, 0x3EB0C6F7A0000000, !dbg !51 + %524 = fadd float %508, 0x3EB0C6F7A0000000, !dbg !51 + %525 = fadd float %509, 0x3EB0C6F7A0000000, !dbg !51 + %526 = fadd float %510, 0x3EB0C6F7A0000000, !dbg !51 + %527 = fadd float %511, 0x3EB0C6F7A0000000, !dbg !51 + %528 = fadd float %512, 0x3EB0C6F7A0000000, !dbg !51 + %529 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i100 = icmp eq i32 %529, 0, !dbg !52 + br i1 %.not.i100, label %532, label %530, !dbg !52 + +530: ; preds = %__nv_rsqrtf.exit99 + %531 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %513), !dbg !52 + br label %__nv_rsqrtf.exit102, !dbg !52 + +532: ; preds = %__nv_rsqrtf.exit99 + %533 = tail call float @llvm.nvvm.rsqrt.approx.f(float %513), !dbg !52 + br label %__nv_rsqrtf.exit102, !dbg !52 + +__nv_rsqrtf.exit102: ; preds = %530, %532 + %.0.i101 = phi float [ %531, %530 ], [ %533, %532 ], !dbg !52 + %534 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i103 = icmp eq i32 %534, 0, !dbg !52 + br i1 %.not.i103, label %537, label %535, !dbg !52 + +535: ; preds = %__nv_rsqrtf.exit102 + %536 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %514), !dbg !52 + br label %__nv_rsqrtf.exit105, !dbg !52 + +537: ; preds = %__nv_rsqrtf.exit102 + %538 = tail call float @llvm.nvvm.rsqrt.approx.f(float %514), !dbg !52 + br label %__nv_rsqrtf.exit105, !dbg !52 + +__nv_rsqrtf.exit105: ; preds = %535, %537 + %.0.i104 = phi float [ %536, %535 ], [ %538, %537 ], !dbg !52 + %539 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i106 = icmp eq i32 %539, 0, !dbg !52 + br i1 %.not.i106, label %542, label %540, !dbg !52 + +540: ; preds = %__nv_rsqrtf.exit105 + %541 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %515), !dbg !52 + br label %__nv_rsqrtf.exit108, !dbg !52 + +542: ; preds = %__nv_rsqrtf.exit105 + %543 = tail call float @llvm.nvvm.rsqrt.approx.f(float %515), !dbg !52 + br label %__nv_rsqrtf.exit108, !dbg !52 + +__nv_rsqrtf.exit108: ; preds = %540, %542 + %.0.i107 = phi float [ %541, %540 ], [ %543, %542 ], !dbg !52 + %544 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i109 = icmp eq i32 %544, 0, !dbg !52 + br i1 %.not.i109, label %547, label %545, !dbg !52 + +545: ; preds = %__nv_rsqrtf.exit108 + %546 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %516), !dbg !52 + br label %__nv_rsqrtf.exit111, !dbg !52 + +547: ; preds = %__nv_rsqrtf.exit108 + %548 = tail call float @llvm.nvvm.rsqrt.approx.f(float %516), !dbg !52 + br label %__nv_rsqrtf.exit111, !dbg !52 + +__nv_rsqrtf.exit111: ; preds = %545, %547 + %.0.i110 = phi float [ %546, %545 ], [ %548, %547 ], !dbg !52 + %549 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i112 = icmp eq i32 %549, 0, !dbg !52 + br i1 %.not.i112, label %552, label %550, !dbg !52 + +550: ; preds = %__nv_rsqrtf.exit111 + %551 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %517), !dbg !52 + br label %__nv_rsqrtf.exit114, !dbg !52 + +552: ; preds = %__nv_rsqrtf.exit111 + %553 = tail call float @llvm.nvvm.rsqrt.approx.f(float %517), !dbg !52 + br label %__nv_rsqrtf.exit114, !dbg !52 + +__nv_rsqrtf.exit114: ; preds = %550, %552 + %.0.i113 = phi float [ %551, %550 ], [ %553, %552 ], !dbg !52 + %554 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i115 = icmp eq i32 %554, 0, !dbg !52 + br i1 %.not.i115, label %557, label %555, !dbg !52 + +555: ; preds = %__nv_rsqrtf.exit114 + %556 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %518), !dbg !52 + br label %__nv_rsqrtf.exit117, !dbg !52 + +557: ; preds = %__nv_rsqrtf.exit114 + %558 = tail call float @llvm.nvvm.rsqrt.approx.f(float %518), !dbg !52 + br label %__nv_rsqrtf.exit117, !dbg !52 + +__nv_rsqrtf.exit117: ; preds = %555, %557 + %.0.i116 = phi float [ %556, %555 ], [ %558, %557 ], !dbg !52 + %559 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i118 = icmp eq i32 %559, 0, !dbg !52 + br i1 %.not.i118, label %562, label %560, !dbg !52 + +560: ; preds = %__nv_rsqrtf.exit117 + %561 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %519), !dbg !52 + br label %__nv_rsqrtf.exit120, !dbg !52 + +562: ; preds = %__nv_rsqrtf.exit117 + %563 = tail call float @llvm.nvvm.rsqrt.approx.f(float %519), !dbg !52 + br label %__nv_rsqrtf.exit120, !dbg !52 + +__nv_rsqrtf.exit120: ; preds = %560, %562 + %.0.i119 = phi float [ %561, %560 ], [ %563, %562 ], !dbg !52 + %564 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i121 = icmp eq i32 %564, 0, !dbg !52 + br i1 %.not.i121, label %567, label %565, !dbg !52 + +565: ; preds = %__nv_rsqrtf.exit120 + %566 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %520), !dbg !52 + br label %__nv_rsqrtf.exit123, !dbg !52 + +567: ; preds = %__nv_rsqrtf.exit120 + %568 = tail call float @llvm.nvvm.rsqrt.approx.f(float %520), !dbg !52 + br label %__nv_rsqrtf.exit123, !dbg !52 + +__nv_rsqrtf.exit123: ; preds = %565, %567 + %.0.i122 = phi float [ %566, %565 ], [ %568, %567 ], !dbg !52 + %569 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i124 = icmp eq i32 %569, 0, !dbg !52 + br i1 %.not.i124, label %572, label %570, !dbg !52 + +570: ; preds = %__nv_rsqrtf.exit123 + %571 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %521), !dbg !52 + br label %__nv_rsqrtf.exit126, !dbg !52 + +572: ; preds = %__nv_rsqrtf.exit123 + %573 = tail call float @llvm.nvvm.rsqrt.approx.f(float %521), !dbg !52 + br label %__nv_rsqrtf.exit126, !dbg !52 + +__nv_rsqrtf.exit126: ; preds = %570, %572 + %.0.i125 = phi float [ %571, %570 ], [ %573, %572 ], !dbg !52 + %574 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i127 = icmp eq i32 %574, 0, !dbg !52 + br i1 %.not.i127, label %577, label %575, !dbg !52 + +575: ; preds = %__nv_rsqrtf.exit126 + %576 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %522), !dbg !52 + br label %__nv_rsqrtf.exit129, !dbg !52 + +577: ; preds = %__nv_rsqrtf.exit126 + %578 = tail call float @llvm.nvvm.rsqrt.approx.f(float %522), !dbg !52 + br label %__nv_rsqrtf.exit129, !dbg !52 + +__nv_rsqrtf.exit129: ; preds = %575, %577 + %.0.i128 = phi float [ %576, %575 ], [ %578, %577 ], !dbg !52 + %579 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i130 = icmp eq i32 %579, 0, !dbg !52 + br i1 %.not.i130, label %582, label %580, !dbg !52 + +580: ; preds = %__nv_rsqrtf.exit129 + %581 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %523), !dbg !52 + br label %__nv_rsqrtf.exit132, !dbg !52 + +582: ; preds = %__nv_rsqrtf.exit129 + %583 = tail call float @llvm.nvvm.rsqrt.approx.f(float %523), !dbg !52 + br label %__nv_rsqrtf.exit132, !dbg !52 + +__nv_rsqrtf.exit132: ; preds = %580, %582 + %.0.i131 = phi float [ %581, %580 ], [ %583, %582 ], !dbg !52 + %584 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i133 = icmp eq i32 %584, 0, !dbg !52 + br i1 %.not.i133, label %587, label %585, !dbg !52 + +585: ; preds = %__nv_rsqrtf.exit132 + %586 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %524), !dbg !52 + br label %__nv_rsqrtf.exit135, !dbg !52 + +587: ; preds = %__nv_rsqrtf.exit132 + %588 = tail call float @llvm.nvvm.rsqrt.approx.f(float %524), !dbg !52 + br label %__nv_rsqrtf.exit135, !dbg !52 + +__nv_rsqrtf.exit135: ; preds = %585, %587 + %.0.i134 = phi float [ %586, %585 ], [ %588, %587 ], !dbg !52 + %589 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i136 = icmp eq i32 %589, 0, !dbg !52 + br i1 %.not.i136, label %592, label %590, !dbg !52 + +590: ; preds = %__nv_rsqrtf.exit135 + %591 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %525), !dbg !52 + br label %__nv_rsqrtf.exit138, !dbg !52 + +592: ; preds = %__nv_rsqrtf.exit135 + %593 = tail call float @llvm.nvvm.rsqrt.approx.f(float %525), !dbg !52 + br label %__nv_rsqrtf.exit138, !dbg !52 + +__nv_rsqrtf.exit138: ; preds = %590, %592 + %.0.i137 = phi float [ %591, %590 ], [ %593, %592 ], !dbg !52 + %594 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i139 = icmp eq i32 %594, 0, !dbg !52 + br i1 %.not.i139, label %597, label %595, !dbg !52 + +595: ; preds = %__nv_rsqrtf.exit138 + %596 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %526), !dbg !52 + br label %__nv_rsqrtf.exit141, !dbg !52 + +597: ; preds = %__nv_rsqrtf.exit138 + %598 = tail call float @llvm.nvvm.rsqrt.approx.f(float %526), !dbg !52 + br label %__nv_rsqrtf.exit141, !dbg !52 + +__nv_rsqrtf.exit141: ; preds = %595, %597 + %.0.i140 = phi float [ %596, %595 ], [ %598, %597 ], !dbg !52 + %599 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i142 = icmp eq i32 %599, 0, !dbg !52 + br i1 %.not.i142, label %602, label %600, !dbg !52 + +600: ; preds = %__nv_rsqrtf.exit141 + %601 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %527), !dbg !52 + br label %__nv_rsqrtf.exit144, !dbg !52 + +602: ; preds = %__nv_rsqrtf.exit141 + %603 = tail call float @llvm.nvvm.rsqrt.approx.f(float %527), !dbg !52 + br label %__nv_rsqrtf.exit144, !dbg !52 + +__nv_rsqrtf.exit144: ; preds = %600, %602 + %.0.i143 = phi float [ %601, %600 ], [ %603, %602 ], !dbg !52 + %604 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i145 = icmp eq i32 %604, 0, !dbg !52 + br i1 %.not.i145, label %607, label %605, !dbg !52 + +605: ; preds = %__nv_rsqrtf.exit144 + %606 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %528), !dbg !52 + br label %__nv_rsqrtf.exit147, !dbg !52 + +607: ; preds = %__nv_rsqrtf.exit144 + %608 = tail call float @llvm.nvvm.rsqrt.approx.f(float %528), !dbg !52 + br label %__nv_rsqrtf.exit147, !dbg !52 + +__nv_rsqrtf.exit147: ; preds = %605, %607 + %.0.i146 = phi float [ %606, %605 ], [ %608, %607 ], !dbg !52 + %609 = icmp slt i32 %23, 73728, !dbg !53 + %610 = icmp slt i32 %24, 8192, !dbg !23 + %611 = extractvalue { i32, i32, i32, i32 } %380, 3, !dbg !38 + %612 = extractvalue { i32, i32, i32, i32 } %386, 3, !dbg !40 + %613 = bitcast i32 %612 to <2 x bfloat>, !dbg !40 + %614 = extractvalue { i32, i32, i32, i32 } %380, 2, !dbg !38 + %615 = extractvalue { i32, i32, i32, i32 } %380, 1, !dbg !38 + %616 = extractvalue { i32, i32, i32, i32 } %386, 2, !dbg !40 + %617 = bitcast i32 %616 to <2 x bfloat>, !dbg !40 + %618 = extractvalue { i32, i32, i32, i32 } %380, 0, !dbg !38 + %619 = extractvalue { i32, i32, i32, i32 } %378, 3, !dbg !38 + %620 = extractvalue { i32, i32, i32, i32 } %386, 1, !dbg !40 + %621 = bitcast i32 %620 to <2 x bfloat>, !dbg !40 + %622 = extractvalue { i32, i32, i32, i32 } %378, 2, !dbg !38 + %623 = extractvalue { i32, i32, i32, i32 } %378, 1, !dbg !38 + %624 = extractvalue { i32, i32, i32, i32 } %386, 0, !dbg !40 + %625 = bitcast i32 %624 to <2 x bfloat>, !dbg !40 + %626 = extractvalue { i32, i32, i32, i32 } %378, 0, !dbg !38 + %627 = extractvalue { i32, i32, i32, i32 } %379, 3, !dbg !38 + %628 = extractvalue { i32, i32, i32, i32 } %384, 3, !dbg !40 + %629 = bitcast i32 %628 to <2 x bfloat>, !dbg !40 + %630 = extractvalue { i32, i32, i32, i32 } %379, 2, !dbg !38 + %631 = extractvalue { i32, i32, i32, i32 } %379, 1, !dbg !38 + %632 = extractvalue { i32, i32, i32, i32 } %384, 2, !dbg !40 + %633 = bitcast i32 %632 to <2 x bfloat>, !dbg !40 + %634 = extractvalue { i32, i32, i32, i32 } %379, 0, !dbg !38 + %635 = extractvalue { i32, i32, i32, i32 } %376, 3, !dbg !38 + %636 = extractvalue { i32, i32, i32, i32 } %384, 1, !dbg !40 + %637 = bitcast i32 %636 to <2 x bfloat>, !dbg !40 + %638 = extractvalue { i32, i32, i32, i32 } %376, 2, !dbg !38 + %639 = extractvalue { i32, i32, i32, i32 } %376, 1, !dbg !38 + %640 = extractvalue { i32, i32, i32, i32 } %384, 0, !dbg !40 + %641 = bitcast i32 %640 to <2 x bfloat>, !dbg !40 + %642 = extractvalue { i32, i32, i32, i32 } %376, 0, !dbg !38 + %643 = shufflevector <2 x bfloat> %444, <2 x bfloat> %448, <4 x i32> , !dbg !45 + %644 = shufflevector <2 x bfloat> %445, <2 x bfloat> poison, <4 x i32> , !dbg !45 + %645 = shufflevector <4 x bfloat> %643, <4 x bfloat> %644, <4 x i32> , !dbg !45 + %646 = shufflevector <2 x bfloat> %449, <2 x bfloat> poison, <4 x i32> , !dbg !45 + %647 = shufflevector <4 x bfloat> %645, <4 x bfloat> %646, <4 x i32> , !dbg !45 + %648 = fpext <4 x bfloat> %647 to <4 x float>, !dbg !45 + %649 = insertelement <4 x float> poison, float %.0.i101, i64 0, !dbg !54 + %650 = insertelement <4 x float> %649, float %.0.i125, i64 1, !dbg !54 + %651 = insertelement <4 x float> %650, float %.0.i104, i64 2, !dbg !54 + %652 = insertelement <4 x float> %651, float %.0.i128, i64 3, !dbg !54 + %653 = fmul <4 x float> %652, %648, !dbg !54 + %654 = shufflevector <2 x bfloat> %446, <2 x bfloat> %450, <4 x i32> , !dbg !45 + %655 = shufflevector <2 x bfloat> %447, <2 x bfloat> poison, <4 x i32> , !dbg !45 + %656 = shufflevector <4 x bfloat> %654, <4 x bfloat> %655, <4 x i32> , !dbg !45 + %657 = shufflevector <2 x bfloat> %451, <2 x bfloat> poison, <4 x i32> , !dbg !45 + %658 = shufflevector <4 x bfloat> %656, <4 x bfloat> %657, <4 x i32> , !dbg !45 + %659 = fpext <4 x bfloat> %658 to <4 x float>, !dbg !45 + %660 = insertelement <4 x float> poison, float %.0.i107, i64 0, !dbg !54 + %661 = insertelement <4 x float> %660, float %.0.i131, i64 1, !dbg !54 + %662 = insertelement <4 x float> %661, float %.0.i110, i64 2, !dbg !54 + %663 = insertelement <4 x float> %662, float %.0.i134, i64 3, !dbg !54 + %664 = fmul <4 x float> %663, %659, !dbg !54 + %665 = shufflevector <2 x bfloat> %444, <2 x bfloat> %448, <4 x i32> , !dbg !45 + %666 = shufflevector <2 x bfloat> %445, <2 x bfloat> poison, <4 x i32> , !dbg !45 + %667 = shufflevector <4 x bfloat> %665, <4 x bfloat> %666, <4 x i32> , !dbg !45 + %668 = shufflevector <2 x bfloat> %449, <2 x bfloat> poison, <4 x i32> , !dbg !45 + %669 = shufflevector <4 x bfloat> %667, <4 x bfloat> %668, <4 x i32> , !dbg !45 + %670 = fpext <4 x bfloat> %669 to <4 x float>, !dbg !45 + %671 = insertelement <4 x float> poison, float %.0.i113, i64 0, !dbg !54 + %672 = insertelement <4 x float> %671, float %.0.i137, i64 1, !dbg !54 + %673 = insertelement <4 x float> %672, float %.0.i116, i64 2, !dbg !54 + %674 = insertelement <4 x float> %673, float %.0.i140, i64 3, !dbg !54 + %675 = fmul <4 x float> %674, %670, !dbg !54 + %676 = shufflevector <2 x bfloat> %446, <2 x bfloat> %450, <4 x i32> , !dbg !45 + %677 = shufflevector <2 x bfloat> %447, <2 x bfloat> poison, <4 x i32> , !dbg !45 + %678 = shufflevector <4 x bfloat> %676, <4 x bfloat> %677, <4 x i32> , !dbg !45 + %679 = shufflevector <2 x bfloat> %451, <2 x bfloat> poison, <4 x i32> , !dbg !45 + %680 = shufflevector <4 x bfloat> %678, <4 x bfloat> %679, <4 x i32> , !dbg !45 + %681 = fpext <4 x bfloat> %680 to <4 x float>, !dbg !45 + %682 = insertelement <4 x float> poison, float %.0.i119, i64 0, !dbg !54 + %683 = insertelement <4 x float> %682, float %.0.i143, i64 1, !dbg !54 + %684 = insertelement <4 x float> %683, float %.0.i122, i64 2, !dbg !54 + %685 = insertelement <4 x float> %684, float %.0.i146, i64 3, !dbg !54 + %686 = fmul <4 x float> %685, %681, !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54 + store <4 x float> %653, ptr addrspace(3) %364, align 16, !dbg !54 + store <4 x float> %664, ptr addrspace(3) %366, align 16, !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54 + %687 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) %375), !dbg !54 + %688 = extractvalue { i32, i32, i32, i32 } %687, 0, !dbg !54 + %689 = extractvalue { i32, i32, i32, i32 } %687, 1, !dbg !54 + %690 = extractvalue { i32, i32, i32, i32 } %687, 2, !dbg !54 + %691 = extractvalue { i32, i32, i32, i32 } %687, 3, !dbg !54 + %692 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) nonnull %377), !dbg !54 + %693 = extractvalue { i32, i32, i32, i32 } %692, 0, !dbg !54 + %694 = extractvalue { i32, i32, i32, i32 } %692, 1, !dbg !54 + %695 = extractvalue { i32, i32, i32, i32 } %692, 2, !dbg !54 + %696 = extractvalue { i32, i32, i32, i32 } %692, 3, !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54 + store <4 x float> %675, ptr addrspace(3) %364, align 16, !dbg !54 + store <4 x float> %686, ptr addrspace(3) %366, align 16, !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54 + %697 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) %375), !dbg !54 + %698 = extractvalue { i32, i32, i32, i32 } %697, 0, !dbg !54 + %699 = extractvalue { i32, i32, i32, i32 } %697, 1, !dbg !54 + %700 = extractvalue { i32, i32, i32, i32 } %697, 2, !dbg !54 + %701 = extractvalue { i32, i32, i32, i32 } %697, 3, !dbg !54 + %702 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) nonnull %377), !dbg !54 + %703 = extractvalue { i32, i32, i32, i32 } %702, 0, !dbg !54 + %704 = extractvalue { i32, i32, i32, i32 } %702, 1, !dbg !54 + %705 = extractvalue { i32, i32, i32, i32 } %702, 2, !dbg !54 + %706 = extractvalue { i32, i32, i32, i32 } %702, 3, !dbg !54 + %707 = getelementptr bfloat, ptr addrspace(1) %5, i64 %381, !dbg !55 + %708 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !56 + %709 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %707, i64 %708, i1 %395) #6, !dbg !56 + %710 = extractvalue { i32, i32, i32, i32 } %709, 0, !dbg !56 + %711 = bitcast i32 %710 to <2 x bfloat>, !dbg !56 + %712 = extractvalue { i32, i32, i32, i32 } %709, 1, !dbg !56 + %713 = bitcast i32 %712 to <2 x bfloat>, !dbg !56 + %714 = extractvalue { i32, i32, i32, i32 } %709, 2, !dbg !56 + %715 = bitcast i32 %714 to <2 x bfloat>, !dbg !56 + %716 = extractvalue { i32, i32, i32, i32 } %709, 3, !dbg !56 + %717 = bitcast i32 %716 to <2 x bfloat>, !dbg !56 + %718 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !56 + %719 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %707, i64 %718, i1 %398) #6, !dbg !56 + %720 = extractvalue { i32, i32, i32, i32 } %719, 0, !dbg !56 + %721 = bitcast i32 %720 to <2 x bfloat>, !dbg !56 + %722 = extractvalue { i32, i32, i32, i32 } %719, 1, !dbg !56 + %723 = bitcast i32 %722 to <2 x bfloat>, !dbg !56 + %724 = extractvalue { i32, i32, i32, i32 } %719, 2, !dbg !56 + %725 = bitcast i32 %724 to <2 x bfloat>, !dbg !56 + %726 = extractvalue { i32, i32, i32, i32 } %719, 3, !dbg !56 + %727 = bitcast i32 %726 to <2 x bfloat>, !dbg !56 + %728 = shl i32 %23, 7, !dbg !57 + %729 = shl i32 %24, 7, !dbg !57 + %730 = add i32 %728, %33, !dbg !58 + %731 = add i32 %729, %33, !dbg !58 + %732 = sext i32 %730 to i64, !dbg !59 + %733 = getelementptr bfloat, ptr addrspace(1) %6, i64 %732, !dbg !59 + %734 = sext i32 %731 to i64, !dbg !59 + %735 = getelementptr bfloat, ptr addrspace(1) %6, i64 %734, !dbg !59 + %736 = and i1 %35, %609, !dbg !60 + %737 = insertelement <2 x i32> poison, i32 %642, i64 0, !dbg !38 + %738 = insertelement <2 x i32> %737, i32 %639, i64 1, !dbg !38 + %739 = bitcast <2 x i32> %738 to <2 x float>, !dbg !38 + %740 = fpext <2 x bfloat> %641 to <2 x float>, !dbg !61 + %741 = fmul <2 x float> %739, %740, !dbg !62 + %742 = insertelement <2 x i32> poison, i32 %688, i64 0, !dbg !54 + %743 = insertelement <2 x i32> %742, i32 %689, i64 1, !dbg !54 + %744 = bitcast <2 x i32> %743 to <2 x float>, !dbg !54 + %745 = fpext <2 x bfloat> %711 to <2 x float>, !dbg !63 + %746 = fmul <2 x float> %744, %745, !dbg !64 + %747 = insertelement <2 x i1> poison, i1 %42, i64 0, !dbg !65 + %748 = shufflevector <2 x i1> %747, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !65 + %749 = select <2 x i1> %748, <2 x float> %741, <2 x float> %746, !dbg !65 + %750 = fptrunc <2 x float> %749 to <2 x bfloat>, !dbg !66 + %751 = insertelement <2 x i32> poison, i32 %638, i64 0, !dbg !38 + %752 = insertelement <2 x i32> %751, i32 %635, i64 1, !dbg !38 + %753 = bitcast <2 x i32> %752 to <2 x float>, !dbg !38 + %754 = fpext <2 x bfloat> %637 to <2 x float>, !dbg !61 + %755 = fmul <2 x float> %753, %754, !dbg !62 + %756 = insertelement <2 x i32> poison, i32 %690, i64 0, !dbg !54 + %757 = insertelement <2 x i32> %756, i32 %691, i64 1, !dbg !54 + %758 = bitcast <2 x i32> %757 to <2 x float>, !dbg !54 + %759 = fpext <2 x bfloat> %713 to <2 x float>, !dbg !63 + %760 = fmul <2 x float> %758, %759, !dbg !64 + %761 = select <2 x i1> %748, <2 x float> %755, <2 x float> %760, !dbg !65 + %762 = fptrunc <2 x float> %761 to <2 x bfloat>, !dbg !66 + %763 = insertelement <2 x i32> poison, i32 %634, i64 0, !dbg !38 + %764 = insertelement <2 x i32> %763, i32 %631, i64 1, !dbg !38 + %765 = bitcast <2 x i32> %764 to <2 x float>, !dbg !38 + %766 = fpext <2 x bfloat> %633 to <2 x float>, !dbg !61 + %767 = fmul <2 x float> %765, %766, !dbg !62 + %768 = insertelement <2 x i32> poison, i32 %698, i64 0, !dbg !54 + %769 = insertelement <2 x i32> %768, i32 %699, i64 1, !dbg !54 + %770 = bitcast <2 x i32> %769 to <2 x float>, !dbg !54 + %771 = fpext <2 x bfloat> %715 to <2 x float>, !dbg !63 + %772 = fmul <2 x float> %770, %771, !dbg !64 + %773 = select <2 x i1> %748, <2 x float> %767, <2 x float> %772, !dbg !65 + %774 = fptrunc <2 x float> %773 to <2 x bfloat>, !dbg !66 + %775 = insertelement <2 x i32> poison, i32 %630, i64 0, !dbg !38 + %776 = insertelement <2 x i32> %775, i32 %627, i64 1, !dbg !38 + %777 = bitcast <2 x i32> %776 to <2 x float>, !dbg !38 + %778 = fpext <2 x bfloat> %629 to <2 x float>, !dbg !61 + %779 = fmul <2 x float> %777, %778, !dbg !62 + %780 = insertelement <2 x i32> poison, i32 %700, i64 0, !dbg !54 + %781 = insertelement <2 x i32> %780, i32 %701, i64 1, !dbg !54 + %782 = bitcast <2 x i32> %781 to <2 x float>, !dbg !54 + %783 = fpext <2 x bfloat> %717 to <2 x float>, !dbg !63 + %784 = fmul <2 x float> %782, %783, !dbg !64 + %785 = select <2 x i1> %748, <2 x float> %779, <2 x float> %784, !dbg !65 + %786 = fptrunc <2 x float> %785 to <2 x bfloat>, !dbg !66 + %787 = insertelement <2 x i32> poison, i32 %626, i64 0, !dbg !38 + %788 = insertelement <2 x i32> %787, i32 %623, i64 1, !dbg !38 + %789 = bitcast <2 x i32> %788 to <2 x float>, !dbg !38 + %790 = fpext <2 x bfloat> %625 to <2 x float>, !dbg !61 + %791 = fmul <2 x float> %789, %790, !dbg !62 + %792 = insertelement <2 x i32> poison, i32 %693, i64 0, !dbg !54 + %793 = insertelement <2 x i32> %792, i32 %694, i64 1, !dbg !54 + %794 = bitcast <2 x i32> %793 to <2 x float>, !dbg !54 + %795 = fpext <2 x bfloat> %721 to <2 x float>, !dbg !63 + %796 = fmul <2 x float> %794, %795, !dbg !64 + %797 = insertelement <2 x i1> poison, i1 %610, i64 0, !dbg !65 + %798 = shufflevector <2 x i1> %797, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !65 + %799 = select <2 x i1> %798, <2 x float> %791, <2 x float> %796, !dbg !65 + %800 = fptrunc <2 x float> %799 to <2 x bfloat>, !dbg !66 + %801 = insertelement <2 x i32> poison, i32 %622, i64 0, !dbg !38 + %802 = insertelement <2 x i32> %801, i32 %619, i64 1, !dbg !38 + %803 = bitcast <2 x i32> %802 to <2 x float>, !dbg !38 + %804 = fpext <2 x bfloat> %621 to <2 x float>, !dbg !61 + %805 = fmul <2 x float> %803, %804, !dbg !62 + %806 = insertelement <2 x i32> poison, i32 %695, i64 0, !dbg !54 + %807 = insertelement <2 x i32> %806, i32 %696, i64 1, !dbg !54 + %808 = bitcast <2 x i32> %807 to <2 x float>, !dbg !54 + %809 = fpext <2 x bfloat> %723 to <2 x float>, !dbg !63 + %810 = fmul <2 x float> %808, %809, !dbg !64 + %811 = select <2 x i1> %798, <2 x float> %805, <2 x float> %810, !dbg !65 + %812 = fptrunc <2 x float> %811 to <2 x bfloat>, !dbg !66 + %813 = insertelement <2 x i32> poison, i32 %618, i64 0, !dbg !38 + %814 = insertelement <2 x i32> %813, i32 %615, i64 1, !dbg !38 + %815 = bitcast <2 x i32> %814 to <2 x float>, !dbg !38 + %816 = fpext <2 x bfloat> %617 to <2 x float>, !dbg !61 + %817 = fmul <2 x float> %815, %816, !dbg !62 + %818 = insertelement <2 x i32> poison, i32 %703, i64 0, !dbg !54 + %819 = insertelement <2 x i32> %818, i32 %704, i64 1, !dbg !54 + %820 = bitcast <2 x i32> %819 to <2 x float>, !dbg !54 + %821 = fpext <2 x bfloat> %725 to <2 x float>, !dbg !63 + %822 = fmul <2 x float> %820, %821, !dbg !64 + %823 = select <2 x i1> %798, <2 x float> %817, <2 x float> %822, !dbg !65 + %824 = fptrunc <2 x float> %823 to <2 x bfloat>, !dbg !66 + %825 = insertelement <2 x i32> poison, i32 %614, i64 0, !dbg !38 + %826 = insertelement <2 x i32> %825, i32 %611, i64 1, !dbg !38 + %827 = bitcast <2 x i32> %826 to <2 x float>, !dbg !38 + %828 = fpext <2 x bfloat> %613 to <2 x float>, !dbg !61 + %829 = fmul <2 x float> %827, %828, !dbg !62 + %830 = insertelement <2 x i32> poison, i32 %705, i64 0, !dbg !54 + %831 = insertelement <2 x i32> %830, i32 %706, i64 1, !dbg !54 + %832 = bitcast <2 x i32> %831 to <2 x float>, !dbg !54 + %833 = fpext <2 x bfloat> %727 to <2 x float>, !dbg !63 + %834 = fmul <2 x float> %832, %833, !dbg !64 + %835 = select <2 x i1> %798, <2 x float> %829, <2 x float> %834, !dbg !65 + %836 = fptrunc <2 x float> %835 to <2 x bfloat>, !dbg !66 + %837 = bitcast <2 x bfloat> %750 to i32, !dbg !66 + %838 = bitcast <2 x bfloat> %762 to i32, !dbg !66 + %839 = bitcast <2 x bfloat> %774 to i32, !dbg !66 + %840 = bitcast <2 x bfloat> %786 to i32, !dbg !66 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %837, i32 %838, i32 %839, i32 %840, ptr addrspace(1) %733, i1 %736) #6, !dbg !66 + %841 = bitcast <2 x bfloat> %800 to i32, !dbg !66 + %842 = bitcast <2 x bfloat> %812 to i32, !dbg !66 + %843 = bitcast <2 x bfloat> %824 to i32, !dbg !66 + %844 = bitcast <2 x bfloat> %836 to i32, !dbg !66 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %841, i32 %842, i32 %843, i32 %844, ptr addrspace(1) %735, i1 %736) #6, !dbg !66 + ret void, !dbg !67 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #3 + +; Function Attrs: nocallback nofree nounwind memory(argmem: read) +declare { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) readonly captures(none)) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #4 = { nocallback nofree nounwind memory(argmem: read) } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_poi_fused__fused_rms_norm_cat_view_2", linkageName: "triton_poi_fused__fused_rms_norm_cat_view_2", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 21, column: 29, scope: !5) +!9 = !DILocation(line: 21, column: 48, scope: !5) +!10 = !DILocation(line: 21, column: 69, scope: !5) +!11 = !DILocation(line: 21, column: 53, scope: !5) +!12 = !DILocation(line: 21, column: 34, scope: !5) +!13 = !DILocation(line: 21, column: 75, scope: !5) +!14 = !DILocation(line: 22, column: 44, scope: !5) +!15 = !DILocation(line: 22, column: 23, scope: !5) +!16 = !DILocation(line: 24, column: 28, scope: !5) +!17 = !DILocation(line: 24, column: 33, scope: !5) +!18 = !DILocation(line: 25, column: 44, scope: !5) +!19 = !DILocation(line: 25, column: 23, scope: !5) +!20 = !DILocation(line: 26, column: 21, scope: !5) +!21 = !DILocation(line: 27, column: 19, scope: !5) +!22 = !DILocation(line: 29, column: 19, scope: !5) +!23 = !DILocation(line: 35, column: 18, scope: !5) +!24 = !DILocation(line: 36, column: 39, scope: !5) +!25 = !DILocation(line: 36, column: 35, scope: !5) +!26 = !DILocation(line: 36, column: 51, scope: !5) +!27 = !DILocation(line: 36, column: 44, scope: !5) +!28 = !DILocation(line: 36, column: 30, scope: !5) +!29 = !DILocation(line: 36, column: 64, scope: !5) +!30 = !DILocation(line: 36, column: 72, scope: !5) +!31 = !DILocation(line: 36, column: 57, scope: !5) +!32 = !DILocation(line: 36, column: 123, scope: !5) +!33 = !DILocation(line: 38, column: 30, scope: !5) +!34 = !DILocation(line: 38, column: 80, scope: !5) +!35 = !DILocation(line: 40, column: 19, scope: !5) +!36 = !DILocation(line: 42, column: 19, scope: !5) +!37 = !DILocation(line: 43, column: 28, scope: !5) +!38 = !DILocation(line: 44, column: 19, scope: !5) +!39 = !DILocation(line: 45, column: 31, scope: !5) +!40 = !DILocation(line: 45, column: 71, scope: !5) +!41 = !DILocation(line: 54, column: 45, scope: !5) +!42 = !DILocation(line: 54, column: 31, scope: !5) +!43 = !DILocation(line: 54, column: 83, scope: !5) +!44 = !DILocation(line: 54, column: 67, scope: !5) +!45 = !DILocation(line: 54, column: 134, scope: !5) +!46 = !DILocation(line: 56, column: 56, scope: !5) +!47 = !DILocation(line: 56, column: 52, scope: !5) +!48 = !DILocation(line: 56, column: 31, scope: !5) +!49 = !DILocation(line: 56, column: 90, scope: !5) +!50 = !DILocation(line: 58, column: 21, scope: !5) +!51 = !DILocation(line: 60, column: 20, scope: !5) +!52 = !DILocation(line: 61, column: 28, scope: !5) +!53 = !DILocation(line: 23, column: 21, scope: !5) +!54 = !DILocation(line: 62, column: 20, scope: !5) +!55 = !DILocation(line: 63, column: 31, scope: !5) +!56 = !DILocation(line: 63, column: 71, scope: !5) +!57 = !DILocation(line: 70, column: 34, scope: !5) +!58 = !DILocation(line: 70, column: 30, scope: !5) +!59 = !DILocation(line: 70, column: 25, scope: !5) +!60 = !DILocation(line: 70, column: 54, scope: !5) +!61 = !DILocation(line: 45, column: 137, scope: !5) +!62 = !DILocation(line: 47, column: 20, scope: !5) +!63 = !DILocation(line: 63, column: 138, scope: !5) +!64 = !DILocation(line: 65, column: 20, scope: !5) +!65 = !DILocation(line: 0, scope: !5) +!66 = !DILocation(line: 70, column: 46, scope: !5) +!67 = !DILocation(line: 70, column: 4, scope: !5) diff --git a/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.ptx b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..5a427e450939597d2805f81b8a4a782ae1d609b0 --- /dev/null +++ b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.ptx @@ -0,0 +1,1038 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused__fused_rms_norm_cat_view_2 // -- Begin function triton_poi_fused__fused_rms_norm_cat_view_2 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_poi_fused__fused_rms_norm_cat_view_2 +.visible .entry triton_poi_fused__fused_rms_norm_cat_view_2( + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_4, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_5, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_6, + .param .u32 triton_poi_fused__fused_rms_norm_cat_view_2_param_7, + .param .u32 triton_poi_fused__fused_rms_norm_cat_view_2_param_8, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_9, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_10 +) +.reqntid 256 +{ + .reg .pred %p<17>; + .reg .b16 %rs<65>; + .reg .b32 %r<452>; + .reg .b64 %rd<35>; + .loc 1 18 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:18:0 + +// %bb.0: // %__nv_rsqrtf.exit + ld.param.b64 %rd27, [triton_poi_fused__fused_rms_norm_cat_view_2_param_0]; + ld.param.b64 %rd28, [triton_poi_fused__fused_rms_norm_cat_view_2_param_1]; +$L__tmp0: + .loc 1 21 29 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:29 + mov.u32 %r74, %ctaid.y; + ld.param.b64 %rd29, [triton_poi_fused__fused_rms_norm_cat_view_2_param_2]; + .loc 1 21 48 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:48 + mov.u32 %r75, %ctaid.z; + ld.param.b64 %rd30, [triton_poi_fused__fused_rms_norm_cat_view_2_param_3]; + .loc 1 21 69 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:69 + mov.u32 %r76, %nctaid.y; + ld.param.b64 %rd31, [triton_poi_fused__fused_rms_norm_cat_view_2_param_4]; + .loc 1 21 34 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:34 + mad.lo.s32 %r77, %r75, %r76, %r74; + ld.param.b64 %rd32, [triton_poi_fused__fused_rms_norm_cat_view_2_param_5]; + .loc 1 21 75 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:75 + shl.b32 %r78, %r77, 8; + ld.param.b64 %rd33, [triton_poi_fused__fused_rms_norm_cat_view_2_param_6]; + .loc 1 22 44 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:44 + mov.u32 %r79, %tid.x; + bfe.u32 %r80, %r79, 1, 7; + shl.b32 %r81, %r79, 2; + and.b32 %r82, %r81, 252; + .loc 1 22 23 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:23 + or.b32 %r83, %r78, %r80; + or.b32 %r84, %r83, 128; + or.b32 %r85, %r78, %r82; + .loc 1 24 28 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:24:28 + mov.u32 %r86, %ctaid.x; + .loc 1 24 33 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:24:33 + shl.b32 %r87, %r86, 4; + .loc 1 25 44 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:25:44 + and.b32 %r88, %r79, 1; + neg.s32 %r89, %r88; + shl.b32 %r90, %r88, 3; + bfe.u32 %r91, %r79, 6, 2; + .loc 1 25 23 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:25:23 + or.b32 %r92, %r90, %r87; + or.b32 %r93, %r91, %r87; + .loc 1 26 21 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:26:21 + setp.lt.s32 %p8, %r92, 128; + setp.lt.s32 %p9, %r93, 128; + .loc 1 27 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:27:19 + bfe.s32 %r94, %r77, 23, 1; + shr.u32 %r95, %r94, 27; + add.s32 %r96, %r83, %r95; + shr.u32 %r97, %r96, 5; + add.s32 %r98, %r84, %r95; + shr.u32 %r99, %r98, 5; + .loc 1 29 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:29:19 + and.b32 %r100, %r96, 33554400; + sub.s32 %r101, %r83, %r100; + .loc 1 35 18 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:35:18 + setp.lt.s32 %p10, %r83, 8192; + setp.lt.s32 %p11, %r85, 8192; + .loc 1 36 39 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:39 + shl.b32 %r102, %r101, 7; + .loc 1 36 35 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:35 + add.s32 %r103, %r102, %r92; + .loc 1 36 44 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:44 + mad.lo.s32 %r104, %r97, 12288, %r103; + mad.lo.s32 %r105, %r99, 12288, %r103; + .loc 1 36 30 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:30 + mad.wide.s32 %rd1, %r104, 2, %rd27; + mad.wide.s32 %rd3, %r105, 2, %rd27; + .loc 1 36 64 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:64 + and.pred %p1, %p8, %p10; + and.pred %p3, %p9, %p11; + .loc 1 36 72 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:72 + setp.lt.s32 %p12, %r83, 8064; + and.pred %p2, %p8, %p12; + .loc 1 36 57 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:57 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r5, 0; + // begin inline asm + mov.u32 %r1, %r5; + mov.u32 %r2, %r5; + mov.u32 %r3, %r5; + mov.u32 %r4, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2; + // end inline asm + prmt.b32 %r106, %r1, %r3, 0x7632U; + prmt.b32 %r107, %r2, %r4, 0x7632U; + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r6, %r5; + mov.u32 %r7, %r5; + mov.u32 %r8, %r5; + mov.u32 %r9, %r5; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd3 + 0 ], %rd4; + // end inline asm + prmt.b32 %r108, %r6, %r8, 0x7632U; + prmt.b32 %r109, %r7, %r9, 0x7632U; + .loc 1 36 123 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:123 + and.b32 %r110, %r79, 6; + and.b32 %r111, %r79, 120; + shl.b32 %r112, %r88, 2; + bfe.s32 %r113, %r79, 7, 1; + and.b32 %r114, %r113, 4100; + mul.lo.s32 %r115, %r110, 528; + or.b32 %r116, %r112, %r111; + xor.b32 %r117, %r115, %r116; + xor.b32 %r118, %r117, %r114; + mov.b32 %r119, global_smem; + add.s32 %r120, %r119, %r118; + prmt.b32 %r121, %r1, %r3, 0x5410U; + st.shared.b32 [%r120], %r121; + st.shared.b32 [%r120+256], %r106; + prmt.b32 %r122, %r2, %r4, 0x5410U; + st.shared.b32 [%r120+512], %r122; + st.shared.b32 [%r120+768], %r107; + prmt.b32 %r123, %r6, %r8, 0x5410U; + st.shared.b32 [%r120+128], %r123; + st.shared.b32 [%r120+384], %r108; + prmt.b32 %r124, %r7, %r9, 0x5410U; + st.shared.b32 [%r120+640], %r124; + st.shared.b32 [%r120+896], %r109; + bar.sync 0; + shl.b32 %r125, %r79, 3; + and.b32 %r126, %r125, 120; + and.b32 %r127, %r79, 224; + shl.b32 %r128, %r127, 2; + bfe.s32 %r129, %r79, 4, 1; + and.b32 %r130, %r129, 4100; + or.b32 %r131, %r130, %r128; + or.b32 %r132, %r131, %r126; + add.s32 %r133, %r119, %r132; + ld.shared.v2.b16 {%rs1, %rs2}, [%r133]; + xor.b32 %r134, %r132, 32; + add.s32 %r135, %r119, %r134; + ld.shared.v2.b16 {%rs3, %rs4}, [%r135+1024]; + xor.b32 %r136, %r132, 64; + add.s32 %r137, %r119, %r136; + ld.shared.v2.b16 {%rs5, %rs6}, [%r137+2048]; + xor.b32 %r138, %r132, 96; + add.s32 %r139, %r119, %r138; + ld.shared.v2.b16 {%rs7, %rs8}, [%r139+3072]; + xor.b32 %r140, %r132, 4; + add.s32 %r141, %r119, %r140; + ld.shared.v2.b16 {%rs9, %rs10}, [%r141]; + xor.b32 %r142, %r132, 36; + add.s32 %r143, %r119, %r142; + ld.shared.v2.b16 {%rs11, %rs12}, [%r143+1024]; + xor.b32 %r144, %r132, 68; + add.s32 %r145, %r119, %r144; + ld.shared.v2.b16 {%rs13, %rs14}, [%r145+2048]; + xor.b32 %r146, %r132, 100; + add.s32 %r147, %r119, %r146; + ld.shared.v2.b16 {%rs15, %rs16}, [%r147+3072]; + cvt.f32.bf16 %r148, %rs9; + cvt.f32.bf16 %r149, %rs11; + cvt.f32.bf16 %r150, %rs1; + cvt.f32.bf16 %r151, %rs3; + cvt.f32.bf16 %r152, %rs13; + cvt.f32.bf16 %r153, %rs15; + cvt.f32.bf16 %r154, %rs5; + cvt.f32.bf16 %r155, %rs7; + cvt.f32.bf16 %r156, %rs10; + cvt.f32.bf16 %r157, %rs12; + cvt.f32.bf16 %r158, %rs2; + cvt.f32.bf16 %r159, %rs4; + cvt.f32.bf16 %r160, %rs14; + cvt.f32.bf16 %r161, %rs16; + cvt.f32.bf16 %r162, %rs6; + cvt.f32.bf16 %r163, %rs8; + .loc 1 38 30 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:38:30 + mad.wide.s32 %rd5, %r85, 4, %rd28; + .loc 1 38 80 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:38:80 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r10, %r5; + mov.u32 %r11, %r5; + mov.u32 %r12, %r5; + mov.u32 %r13, %r5; + @%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd5 + 0 ], %rd6; + // end inline asm + // begin inline asm + mov.u64 %rd7, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd7, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r14, %r5; + mov.u32 %r15, %r5; + mov.u32 %r16, %r5; + mov.u32 %r17, %r5; + @%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd5 + 0 ], %rd7; + // end inline asm + // begin inline asm + mov.u64 %rd8, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd8, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r18, %r5; + mov.u32 %r19, %r5; + mov.u32 %r20, %r5; + mov.u32 %r21, %r5; + @%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd5 + 0 ], %rd8; + // end inline asm + // begin inline asm + mov.u64 %rd9, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd9, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r22, %r5; + mov.u32 %r23, %r5; + mov.u32 %r24, %r5; + mov.u32 %r25, %r5; + @%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r22, %r23, %r24, %r25 }, [ %rd5 + 0 ], %rd9; + // end inline asm + mov.b32 %r164, 0f43000000; + .loc 1 40 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:40:19 + div.full.f32 %r165, %r10, %r164; + div.full.f32 %r166, %r11, %r164; + div.full.f32 %r167, %r12, %r164; + div.full.f32 %r168, %r13, %r164; + div.full.f32 %r169, %r14, %r164; + div.full.f32 %r170, %r15, %r164; + div.full.f32 %r171, %r16, %r164; + div.full.f32 %r172, %r17, %r164; + div.full.f32 %r173, %r18, %r164; + div.full.f32 %r174, %r19, %r164; + div.full.f32 %r175, %r20, %r164; + div.full.f32 %r176, %r21, %r164; + div.full.f32 %r177, %r22, %r164; + div.full.f32 %r178, %r23, %r164; + div.full.f32 %r179, %r24, %r164; + div.full.f32 %r180, %r25, %r164; + .loc 1 42 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:42:19 + add.f32 %r181, %r165, 0f358637BD; + add.f32 %r182, %r166, 0f358637BD; + add.f32 %r183, %r167, 0f358637BD; + add.f32 %r184, %r168, 0f358637BD; + add.f32 %r185, %r169, 0f358637BD; + add.f32 %r186, %r170, 0f358637BD; + add.f32 %r187, %r171, 0f358637BD; + add.f32 %r188, %r172, 0f358637BD; + add.f32 %r189, %r173, 0f358637BD; + add.f32 %r190, %r174, 0f358637BD; + add.f32 %r191, %r175, 0f358637BD; + add.f32 %r192, %r176, 0f358637BD; + add.f32 %r193, %r177, 0f358637BD; + add.f32 %r194, %r178, 0f358637BD; + add.f32 %r195, %r179, 0f358637BD; + add.f32 %r196, %r180, 0f358637BD; + .loc 1 43 28 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:43:28 + rsqrt.approx.ftz.f32 %r197, %r181; + rsqrt.approx.ftz.f32 %r198, %r182; + rsqrt.approx.ftz.f32 %r199, %r183; + rsqrt.approx.ftz.f32 %r200, %r184; + rsqrt.approx.ftz.f32 %r201, %r185; + rsqrt.approx.ftz.f32 %r202, %r186; + rsqrt.approx.ftz.f32 %r203, %r187; + rsqrt.approx.ftz.f32 %r204, %r188; + rsqrt.approx.ftz.f32 %r205, %r189; + rsqrt.approx.ftz.f32 %r206, %r190; + rsqrt.approx.ftz.f32 %r207, %r191; + rsqrt.approx.ftz.f32 %r208, %r192; + rsqrt.approx.ftz.f32 %r209, %r193; + rsqrt.approx.ftz.f32 %r210, %r194; + rsqrt.approx.ftz.f32 %r211, %r195; + rsqrt.approx.ftz.f32 %r212, %r196; + .loc 1 44 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:44:19 + mul.f32 %r213, %r198, %r151; + mul.f32 %r214, %r197, %r150; + mul.f32 %r215, %r206, %r149; + mul.f32 %r216, %r205, %r148; + mul.f32 %r217, %r200, %r155; + mul.f32 %r218, %r199, %r154; + mul.f32 %r219, %r208, %r153; + mul.f32 %r220, %r207, %r152; + mul.f32 %r221, %r202, %r159; + mul.f32 %r222, %r201, %r158; + mul.f32 %r223, %r210, %r157; + mul.f32 %r224, %r209, %r156; + mul.f32 %r225, %r204, %r163; + mul.f32 %r226, %r203, %r162; + mul.f32 %r227, %r212, %r161; + mul.f32 %r228, %r211, %r160; + bar.sync 0; + shl.b32 %r229, %r79, 4; + and.b32 %r230, %r229, 4080; + add.s32 %r231, %r119, %r230; + st.shared.v4.b32 [%r231], {%r214, %r216, %r213, %r215}; + xor.b32 %r232, %r230, 64; + add.s32 %r233, %r119, %r232; + st.shared.v4.b32 [%r233+4096], {%r218, %r220, %r217, %r219}; + bar.sync 0; + shl.b32 %r234, %r79, 7; + and.b32 %r235, %r234, 3072; + shl.b32 %r236, %r110, 3; + shl.b32 %r237, %r127, 1; + and.b32 %r238, %r89, 4160; + xor.b32 %r239, %r238, %r237; + add.s32 %r240, %r119, %r235; + add.s32 %r241, %r240, %r236; + add.s32 %r242, %r241, %r239; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r243, %r244, %r245, %r246}, [%r242]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r247, %r248, %r249, %r250}, [%r242+512]; + bar.sync 0; + st.shared.v4.b32 [%r231], {%r222, %r224, %r221, %r223}; + st.shared.v4.b32 [%r233+4096], {%r226, %r228, %r225, %r227}; + bar.sync 0; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r251, %r252, %r253, %r254}, [%r242]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r255, %r256, %r257, %r258}, [%r242+512]; + .loc 1 45 31 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:31 + mul.wide.s32 %rd34, %r92, 2; + add.s64 %rd10, %rd29, %rd34; + .loc 1 45 71 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:71 + // begin inline asm + mov.u64 %rd11, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd11, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r26, %r5; + mov.u32 %r27, %r5; + mov.u32 %r28, %r5; + mov.u32 %r29, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd10 + 0 ], %rd11; + // end inline asm + // begin inline asm + mov.u64 %rd12, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd12, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r30, %r5; + mov.u32 %r31, %r5; + mov.u32 %r32, %r5; + mov.u32 %r33, %r5; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r30, %r31, %r32, %r33 }, [ %rd10 + 0 ], %rd12; + // end inline asm + .loc 1 54 45 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:45 + add.s32 %r259, %r104, -3145728; + add.s32 %r260, %r105, -3145728; + .loc 1 54 31 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:31 + mad.wide.s32 %rd13, %r259, 2, %rd30; + mad.wide.s32 %rd15, %r260, 2, %rd30; + .loc 1 54 83 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:83 + add.s32 %r261, %r78, -8192; + setp.lt.u32 %p13, %r261, 65536; + and.pred %p4, %p8, %p13; + add.s32 %r262, %r78, -8064; + setp.lt.u32 %p14, %r262, 65664; + and.pred %p5, %p8, %p14; + and.pred %p6, %p9, %p13; + .loc 1 54 67 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:67 + // begin inline asm + mov.u64 %rd14, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd14, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r34, %r5; + mov.u32 %r35, %r5; + mov.u32 %r36, %r5; + mov.u32 %r37, %r5; + @%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd13 + 0 ], %rd14; + // end inline asm + prmt.b32 %r263, %r34, %r36, 0x7632U; + prmt.b32 %r264, %r35, %r37, 0x7632U; + // begin inline asm + mov.u64 %rd16, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd16, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r38, %r5; + mov.u32 %r39, %r5; + mov.u32 %r40, %r5; + mov.u32 %r41, %r5; + @%p5 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r38, %r39, %r40, %r41 }, [ %rd15 + 0 ], %rd16; + // end inline asm + prmt.b32 %r265, %r38, %r40, 0x7632U; + prmt.b32 %r266, %r39, %r41, 0x7632U; + .loc 1 54 134 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134 + bar.sync 0; + prmt.b32 %r267, %r34, %r36, 0x5410U; + st.shared.b32 [%r120], %r267; + st.shared.b32 [%r120+256], %r263; + prmt.b32 %r268, %r35, %r37, 0x5410U; + st.shared.b32 [%r120+512], %r268; + st.shared.b32 [%r120+768], %r264; + prmt.b32 %r269, %r38, %r40, 0x5410U; + st.shared.b32 [%r120+128], %r269; + st.shared.b32 [%r120+384], %r265; + prmt.b32 %r270, %r39, %r41, 0x5410U; + st.shared.b32 [%r120+640], %r270; + st.shared.b32 [%r120+896], %r266; + bar.sync 0; + ld.shared.v2.b16 {%rs17, %rs18}, [%r133]; + ld.shared.v2.b16 {%rs19, %rs20}, [%r135+1024]; + ld.shared.v2.b16 {%rs21, %rs22}, [%r137+2048]; + ld.shared.v2.b16 {%rs23, %rs24}, [%r139+3072]; + ld.shared.v2.b16 {%rs25, %rs26}, [%r141]; + ld.shared.v2.b16 {%rs27, %rs28}, [%r143+1024]; + ld.shared.v2.b16 {%rs29, %rs30}, [%r145+2048]; + ld.shared.v2.b16 {%rs31, %rs32}, [%r147+3072]; + .loc 1 56 52 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:52 + add.s32 %r271, %r85, -8192; + .loc 1 56 31 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:31 + mad.wide.s32 %rd17, %r271, 4, %rd31; + .loc 1 56 90 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:90 + // begin inline asm + mov.u64 %rd18, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd18, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r42, %r5; + mov.u32 %r43, %r5; + mov.u32 %r44, %r5; + mov.u32 %r45, %r5; + @%p6 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd17 + 0 ], %rd18; + // end inline asm + // begin inline asm + mov.u64 %rd19, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd19, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r46, %r5; + mov.u32 %r47, %r5; + mov.u32 %r48, %r5; + mov.u32 %r49, %r5; + @%p6 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r46, %r47, %r48, %r49 }, [ %rd17 + 0 ], %rd19; + // end inline asm + // begin inline asm + mov.u64 %rd20, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd20, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r50, %r5; + mov.u32 %r51, %r5; + mov.u32 %r52, %r5; + mov.u32 %r53, %r5; + @%p6 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r50, %r51, %r52, %r53 }, [ %rd17 + 0 ], %rd20; + // end inline asm + // begin inline asm + mov.u64 %rd21, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd21, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r54, %r5; + mov.u32 %r55, %r5; + mov.u32 %r56, %r5; + mov.u32 %r57, %r5; + @%p6 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r54, %r55, %r56, %r57 }, [ %rd17 + 0 ], %rd21; + // end inline asm + .loc 1 58 21 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:58:21 + div.full.f32 %r272, %r42, %r164; + div.full.f32 %r273, %r43, %r164; + div.full.f32 %r274, %r44, %r164; + div.full.f32 %r275, %r45, %r164; + div.full.f32 %r276, %r46, %r164; + div.full.f32 %r277, %r47, %r164; + div.full.f32 %r278, %r48, %r164; + div.full.f32 %r279, %r49, %r164; + div.full.f32 %r280, %r50, %r164; + div.full.f32 %r281, %r51, %r164; + div.full.f32 %r282, %r52, %r164; + div.full.f32 %r283, %r53, %r164; + div.full.f32 %r284, %r54, %r164; + div.full.f32 %r285, %r55, %r164; + div.full.f32 %r286, %r56, %r164; + div.full.f32 %r287, %r57, %r164; + .loc 1 60 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:60:20 + add.f32 %r288, %r272, 0f358637BD; + add.f32 %r289, %r273, 0f358637BD; + add.f32 %r290, %r274, 0f358637BD; + add.f32 %r291, %r275, 0f358637BD; + add.f32 %r292, %r276, 0f358637BD; + add.f32 %r293, %r277, 0f358637BD; + add.f32 %r294, %r278, 0f358637BD; + add.f32 %r295, %r279, 0f358637BD; + add.f32 %r296, %r280, 0f358637BD; + add.f32 %r297, %r281, 0f358637BD; + add.f32 %r298, %r282, 0f358637BD; + add.f32 %r299, %r283, 0f358637BD; + add.f32 %r300, %r284, 0f358637BD; + add.f32 %r301, %r285, 0f358637BD; + add.f32 %r302, %r286, 0f358637BD; + add.f32 %r303, %r287, 0f358637BD; + .loc 1 61 28 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:61:28 + rsqrt.approx.ftz.f32 %r304, %r288; + rsqrt.approx.ftz.f32 %r305, %r289; + rsqrt.approx.ftz.f32 %r306, %r290; + rsqrt.approx.ftz.f32 %r307, %r291; + rsqrt.approx.ftz.f32 %r308, %r292; + rsqrt.approx.ftz.f32 %r309, %r293; + rsqrt.approx.ftz.f32 %r310, %r294; + rsqrt.approx.ftz.f32 %r311, %r295; + rsqrt.approx.ftz.f32 %r312, %r296; + rsqrt.approx.ftz.f32 %r313, %r297; + rsqrt.approx.ftz.f32 %r314, %r298; + rsqrt.approx.ftz.f32 %r315, %r299; + rsqrt.approx.ftz.f32 %r316, %r300; + rsqrt.approx.ftz.f32 %r317, %r301; + rsqrt.approx.ftz.f32 %r318, %r302; + rsqrt.approx.ftz.f32 %r319, %r303; + .loc 1 23 21 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:23:21 + setp.lt.s32 %p15, %r83, 73728; + .loc 1 35 18 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:35:18 + setp.lt.s32 %p16, %r84, 8192; + .loc 1 54 134 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134 + cvt.f32.bf16 %r320, %rs25; + cvt.f32.bf16 %r321, %rs27; + cvt.f32.bf16 %r322, %rs17; + cvt.f32.bf16 %r323, %rs19; + .loc 1 62 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:62:20 + mul.f32 %r324, %r305, %r323; + mul.f32 %r325, %r304, %r322; + mul.f32 %r326, %r313, %r321; + mul.f32 %r327, %r312, %r320; + .loc 1 54 134 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134 + cvt.f32.bf16 %r328, %rs29; + cvt.f32.bf16 %r329, %rs31; + cvt.f32.bf16 %r330, %rs21; + cvt.f32.bf16 %r331, %rs23; + .loc 1 62 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:62:20 + mul.f32 %r332, %r307, %r331; + mul.f32 %r333, %r306, %r330; + mul.f32 %r334, %r315, %r329; + mul.f32 %r335, %r314, %r328; + .loc 1 54 134 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134 + cvt.f32.bf16 %r336, %rs26; + cvt.f32.bf16 %r337, %rs28; + cvt.f32.bf16 %r338, %rs18; + cvt.f32.bf16 %r339, %rs20; + .loc 1 62 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:62:20 + mul.f32 %r340, %r309, %r339; + mul.f32 %r341, %r308, %r338; + mul.f32 %r342, %r317, %r337; + mul.f32 %r343, %r316, %r336; + .loc 1 54 134 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134 + cvt.f32.bf16 %r344, %rs30; + cvt.f32.bf16 %r345, %rs32; + cvt.f32.bf16 %r346, %rs22; + cvt.f32.bf16 %r347, %rs24; + .loc 1 62 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:62:20 + mul.f32 %r348, %r311, %r347; + mul.f32 %r349, %r310, %r346; + mul.f32 %r350, %r319, %r345; + mul.f32 %r351, %r318, %r344; + bar.sync 0; + st.shared.v4.b32 [%r231], {%r325, %r327, %r324, %r326}; + st.shared.v4.b32 [%r233+4096], {%r333, %r335, %r332, %r334}; + bar.sync 0; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r352, %r353, %r354, %r355}, [%r242]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r356, %r357, %r358, %r359}, [%r242+512]; + bar.sync 0; + st.shared.v4.b32 [%r231], {%r341, %r343, %r340, %r342}; + st.shared.v4.b32 [%r233+4096], {%r349, %r351, %r348, %r350}; + bar.sync 0; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r360, %r361, %r362, %r363}, [%r242]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r364, %r365, %r366, %r367}, [%r242+512]; + .loc 1 63 31 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:31 + add.s64 %rd22, %rd32, %rd34; + .loc 1 63 71 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:71 + // begin inline asm + mov.u64 %rd23, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd23, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r58, %r5; + mov.u32 %r59, %r5; + mov.u32 %r60, %r5; + mov.u32 %r61, %r5; + @%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r58, %r59, %r60, %r61 }, [ %rd22 + 0 ], %rd23; + // end inline asm + // begin inline asm + mov.u64 %rd24, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd24, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r62, %r5; + mov.u32 %r63, %r5; + mov.u32 %r64, %r5; + mov.u32 %r65, %r5; + @%p5 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r62, %r63, %r64, %r65 }, [ %rd22 + 0 ], %rd24; + // end inline asm + .loc 1 70 34 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:34 + shl.b32 %r368, %r83, 7; + shl.b32 %r369, %r84, 7; + .loc 1 70 30 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:30 + add.s32 %r370, %r368, %r92; + add.s32 %r371, %r369, %r92; + .loc 1 70 25 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:25 + mad.wide.s32 %rd25, %r370, 2, %rd33; + mad.wide.s32 %rd26, %r371, 2, %rd33; + .loc 1 70 54 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:54 + and.pred %p7, %p8, %p15; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs33, %rs34}, %r26; + cvt.f32.bf16 %r372, %rs33; + cvt.f32.bf16 %r373, %rs34; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r374, %r244, %r373; + mul.f32 %r375, %r243, %r372; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs35, %rs36}, %r58; + cvt.f32.bf16 %r376, %rs35; + cvt.f32.bf16 %r377, %rs36; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r378, %r353, %r377; + mul.f32 %r379, %r352, %r376; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r380, %r375, %r379, %p10; + selp.f32 %r381, %r374, %r378, %p10; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r66, %r381, %r380; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs37, %rs38}, %r27; + cvt.f32.bf16 %r382, %rs37; + cvt.f32.bf16 %r383, %rs38; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r384, %r246, %r383; + mul.f32 %r385, %r245, %r382; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs39, %rs40}, %r59; + cvt.f32.bf16 %r386, %rs39; + cvt.f32.bf16 %r387, %rs40; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r388, %r355, %r387; + mul.f32 %r389, %r354, %r386; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r390, %r385, %r389, %p10; + selp.f32 %r391, %r384, %r388, %p10; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r67, %r391, %r390; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs41, %rs42}, %r28; + cvt.f32.bf16 %r392, %rs41; + cvt.f32.bf16 %r393, %rs42; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r394, %r252, %r393; + mul.f32 %r395, %r251, %r392; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs43, %rs44}, %r60; + cvt.f32.bf16 %r396, %rs43; + cvt.f32.bf16 %r397, %rs44; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r398, %r361, %r397; + mul.f32 %r399, %r360, %r396; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r400, %r395, %r399, %p10; + selp.f32 %r401, %r394, %r398, %p10; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r68, %r401, %r400; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs45, %rs46}, %r29; + cvt.f32.bf16 %r402, %rs45; + cvt.f32.bf16 %r403, %rs46; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r404, %r254, %r403; + mul.f32 %r405, %r253, %r402; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs47, %rs48}, %r61; + cvt.f32.bf16 %r406, %rs47; + cvt.f32.bf16 %r407, %rs48; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r408, %r363, %r407; + mul.f32 %r409, %r362, %r406; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r410, %r405, %r409, %p10; + selp.f32 %r411, %r404, %r408, %p10; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r69, %r411, %r410; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs49, %rs50}, %r30; + cvt.f32.bf16 %r412, %rs49; + cvt.f32.bf16 %r413, %rs50; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r414, %r248, %r413; + mul.f32 %r415, %r247, %r412; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs51, %rs52}, %r62; + cvt.f32.bf16 %r416, %rs51; + cvt.f32.bf16 %r417, %rs52; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r418, %r357, %r417; + mul.f32 %r419, %r356, %r416; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r420, %r415, %r419, %p16; + selp.f32 %r421, %r414, %r418, %p16; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r70, %r421, %r420; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs53, %rs54}, %r31; + cvt.f32.bf16 %r422, %rs53; + cvt.f32.bf16 %r423, %rs54; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r424, %r250, %r423; + mul.f32 %r425, %r249, %r422; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs55, %rs56}, %r63; + cvt.f32.bf16 %r426, %rs55; + cvt.f32.bf16 %r427, %rs56; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r428, %r359, %r427; + mul.f32 %r429, %r358, %r426; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r430, %r425, %r429, %p16; + selp.f32 %r431, %r424, %r428, %p16; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r71, %r431, %r430; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs57, %rs58}, %r32; + cvt.f32.bf16 %r432, %rs57; + cvt.f32.bf16 %r433, %rs58; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r434, %r256, %r433; + mul.f32 %r435, %r255, %r432; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs59, %rs60}, %r64; + cvt.f32.bf16 %r436, %rs59; + cvt.f32.bf16 %r437, %rs60; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r438, %r365, %r437; + mul.f32 %r439, %r364, %r436; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r440, %r435, %r439, %p16; + selp.f32 %r441, %r434, %r438, %p16; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r72, %r441, %r440; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs61, %rs62}, %r33; + cvt.f32.bf16 %r442, %rs61; + cvt.f32.bf16 %r443, %rs62; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r444, %r258, %r443; + mul.f32 %r445, %r257, %r442; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs63, %rs64}, %r65; + cvt.f32.bf16 %r446, %rs63; + cvt.f32.bf16 %r447, %rs64; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r448, %r367, %r447; + mul.f32 %r449, %r366, %r446; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r450, %r445, %r449, %p16; + selp.f32 %r451, %r444, %r448, %p16; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r73, %r451, %r450; + // begin inline asm + @%p7 st.global.v4.b32 [ %rd25 + 0 ], { %r66, %r67, %r68, %r69 }; + // end inline asm + // begin inline asm + @%p7 st.global.v4.b32 [ %rd26 + 0 ], { %r70, %r71, %r72, %r73 }; + // end inline asm + .loc 1 70 4 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 50 +.b8 104 +.b8 105 +.b8 106 +.b8 51 +.b8 104 +.b8 109 +.b8 108 +.b8 111 +.b8 117 +.b8 109 +.b8 120 +.b8 100 +.b8 109 +.b8 104 +.b8 117 +.b8 101 +.b8 122 +.b8 115 +.b8 121 +.b8 104 +.b8 107 +.b8 109 +.b8 110 +.b8 113 +.b8 103 +.b8 110 +.b8 102 +.b8 97 +.b8 53 +.b8 105 +.b8 118 +.b8 114 +.b8 101 +.b8 50 +.b8 55 +.b8 117 +.b8 111 +.b8 115 +.b8 121 +.b8 109 +.b8 97 +.b8 109 +.b8 51 +.b8 100 +.b8 114 +.b8 55 +.b8 97 +.b8 53 +.b8 120 +.b8 98 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 50 +.b8 104 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.source b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.source new file mode 100644 index 0000000000000000000000000000000000000000..86e111c99a00db61380325e65ed8aadb64550e9d --- /dev/null +++ b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.source @@ -0,0 +1,415 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0) +#loc99 = loc("in_ptr0"(#loc)) +#loc100 = loc("in_ptr1"(#loc)) +#loc101 = loc("in_ptr2"(#loc)) +#loc102 = loc("in_ptr3"(#loc)) +#loc103 = loc("in_ptr4"(#loc)) +#loc104 = loc("in_ptr5"(#loc)) +#loc105 = loc("out_ptr0"(#loc)) +#loc106 = loc("ynumel"(#loc)) +#loc107 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %ynumel_0 = arith.constant 73728 : i32 loc(#loc108) + %xnumel_1 = arith.constant 128 : i32 loc(#loc109) + %yoffset = tt.get_program_id y : i32 loc(#loc110) + %yoffset_2 = tt.get_program_id z : i32 loc(#loc111) + %yoffset_3 = tt.get_num_programs y : i32 loc(#loc112) + %yoffset_4 = arith.muli %yoffset_2, %yoffset_3 : i32 loc(#loc113) + %yoffset_5 = arith.addi %yoffset, %yoffset_4 : i32 loc(#loc114) + %yoffset_6 = arith.constant 256 : i32 loc(#loc115) + %yoffset_7 = arith.constant 256 : i32 loc(#loc115) + %yoffset_8 = arith.muli %yoffset_5, %yoffset_7 : i32 loc(#loc115) + %yindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc116) + %yindex_9 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<256xi32> -> tensor<256x1xi32> loc(#loc117) + %yindex_10 = tt.splat %yoffset_8 : i32 -> tensor<256x1xi32> loc(#loc118) + %yindex_11 = arith.addi %yindex_10, %yindex_9 : tensor<256x1xi32> loc(#loc118) + %ymask = arith.constant dense<73728> : tensor<256x1xi32> loc(#loc119) + %ymask_12 = arith.cmpi slt, %yindex_11, %ymask : tensor<256x1xi32> loc(#loc119) + %xoffset = tt.get_program_id x : i32 loc(#loc120) + %xoffset_13 = arith.constant 16 : i32 loc(#loc121) + %xoffset_14 = arith.constant 16 : i32 loc(#loc121) + %xoffset_15 = arith.muli %xoffset, %xoffset_14 : i32 loc(#loc121) + %xindex = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc122) + %xindex_16 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc123) + %xindex_17 = tt.splat %xoffset_15 : i32 -> tensor<1x16xi32> loc(#loc124) + %xindex_18 = arith.addi %xindex_17, %xindex_16 : tensor<1x16xi32> loc(#loc124) + %xmask = arith.constant dense<128> : tensor<1x16xi32> loc(#loc125) + %xmask_19 = arith.cmpi slt, %xindex_18, %xmask : tensor<1x16xi32> loc(#loc125) + %y1 = arith.constant 32 : i32 loc(#loc126) + %y1_20 = arith.constant 32 : i32 loc(#loc126) + %y1_21 = arith.constant dense<32> : tensor<256x1xi32> loc(#loc126) + %y1_22 = arith.divsi %yindex_11, %y1_21 : tensor<256x1xi32> loc(#loc126) + %y0 = arith.constant 32 : i32 loc(#loc127) + %y0_23 = arith.constant 32 : i32 loc(#loc127) + %y0_24 = arith.constant dense<32> : tensor<256x1xi32> loc(#loc127) + %y0_25 = arith.remsi %yindex_11, %y0_24 : tensor<256x1xi32> loc(#loc127) + %tmp1 = arith.constant 0 : i64 loc(#loc128) + %tmp1_26 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc128) + %tmp2 = arith.extsi %y1_22 : tensor<256x1xi32> to tensor<256x1xi64> loc(#loc129) + %tmp2_27 = arith.constant dense<0> : tensor<256x1xi64> loc(#loc129) + %tmp2_28 = arith.cmpi sge, %tmp2, %tmp2_27 : tensor<256x1xi64> loc(#loc129) + %tmp3 = arith.constant 256 : i64 loc(#loc130) + %tmp3_29 = arith.constant dense<256> : tensor<1x1xi64> loc(#loc130) + %tmp4 = arith.extsi %y1_22 : tensor<256x1xi32> to tensor<256x1xi64> loc(#loc131) + %tmp4_30 = arith.constant dense<256> : tensor<256x1xi64> loc(#loc131) + %tmp4_31 = arith.cmpi slt, %tmp4, %tmp4_30 : tensor<256x1xi64> loc(#loc131) + %tmp5 = arith.constant 128 : i32 loc(#loc132) + %tmp5_32 = arith.constant 128 : i32 loc(#loc132) + %tmp5_33 = arith.constant dense<128> : tensor<256x1xi32> loc(#loc132) + %tmp5_34 = arith.muli %tmp5_33, %y0_25 : tensor<256x1xi32> loc(#loc132) + %tmp5_35 = tt.broadcast %xindex_18 : tensor<1x16xi32> -> tensor<256x16xi32> loc(#loc133) + %tmp5_36 = tt.broadcast %tmp5_34 : tensor<256x1xi32> -> tensor<256x16xi32> loc(#loc133) + %tmp5_37 = arith.addi %tmp5_35, %tmp5_36 : tensor<256x16xi32> loc(#loc133) + %tmp5_38 = arith.constant 12288 : i32 loc(#loc134) + %tmp5_39 = arith.constant 12288 : i32 loc(#loc134) + %tmp5_40 = arith.constant dense<12288> : tensor<256x1xi32> loc(#loc134) + %tmp5_41 = arith.muli %tmp5_40, %y1_22 : tensor<256x1xi32> loc(#loc134) + %tmp5_42 = tt.broadcast %tmp5_41 : tensor<256x1xi32> -> tensor<256x16xi32> loc(#loc135) + %tmp5_43 = arith.addi %tmp5_37, %tmp5_42 : tensor<256x16xi32> loc(#loc135) + %tmp5_44 = tt.splat %in_ptr0 : !tt.ptr -> tensor<256x16x!tt.ptr> loc(#loc136) + %tmp5_45 = tt.addptr %tmp5_44, %tmp5_43 : tensor<256x16x!tt.ptr>, tensor<256x16xi32> loc(#loc136) + %tmp5_46 = tt.broadcast %tmp4_31 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc137) + %tmp5_47 = tt.broadcast %xmask_19 : tensor<1x16xi1> -> tensor<256x16xi1> loc(#loc137) + %tmp5_48 = arith.andi %tmp5_46, %tmp5_47 : tensor<256x16xi1> loc(#loc137) + %tmp5_49 = tt.broadcast %ymask_12 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc138) + %tmp5_50 = arith.andi %tmp5_48, %tmp5_49 : tensor<256x16xi1> loc(#loc138) + %tmp5_51 = arith.constant 0.000000e+00 : f32 loc(#loc139) + %tmp5_52 = arith.constant dense<0.000000e+00> : tensor<256x16xf32> loc(#loc139) + %tmp5_53 = arith.truncf %tmp5_52 : tensor<256x16xf32> to tensor<256x16xbf16> loc(#loc139) + %tmp5_54 = tt.load %tmp5_45, %tmp5_50, %tmp5_53 evictionPolicy = evict_last : tensor<256x16x!tt.ptr> loc(#loc139) + %tmp5_55 = arith.extf %tmp5_54 : tensor<256x16xbf16> to tensor<256x16xf32> loc(#loc140) + %tmp7 = arith.constant 32 : i32 loc(#loc141) + %tmp7_56 = arith.constant 32 : i32 loc(#loc141) + %tmp7_57 = arith.constant dense<32> : tensor<256x1xi32> loc(#loc141) + %tmp7_58 = arith.muli %tmp7_57, %y1_22 : tensor<256x1xi32> loc(#loc141) + %tmp7_59 = arith.addi %y0_25, %tmp7_58 : tensor<256x1xi32> loc(#loc142) + %tmp7_60 = tt.broadcast %tmp7_59 : tensor<256x1xi32> -> tensor<256x16xi32> loc(#loc143) + %tmp7_61 = tt.splat %in_ptr1 : !tt.ptr -> tensor<256x16x!tt.ptr> loc(#loc144) + %tmp7_62 = tt.addptr %tmp7_61, %tmp7_60 : tensor<256x16x!tt.ptr>, tensor<256x16xi32> loc(#loc144) + %tmp7_63 = tt.broadcast %tmp4_31 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc145) + %tmp7_64 = tt.broadcast %xmask_19 : tensor<1x16xi1> -> tensor<256x16xi1> loc(#loc145) + %tmp7_65 = arith.andi %tmp7_63, %tmp7_64 : tensor<256x16xi1> loc(#loc145) + %tmp7_66 = tt.broadcast %ymask_12 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc146) + %tmp7_67 = arith.andi %tmp7_65, %tmp7_66 : tensor<256x16xi1> loc(#loc146) + %tmp7_68 = arith.constant 0.000000e+00 : f32 loc(#loc147) + %tmp7_69 = arith.constant dense<0.000000e+00> : tensor<256x16xf32> loc(#loc147) + %tmp7_70 = tt.load %tmp7_62, %tmp7_67, %tmp7_69 evictionPolicy = evict_last : tensor<256x16x!tt.ptr> loc(#loc147) + %tmp8 = arith.constant 1.280000e+02 : f32 loc(#loc148) + %tmp9 = arith.constant dense<1.280000e+02> : tensor<256x16xf32> loc(#loc149) + %tmp9_71 = arith.divf %tmp7_70, %tmp9 : tensor<256x16xf32> loc(#loc149) + %tmp10 = arith.constant 9.99999997E-7 : f32 loc(#loc150) + %tmp11 = arith.constant dense<9.99999997E-7> : tensor<256x16xf32> loc(#loc151) + %tmp11_72 = arith.addf %tmp9_71, %tmp11 : tensor<256x16xf32> loc(#loc151) + %tmp12 = tt.extern_elementwise %tmp11_72 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<256x16xf32>) -> tensor<256x16xf32> loc(#loc152) + %tmp13 = arith.mulf %tmp5_55, %tmp12 : tensor<256x16xf32> loc(#loc153) + %tmp14 = tt.broadcast %xindex_18 : tensor<1x16xi32> -> tensor<256x16xi32> loc(#loc154) + %tmp14_73 = tt.splat %in_ptr2 : !tt.ptr -> tensor<256x16x!tt.ptr> loc(#loc155) + %tmp14_74 = tt.addptr %tmp14_73, %tmp14 : tensor<256x16x!tt.ptr>, tensor<256x16xi32> loc(#loc155) + %tmp14_75 = tt.broadcast %tmp4_31 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc156) + %tmp14_76 = tt.broadcast %xmask_19 : tensor<1x16xi1> -> tensor<256x16xi1> loc(#loc156) + %tmp14_77 = arith.andi %tmp14_75, %tmp14_76 : tensor<256x16xi1> loc(#loc156) + %tmp14_78 = tt.broadcast %ymask_12 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc157) + %tmp14_79 = arith.andi %tmp14_77, %tmp14_78 : tensor<256x16xi1> loc(#loc157) + %tmp14_80 = arith.constant 0.000000e+00 : f32 loc(#loc158) + %tmp14_81 = arith.constant dense<0.000000e+00> : tensor<256x16xf32> loc(#loc158) + %tmp14_82 = arith.truncf %tmp14_81 : tensor<256x16xf32> to tensor<256x16xbf16> loc(#loc158) + %tmp14_83 = tt.load %tmp14_74, %tmp14_79, %tmp14_82 evictionPolicy = evict_last : tensor<256x16x!tt.ptr> loc(#loc158) + %tmp14_84 = arith.extf %tmp14_83 : tensor<256x16xbf16> to tensor<256x16xf32> loc(#loc159) + %tmp16 = arith.mulf %tmp13, %tmp14_84 : tensor<256x16xf32> loc(#loc160) + %tmp18 = arith.constant 0.000000e+00 : f32 loc(#loc161) + %tmp18_85 = arith.constant dense<0.000000e+00> : tensor<256x16xf32> loc(#loc161) + %tmp19 = tt.broadcast %tmp4_31 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc162) + %tmp19_86 = arith.select %tmp19, %tmp16, %tmp18_85 : tensor<256x16xi1>, tensor<256x16xf32> loc(#loc162) + %tmp20 = arith.extsi %y1_22 : tensor<256x1xi32> to tensor<256x1xi64> loc(#loc163) + %tmp20_87 = arith.constant dense<256> : tensor<256x1xi64> loc(#loc163) + %tmp20_88 = arith.cmpi sge, %tmp20, %tmp20_87 : tensor<256x1xi64> loc(#loc163) + %tmp21 = arith.constant 2304 : i64 loc(#loc164) + %tmp21_89 = arith.constant dense<2304> : tensor<1x1xi64> loc(#loc164) + %tmp22 = arith.extsi %y1_22 : tensor<256x1xi32> to tensor<256x1xi64> loc(#loc165) + %tmp22_90 = arith.constant dense<2304> : tensor<256x1xi64> loc(#loc165) + %tmp22_91 = arith.cmpi slt, %tmp22, %tmp22_90 : tensor<256x1xi64> loc(#loc165) + %tmp23 = arith.constant 128 : i32 loc(#loc166) + %tmp23_92 = arith.constant 128 : i32 loc(#loc166) + %tmp23_93 = arith.constant dense<128> : tensor<256x1xi32> loc(#loc166) + %tmp23_94 = arith.muli %tmp23_93, %y0_25 : tensor<256x1xi32> loc(#loc166) + %tmp23_95 = tt.broadcast %xindex_18 : tensor<1x16xi32> -> tensor<256x16xi32> loc(#loc167) + %tmp23_96 = tt.broadcast %tmp23_94 : tensor<256x1xi32> -> tensor<256x16xi32> loc(#loc167) + %tmp23_97 = arith.addi %tmp23_95, %tmp23_96 : tensor<256x16xi32> loc(#loc167) + %tmp23_98 = arith.constant -256 : i32 loc(#loc168) + %tmp23_99 = arith.constant -256 : i32 loc(#loc168) + %tmp23_100 = arith.constant dense<-256> : tensor<256x1xi32> loc(#loc168) + %tmp23_101 = arith.addi %tmp23_100, %y1_22 : tensor<256x1xi32> loc(#loc168) + %tmp23_102 = arith.constant 12288 : i32 loc(#loc169) + %tmp23_103 = arith.constant 12288 : i32 loc(#loc169) + %tmp23_104 = arith.constant dense<12288> : tensor<256x1xi32> loc(#loc169) + %tmp23_105 = arith.muli %tmp23_104, %tmp23_101 : tensor<256x1xi32> loc(#loc169) + %tmp23_106 = tt.broadcast %tmp23_105 : tensor<256x1xi32> -> tensor<256x16xi32> loc(#loc170) + %tmp23_107 = arith.addi %tmp23_97, %tmp23_106 : tensor<256x16xi32> loc(#loc170) + %tmp23_108 = tt.splat %in_ptr3 : !tt.ptr -> tensor<256x16x!tt.ptr> loc(#loc171) + %tmp23_109 = tt.addptr %tmp23_108, %tmp23_107 : tensor<256x16x!tt.ptr>, tensor<256x16xi32> loc(#loc171) + %tmp23_110 = tt.broadcast %tmp20_88 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc172) + %tmp23_111 = tt.broadcast %xmask_19 : tensor<1x16xi1> -> tensor<256x16xi1> loc(#loc172) + %tmp23_112 = arith.andi %tmp23_110, %tmp23_111 : tensor<256x16xi1> loc(#loc172) + %tmp23_113 = tt.broadcast %ymask_12 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc173) + %tmp23_114 = arith.andi %tmp23_112, %tmp23_113 : tensor<256x16xi1> loc(#loc173) + %tmp23_115 = arith.constant 0.000000e+00 : f32 loc(#loc174) + %tmp23_116 = arith.constant dense<0.000000e+00> : tensor<256x16xf32> loc(#loc174) + %tmp23_117 = arith.truncf %tmp23_116 : tensor<256x16xf32> to tensor<256x16xbf16> loc(#loc174) + %tmp23_118 = tt.load %tmp23_109, %tmp23_114, %tmp23_117 evictionPolicy = evict_last : tensor<256x16x!tt.ptr> loc(#loc174) + %tmp23_119 = arith.extf %tmp23_118 : tensor<256x16xbf16> to tensor<256x16xf32> loc(#loc175) + %tmp25 = arith.constant -256 : i32 loc(#loc176) + %tmp25_120 = arith.constant -256 : i32 loc(#loc176) + %tmp25_121 = arith.constant dense<-256> : tensor<256x1xi32> loc(#loc176) + %tmp25_122 = arith.addi %tmp25_121, %y1_22 : tensor<256x1xi32> loc(#loc176) + %tmp25_123 = arith.constant 32 : i32 loc(#loc177) + %tmp25_124 = arith.constant 32 : i32 loc(#loc177) + %tmp25_125 = arith.constant dense<32> : tensor<256x1xi32> loc(#loc177) + %tmp25_126 = arith.muli %tmp25_125, %tmp25_122 : tensor<256x1xi32> loc(#loc177) + %tmp25_127 = arith.addi %y0_25, %tmp25_126 : tensor<256x1xi32> loc(#loc178) + %tmp25_128 = tt.broadcast %tmp25_127 : tensor<256x1xi32> -> tensor<256x16xi32> loc(#loc179) + %tmp25_129 = tt.splat %in_ptr4 : !tt.ptr -> tensor<256x16x!tt.ptr> loc(#loc180) + %tmp25_130 = tt.addptr %tmp25_129, %tmp25_128 : tensor<256x16x!tt.ptr>, tensor<256x16xi32> loc(#loc180) + %tmp25_131 = tt.broadcast %tmp20_88 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc181) + %tmp25_132 = tt.broadcast %xmask_19 : tensor<1x16xi1> -> tensor<256x16xi1> loc(#loc181) + %tmp25_133 = arith.andi %tmp25_131, %tmp25_132 : tensor<256x16xi1> loc(#loc181) + %tmp25_134 = tt.broadcast %ymask_12 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc182) + %tmp25_135 = arith.andi %tmp25_133, %tmp25_134 : tensor<256x16xi1> loc(#loc182) + %tmp25_136 = arith.constant 0.000000e+00 : f32 loc(#loc183) + %tmp25_137 = arith.constant dense<0.000000e+00> : tensor<256x16xf32> loc(#loc183) + %tmp25_138 = tt.load %tmp25_130, %tmp25_135, %tmp25_137 evictionPolicy = evict_last : tensor<256x16x!tt.ptr> loc(#loc183) + %tmp26 = arith.constant 1.280000e+02 : f32 loc(#loc184) + %tmp27 = arith.constant dense<1.280000e+02> : tensor<256x16xf32> loc(#loc185) + %tmp27_139 = arith.divf %tmp25_138, %tmp27 : tensor<256x16xf32> loc(#loc185) + %tmp28 = arith.constant 9.99999997E-7 : f32 loc(#loc186) + %tmp29 = arith.constant dense<9.99999997E-7> : tensor<256x16xf32> loc(#loc187) + %tmp29_140 = arith.addf %tmp27_139, %tmp29 : tensor<256x16xf32> loc(#loc187) + %tmp30 = tt.extern_elementwise %tmp29_140 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<256x16xf32>) -> tensor<256x16xf32> loc(#loc188) + %tmp31 = arith.mulf %tmp23_119, %tmp30 : tensor<256x16xf32> loc(#loc189) + %tmp32 = tt.broadcast %xindex_18 : tensor<1x16xi32> -> tensor<256x16xi32> loc(#loc190) + %tmp32_141 = tt.splat %in_ptr5 : !tt.ptr -> tensor<256x16x!tt.ptr> loc(#loc191) + %tmp32_142 = tt.addptr %tmp32_141, %tmp32 : tensor<256x16x!tt.ptr>, tensor<256x16xi32> loc(#loc191) + %tmp32_143 = tt.broadcast %tmp20_88 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc192) + %tmp32_144 = tt.broadcast %xmask_19 : tensor<1x16xi1> -> tensor<256x16xi1> loc(#loc192) + %tmp32_145 = arith.andi %tmp32_143, %tmp32_144 : tensor<256x16xi1> loc(#loc192) + %tmp32_146 = tt.broadcast %ymask_12 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc193) + %tmp32_147 = arith.andi %tmp32_145, %tmp32_146 : tensor<256x16xi1> loc(#loc193) + %tmp32_148 = arith.constant 0.000000e+00 : f32 loc(#loc194) + %tmp32_149 = arith.constant dense<0.000000e+00> : tensor<256x16xf32> loc(#loc194) + %tmp32_150 = arith.truncf %tmp32_149 : tensor<256x16xf32> to tensor<256x16xbf16> loc(#loc194) + %tmp32_151 = tt.load %tmp32_142, %tmp32_147, %tmp32_150 evictionPolicy = evict_last : tensor<256x16x!tt.ptr> loc(#loc194) + %tmp32_152 = arith.extf %tmp32_151 : tensor<256x16xbf16> to tensor<256x16xf32> loc(#loc195) + %tmp34 = arith.mulf %tmp31, %tmp32_152 : tensor<256x16xf32> loc(#loc196) + %tmp36 = arith.constant 0.000000e+00 : f32 loc(#loc197) + %tmp36_153 = arith.constant dense<0.000000e+00> : tensor<256x16xf32> loc(#loc197) + %tmp37 = tt.broadcast %tmp20_88 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc198) + %tmp37_154 = arith.select %tmp37, %tmp34, %tmp36_153 : tensor<256x16xi1>, tensor<256x16xf32> loc(#loc198) + %tmp38 = tt.broadcast %tmp4_31 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc199) + %tmp38_155 = arith.select %tmp38, %tmp19_86, %tmp37_154 : tensor<256x16xi1>, tensor<256x16xf32> loc(#loc199) + %c128_i32 = arith.constant 128 : i32 loc(#loc93) + %c128_i32_156 = arith.constant 128 : i32 loc(#loc93) + %cst = arith.constant dense<128> : tensor<256x1xi32> loc(#loc93) + %0 = arith.muli %cst, %yindex_11 : tensor<256x1xi32> loc(#loc93) + %1 = tt.broadcast %xindex_18 : tensor<1x16xi32> -> tensor<256x16xi32> loc(#loc94) + %2 = tt.broadcast %0 : tensor<256x1xi32> -> tensor<256x16xi32> loc(#loc94) + %3 = arith.addi %1, %2 : tensor<256x16xi32> loc(#loc94) + %4 = tt.splat %out_ptr0 : !tt.ptr -> tensor<256x16x!tt.ptr> loc(#loc95) + %5 = tt.addptr %4, %3 : tensor<256x16x!tt.ptr>, tensor<256x16xi32> loc(#loc95) + %6 = tt.broadcast %xmask_19 : tensor<1x16xi1> -> tensor<256x16xi1> loc(#loc96) + %7 = tt.broadcast %ymask_12 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc96) + %8 = arith.andi %6, %7 : tensor<256x16xi1> loc(#loc96) + %9 = arith.truncf %tmp38_155 : tensor<256x16xf32> to tensor<256x16xbf16> loc(#loc97) + tt.store %5, %9, %8 : tensor<256x16x!tt.ptr> loc(#loc97) + tt.return loc(#loc98) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":20:13) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:36) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:36) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":32:30) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":33:19) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":34:32) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:60) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:87) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:95) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":39:11) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":41:12) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:51) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:78) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:86) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":49:38) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":52:34) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":53:19) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:40) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:36) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:65) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:70) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:98) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:106) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":57:12) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":59:12) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:51) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:79) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:87) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":67:38) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4) +#loc108 = loc("ynumel"(#loc1)) +#loc109 = loc("xnumel"(#loc2)) +#loc110 = loc("yoffset"(#loc3)) +#loc111 = loc("yoffset"(#loc4)) +#loc112 = loc("yoffset"(#loc5)) +#loc113 = loc("yoffset"(#loc6)) +#loc114 = loc("yoffset"(#loc7)) +#loc115 = loc("yoffset"(#loc8)) +#loc116 = loc("yindex"(#loc9)) +#loc117 = loc("yindex"(#loc10)) +#loc118 = loc("yindex"(#loc11)) +#loc119 = loc("ymask"(#loc12)) +#loc120 = loc("xoffset"(#loc13)) +#loc121 = loc("xoffset"(#loc14)) +#loc122 = loc("xindex"(#loc15)) +#loc123 = loc("xindex"(#loc16)) +#loc124 = loc("xindex"(#loc17)) +#loc125 = loc("xmask"(#loc18)) +#loc126 = loc("y1"(#loc19)) +#loc127 = loc("y0"(#loc20)) +#loc128 = loc("tmp1"(#loc21)) +#loc129 = loc("tmp2"(#loc22)) +#loc130 = loc("tmp3"(#loc23)) +#loc131 = loc("tmp4"(#loc24)) +#loc132 = loc("tmp5"(#loc25)) +#loc133 = loc("tmp5"(#loc26)) +#loc134 = loc("tmp5"(#loc27)) +#loc135 = loc("tmp5"(#loc28)) +#loc136 = loc("tmp5"(#loc29)) +#loc137 = loc("tmp5"(#loc30)) +#loc138 = loc("tmp5"(#loc31)) +#loc139 = loc("tmp5"(#loc32)) +#loc140 = loc("tmp5"(#loc33)) +#loc141 = loc("tmp7"(#loc34)) +#loc142 = loc("tmp7"(#loc35)) +#loc143 = loc("tmp7"(#loc36)) +#loc144 = loc("tmp7"(#loc37)) +#loc145 = loc("tmp7"(#loc38)) +#loc146 = loc("tmp7"(#loc39)) +#loc147 = loc("tmp7"(#loc40)) +#loc148 = loc("tmp8"(#loc41)) +#loc149 = loc("tmp9"(#loc42)) +#loc150 = loc("tmp10"(#loc43)) +#loc151 = loc("tmp11"(#loc44)) +#loc152 = loc("tmp12"(#loc45)) +#loc153 = loc("tmp13"(#loc46)) +#loc154 = loc("tmp14"(#loc47)) +#loc155 = loc("tmp14"(#loc48)) +#loc156 = loc("tmp14"(#loc49)) +#loc157 = loc("tmp14"(#loc50)) +#loc158 = loc("tmp14"(#loc51)) +#loc159 = loc("tmp14"(#loc52)) +#loc160 = loc("tmp16"(#loc53)) +#loc161 = loc("tmp18"(#loc54)) +#loc162 = loc("tmp19"(#loc55)) +#loc163 = loc("tmp20"(#loc56)) +#loc164 = loc("tmp21"(#loc57)) +#loc165 = loc("tmp22"(#loc58)) +#loc166 = loc("tmp23"(#loc59)) +#loc167 = loc("tmp23"(#loc60)) +#loc168 = loc("tmp23"(#loc61)) +#loc169 = loc("tmp23"(#loc62)) +#loc170 = loc("tmp23"(#loc63)) +#loc171 = loc("tmp23"(#loc64)) +#loc172 = loc("tmp23"(#loc65)) +#loc173 = loc("tmp23"(#loc66)) +#loc174 = loc("tmp23"(#loc67)) +#loc175 = loc("tmp23"(#loc68)) +#loc176 = loc("tmp25"(#loc69)) +#loc177 = loc("tmp25"(#loc70)) +#loc178 = loc("tmp25"(#loc71)) +#loc179 = loc("tmp25"(#loc72)) +#loc180 = loc("tmp25"(#loc73)) +#loc181 = loc("tmp25"(#loc74)) +#loc182 = loc("tmp25"(#loc75)) +#loc183 = loc("tmp25"(#loc76)) +#loc184 = loc("tmp26"(#loc77)) +#loc185 = loc("tmp27"(#loc78)) +#loc186 = loc("tmp28"(#loc79)) +#loc187 = loc("tmp29"(#loc80)) +#loc188 = loc("tmp30"(#loc81)) +#loc189 = loc("tmp31"(#loc82)) +#loc190 = loc("tmp32"(#loc83)) +#loc191 = loc("tmp32"(#loc84)) +#loc192 = loc("tmp32"(#loc85)) +#loc193 = loc("tmp32"(#loc86)) +#loc194 = loc("tmp32"(#loc87)) +#loc195 = loc("tmp32"(#loc88)) +#loc196 = loc("tmp34"(#loc89)) +#loc197 = loc("tmp36"(#loc90)) +#loc198 = loc("tmp37"(#loc91)) +#loc199 = loc("tmp38"(#loc92)) diff --git a/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..c644293bb8f8b37f41fda54506ed01135819c69d --- /dev/null +++ b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir @@ -0,0 +1,288 @@ +#blocked = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 4], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [16, 2], warpsPerCTA = [8, 1], order = [1, 0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("in_ptr1"(#loc)) +#loc72 = loc("in_ptr2"(#loc)) +#loc73 = loc("in_ptr3"(#loc)) +#loc74 = loc("in_ptr4"(#loc)) +#loc75 = loc("in_ptr5"(#loc)) +#loc76 = loc("out_ptr0"(#loc)) +#loc77 = loc("ynumel"(#loc)) +#loc78 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<-256> : tensor<256x1xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<-256> : tensor<256x1xi32, #blocked1> loc(#loc1) + %cst_1 = arith.constant dense<12288> : tensor<256x1xi32, #blocked1> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<256x1xi32, #blocked1> loc(#loc1) + %cst_3 = arith.constant dense<256> : tensor<256x1xi64, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<256> : tensor<256x1xi64, #blocked1> loc(#loc1) + %cst_5 = arith.constant dense<32> : tensor<256x1xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<32> : tensor<256x1xi32, #blocked1> loc(#loc1) + %cst_7 = arith.constant dense<128> : tensor<1x16xi32, #blocked> loc(#loc1) + %cst_8 = arith.constant dense<128> : tensor<1x16xi32, #blocked1> loc(#loc1) + %cst_9 = arith.constant dense<73728> : tensor<256x1xi32, #blocked> loc(#loc1) + %cst_10 = arith.constant dense<73728> : tensor<256x1xi32, #blocked1> loc(#loc1) + %c256_i32 = arith.constant 256 : i32 loc(#loc1) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<256x16xbf16, #blocked1> loc(#loc1) + %cst_12 = arith.constant dense<0.000000e+00> : tensor<256x16xf32, #blocked> loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %cst_13 = arith.constant dense<9.99999997E-7> : tensor<256x16xf32, #blocked> loc(#loc1) + %cst_14 = arith.constant dense<1.280000e+02> : tensor<256x16xf32, #blocked> loc(#loc1) + %cst_15 = arith.constant dense<0.000000e+00> : tensor<256x16xf32, #blocked1> loc(#loc1) + %yoffset = tt.get_program_id y : i32 loc(#loc79) + %yoffset_16 = tt.get_program_id z : i32 loc(#loc80) + %yoffset_17 = tt.get_num_programs y : i32 loc(#loc81) + %yoffset_18 = arith.muli %yoffset_16, %yoffset_17 : i32 loc(#loc82) + %yoffset_19 = arith.addi %yoffset, %yoffset_18 : i32 loc(#loc83) + %yoffset_20 = arith.muli %yoffset_19, %c256_i32 : i32 loc(#loc84) + %yindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc85) + %yindex_21 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc85) + %yindex_22 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<256xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<256x1xi32, #blocked1> loc(#loc85) + %yindex_23 = tt.expand_dims %yindex_21 {axis = 1 : i32} : tensor<256xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<256x1xi32, #blocked> loc(#loc85) + %yindex_24 = tt.splat %yoffset_20 : i32 -> tensor<256x1xi32, #blocked1> loc(#loc86) + %yindex_25 = tt.splat %yoffset_20 : i32 -> tensor<256x1xi32, #blocked> loc(#loc86) + %yindex_26 = arith.addi %yindex_24, %yindex_22 : tensor<256x1xi32, #blocked1> loc(#loc86) + %yindex_27 = arith.addi %yindex_25, %yindex_23 : tensor<256x1xi32, #blocked> loc(#loc86) + %ymask = arith.cmpi slt, %yindex_26, %cst_10 : tensor<256x1xi32, #blocked1> loc(#loc87) + %ymask_28 = arith.cmpi slt, %yindex_27, %cst_9 : tensor<256x1xi32, #blocked> loc(#loc87) + %xoffset = tt.get_program_id x : i32 loc(#loc88) + %xoffset_29 = arith.muli %xoffset, %c16_i32 : i32 loc(#loc89) + %xindex = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc90) + %xindex_30 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc90) + %xindex_31 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x16xi32, #blocked1> loc(#loc90) + %xindex_32 = tt.expand_dims %xindex_30 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc90) + %xindex_33 = tt.splat %xoffset_29 : i32 -> tensor<1x16xi32, #blocked1> loc(#loc91) + %xindex_34 = tt.splat %xoffset_29 : i32 -> tensor<1x16xi32, #blocked> loc(#loc91) + %xindex_35 = arith.addi %xindex_33, %xindex_31 : tensor<1x16xi32, #blocked1> loc(#loc91) + %xindex_36 = arith.addi %xindex_34, %xindex_32 : tensor<1x16xi32, #blocked> loc(#loc91) + %xmask = arith.cmpi slt, %xindex_35, %cst_8 : tensor<1x16xi32, #blocked1> loc(#loc92) + %xmask_37 = arith.cmpi slt, %xindex_36, %cst_7 : tensor<1x16xi32, #blocked> loc(#loc92) + %y1 = arith.divsi %yindex_26, %cst_6 : tensor<256x1xi32, #blocked1> loc(#loc93) + %y1_38 = arith.divsi %yindex_27, %cst_5 : tensor<256x1xi32, #blocked> loc(#loc93) + %y0 = arith.remsi %yindex_26, %cst_6 : tensor<256x1xi32, #blocked1> loc(#loc94) + %y0_39 = arith.remsi %yindex_27, %cst_5 : tensor<256x1xi32, #blocked> loc(#loc94) + %tmp4 = arith.extsi %y1 : tensor<256x1xi32, #blocked1> to tensor<256x1xi64, #blocked1> loc(#loc95) + %tmp4_40 = arith.extsi %y1_38 : tensor<256x1xi32, #blocked> to tensor<256x1xi64, #blocked> loc(#loc95) + %tmp4_41 = arith.cmpi slt, %tmp4, %cst_4 : tensor<256x1xi64, #blocked1> loc(#loc95) + %tmp4_42 = arith.cmpi slt, %tmp4_40, %cst_3 : tensor<256x1xi64, #blocked> loc(#loc95) + %tmp5 = arith.muli %y0, %cst_2 : tensor<256x1xi32, #blocked1> loc(#loc96) + %tmp5_43 = tt.broadcast %xindex_35 : tensor<1x16xi32, #blocked1> -> tensor<256x16xi32, #blocked1> loc(#loc97) + %tmp5_44 = tt.broadcast %tmp5 : tensor<256x1xi32, #blocked1> -> tensor<256x16xi32, #blocked1> loc(#loc97) + %tmp5_45 = arith.addi %tmp5_43, %tmp5_44 : tensor<256x16xi32, #blocked1> loc(#loc97) + %tmp5_46 = arith.muli %y1, %cst_1 : tensor<256x1xi32, #blocked1> loc(#loc98) + %tmp5_47 = tt.broadcast %tmp5_46 : tensor<256x1xi32, #blocked1> -> tensor<256x16xi32, #blocked1> loc(#loc99) + %tmp5_48 = arith.addi %tmp5_45, %tmp5_47 : tensor<256x16xi32, #blocked1> loc(#loc99) + %tmp5_49 = tt.splat %in_ptr0 : !tt.ptr -> tensor<256x16x!tt.ptr, #blocked1> loc(#loc100) + %tmp5_50 = tt.addptr %tmp5_49, %tmp5_48 : tensor<256x16x!tt.ptr, #blocked1>, tensor<256x16xi32, #blocked1> loc(#loc100) + %tmp5_51 = tt.broadcast %tmp4_41 : tensor<256x1xi1, #blocked1> -> tensor<256x16xi1, #blocked1> loc(#loc101) + %tmp5_52 = tt.broadcast %tmp4_42 : tensor<256x1xi1, #blocked> -> tensor<256x16xi1, #blocked> loc(#loc101) + %tmp5_53 = tt.broadcast %xmask : tensor<1x16xi1, #blocked1> -> tensor<256x16xi1, #blocked1> loc(#loc101) + %tmp5_54 = tt.broadcast %xmask_37 : tensor<1x16xi1, #blocked> -> tensor<256x16xi1, #blocked> loc(#loc101) + %tmp5_55 = arith.andi %tmp5_51, %tmp5_53 : tensor<256x16xi1, #blocked1> loc(#loc101) + %tmp5_56 = arith.andi %tmp5_52, %tmp5_54 : tensor<256x16xi1, #blocked> loc(#loc101) + %tmp5_57 = tt.broadcast %ymask : tensor<256x1xi1, #blocked1> -> tensor<256x16xi1, #blocked1> loc(#loc102) + %tmp5_58 = tt.broadcast %ymask_28 : tensor<256x1xi1, #blocked> -> tensor<256x16xi1, #blocked> loc(#loc102) + %tmp5_59 = arith.andi %tmp5_55, %tmp5_57 : tensor<256x16xi1, #blocked1> loc(#loc102) + %tmp5_60 = arith.andi %tmp5_56, %tmp5_58 : tensor<256x16xi1, #blocked> loc(#loc102) + %tmp5_61 = tt.load %tmp5_50, %tmp5_59, %cst_11 evictionPolicy = evict_last : tensor<256x16x!tt.ptr, #blocked1> loc(#loc103) + %tmp5_62 = ttg.convert_layout %tmp5_61 : tensor<256x16xbf16, #blocked1> -> tensor<256x16xbf16, #blocked> loc(#loc104) + %tmp5_63 = arith.extf %tmp5_62 : tensor<256x16xbf16, #blocked> to tensor<256x16xf32, #blocked> loc(#loc104) + %tmp7 = arith.muli %y1_38, %cst_5 : tensor<256x1xi32, #blocked> loc(#loc105) + %tmp7_64 = arith.addi %y0_39, %tmp7 : tensor<256x1xi32, #blocked> loc(#loc106) + %tmp7_65 = tt.splat %in_ptr1 : !tt.ptr -> tensor<256x1x!tt.ptr, #blocked> loc(#loc107) + %tmp7_66 = tt.addptr %tmp7_65, %tmp7_64 : tensor<256x1x!tt.ptr, #blocked>, tensor<256x1xi32, #blocked> loc(#loc107) + %tmp7_67 = tt.broadcast %tmp7_66 : tensor<256x1x!tt.ptr, #blocked> -> tensor<256x16x!tt.ptr, #blocked> loc(#loc107) + %tmp7_68 = tt.load %tmp7_67, %tmp5_60, %cst_12 evictionPolicy = evict_last : tensor<256x16x!tt.ptr, #blocked> loc(#loc108) + %tmp9 = arith.divf %tmp7_68, %cst_14 : tensor<256x16xf32, #blocked> loc(#loc109) + %tmp11 = arith.addf %tmp9, %cst_13 : tensor<256x16xf32, #blocked> loc(#loc110) + %tmp12 = tt.extern_elementwise %tmp11 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<256x16xf32, #blocked>) -> tensor<256x16xf32, #blocked> loc(#loc111) + %tmp13 = arith.mulf %tmp5_63, %tmp12 : tensor<256x16xf32, #blocked> loc(#loc112) + %tmp13_69 = ttg.convert_layout %tmp13 : tensor<256x16xf32, #blocked> -> tensor<256x16xf32, #blocked1> loc(#loc112) + %tmp14 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked1> loc(#loc113) + %tmp14_70 = tt.addptr %tmp14, %xindex_35 : tensor<1x16x!tt.ptr, #blocked1>, tensor<1x16xi32, #blocked1> loc(#loc113) + %tmp14_71 = tt.broadcast %tmp14_70 : tensor<1x16x!tt.ptr, #blocked1> -> tensor<256x16x!tt.ptr, #blocked1> loc(#loc113) + %tmp14_72 = tt.load %tmp14_71, %tmp5_59, %cst_11 evictionPolicy = evict_last : tensor<256x16x!tt.ptr, #blocked1> loc(#loc114) + %tmp14_73 = arith.extf %tmp14_72 : tensor<256x16xbf16, #blocked1> to tensor<256x16xf32, #blocked1> loc(#loc115) + %tmp16 = arith.mulf %tmp13_69, %tmp14_73 : tensor<256x16xf32, #blocked1> loc(#loc116) + %tmp20 = arith.cmpi sge, %tmp4, %cst_4 : tensor<256x1xi64, #blocked1> loc(#loc117) + %tmp20_74 = arith.cmpi sge, %tmp4_40, %cst_3 : tensor<256x1xi64, #blocked> loc(#loc117) + %tmp23 = arith.addi %y1, %cst_0 : tensor<256x1xi32, #blocked1> loc(#loc118) + %tmp23_75 = arith.addi %y1_38, %cst : tensor<256x1xi32, #blocked> loc(#loc118) + %tmp23_76 = arith.muli %tmp23, %cst_1 : tensor<256x1xi32, #blocked1> loc(#loc119) + %tmp23_77 = tt.broadcast %tmp23_76 : tensor<256x1xi32, #blocked1> -> tensor<256x16xi32, #blocked1> loc(#loc120) + %tmp23_78 = arith.addi %tmp5_45, %tmp23_77 : tensor<256x16xi32, #blocked1> loc(#loc120) + %tmp23_79 = tt.splat %in_ptr3 : !tt.ptr -> tensor<256x16x!tt.ptr, #blocked1> loc(#loc121) + %tmp23_80 = tt.addptr %tmp23_79, %tmp23_78 : tensor<256x16x!tt.ptr, #blocked1>, tensor<256x16xi32, #blocked1> loc(#loc121) + %tmp23_81 = tt.broadcast %tmp20 : tensor<256x1xi1, #blocked1> -> tensor<256x16xi1, #blocked1> loc(#loc122) + %tmp23_82 = tt.broadcast %tmp20_74 : tensor<256x1xi1, #blocked> -> tensor<256x16xi1, #blocked> loc(#loc122) + %tmp23_83 = arith.andi %tmp23_81, %tmp5_53 : tensor<256x16xi1, #blocked1> loc(#loc122) + %tmp23_84 = arith.andi %tmp23_82, %tmp5_54 : tensor<256x16xi1, #blocked> loc(#loc122) + %tmp23_85 = arith.andi %tmp23_83, %tmp5_57 : tensor<256x16xi1, #blocked1> loc(#loc123) + %tmp23_86 = arith.andi %tmp23_84, %tmp5_58 : tensor<256x16xi1, #blocked> loc(#loc123) + %tmp23_87 = tt.load %tmp23_80, %tmp23_85, %cst_11 evictionPolicy = evict_last : tensor<256x16x!tt.ptr, #blocked1> loc(#loc124) + %tmp23_88 = ttg.convert_layout %tmp23_87 : tensor<256x16xbf16, #blocked1> -> tensor<256x16xbf16, #blocked> loc(#loc125) + %tmp23_89 = arith.extf %tmp23_88 : tensor<256x16xbf16, #blocked> to tensor<256x16xf32, #blocked> loc(#loc125) + %tmp25 = arith.muli %tmp23_75, %cst_5 : tensor<256x1xi32, #blocked> loc(#loc126) + %tmp25_90 = arith.addi %y0_39, %tmp25 : tensor<256x1xi32, #blocked> loc(#loc127) + %tmp25_91 = tt.splat %in_ptr4 : !tt.ptr -> tensor<256x1x!tt.ptr, #blocked> loc(#loc128) + %tmp25_92 = tt.addptr %tmp25_91, %tmp25_90 : tensor<256x1x!tt.ptr, #blocked>, tensor<256x1xi32, #blocked> loc(#loc128) + %tmp25_93 = tt.broadcast %tmp25_92 : tensor<256x1x!tt.ptr, #blocked> -> tensor<256x16x!tt.ptr, #blocked> loc(#loc128) + %tmp25_94 = tt.load %tmp25_93, %tmp23_86, %cst_12 evictionPolicy = evict_last : tensor<256x16x!tt.ptr, #blocked> loc(#loc129) + %tmp27 = arith.divf %tmp25_94, %cst_14 : tensor<256x16xf32, #blocked> loc(#loc130) + %tmp29 = arith.addf %tmp27, %cst_13 : tensor<256x16xf32, #blocked> loc(#loc131) + %tmp30 = tt.extern_elementwise %tmp29 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<256x16xf32, #blocked>) -> tensor<256x16xf32, #blocked> loc(#loc132) + %tmp31 = arith.mulf %tmp23_89, %tmp30 : tensor<256x16xf32, #blocked> loc(#loc133) + %tmp31_95 = ttg.convert_layout %tmp31 : tensor<256x16xf32, #blocked> -> tensor<256x16xf32, #blocked1> loc(#loc133) + %tmp32 = tt.splat %in_ptr5 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked1> loc(#loc134) + %tmp32_96 = tt.addptr %tmp32, %xindex_35 : tensor<1x16x!tt.ptr, #blocked1>, tensor<1x16xi32, #blocked1> loc(#loc134) + %tmp32_97 = tt.broadcast %tmp32_96 : tensor<1x16x!tt.ptr, #blocked1> -> tensor<256x16x!tt.ptr, #blocked1> loc(#loc134) + %tmp32_98 = tt.load %tmp32_97, %tmp23_85, %cst_11 evictionPolicy = evict_last : tensor<256x16x!tt.ptr, #blocked1> loc(#loc135) + %tmp32_99 = arith.extf %tmp32_98 : tensor<256x16xbf16, #blocked1> to tensor<256x16xf32, #blocked1> loc(#loc136) + %tmp34 = arith.mulf %tmp31_95, %tmp32_99 : tensor<256x16xf32, #blocked1> loc(#loc137) + %tmp37 = arith.select %tmp23_81, %tmp34, %cst_15 : tensor<256x16xi1, #blocked1>, tensor<256x16xf32, #blocked1> loc(#loc138) + %tmp38 = arith.select %tmp5_51, %tmp16, %tmp37 : tensor<256x16xi1, #blocked1>, tensor<256x16xf32, #blocked1> loc(#loc141) + %0 = arith.muli %yindex_26, %cst_2 : tensor<256x1xi32, #blocked1> loc(#loc64) + %1 = tt.broadcast %0 : tensor<256x1xi32, #blocked1> -> tensor<256x16xi32, #blocked1> loc(#loc65) + %2 = arith.addi %tmp5_43, %1 : tensor<256x16xi32, #blocked1> loc(#loc65) + %3 = tt.splat %out_ptr0 : !tt.ptr -> tensor<256x16x!tt.ptr, #blocked1> loc(#loc66) + %4 = tt.addptr %3, %2 : tensor<256x16x!tt.ptr, #blocked1>, tensor<256x16xi32, #blocked1> loc(#loc66) + %5 = arith.andi %tmp5_53, %tmp5_57 : tensor<256x16xi1, #blocked1> loc(#loc67) + %6 = arith.truncf %tmp38 : tensor<256x16xf32, #blocked1> to tensor<256x16xbf16, #blocked1> loc(#loc68) + tt.store %4, %6, %5 : tensor<256x16x!tt.ptr, #blocked1> loc(#loc68) + tt.return loc(#loc69) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4) +#loc79 = loc("yoffset"(#loc2)) +#loc80 = loc("yoffset"(#loc3)) +#loc81 = loc("yoffset"(#loc4)) +#loc82 = loc("yoffset"(#loc5)) +#loc83 = loc("yoffset"(#loc6)) +#loc84 = loc("yoffset"(#loc7)) +#loc85 = loc("yindex"(#loc8)) +#loc86 = loc("yindex"(#loc9)) +#loc87 = loc("ymask"(#loc10)) +#loc88 = loc("xoffset"(#loc11)) +#loc89 = loc("xoffset"(#loc12)) +#loc90 = loc("xindex"(#loc13)) +#loc91 = loc("xindex"(#loc14)) +#loc92 = loc("xmask"(#loc15)) +#loc93 = loc("y1"(#loc16)) +#loc94 = loc("y0"(#loc17)) +#loc95 = loc("tmp4"(#loc18)) +#loc96 = loc("tmp5"(#loc19)) +#loc97 = loc("tmp5"(#loc20)) +#loc98 = loc("tmp5"(#loc21)) +#loc99 = loc("tmp5"(#loc22)) +#loc100 = loc("tmp5"(#loc23)) +#loc101 = loc("tmp5"(#loc24)) +#loc102 = loc("tmp5"(#loc25)) +#loc103 = loc("tmp5"(#loc26)) +#loc104 = loc("tmp5"(#loc27)) +#loc105 = loc("tmp7"(#loc28)) +#loc106 = loc("tmp7"(#loc29)) +#loc107 = loc("tmp7"(#loc30)) +#loc108 = loc("tmp7"(#loc31)) +#loc109 = loc("tmp9"(#loc32)) +#loc110 = loc("tmp11"(#loc33)) +#loc111 = loc("tmp12"(#loc34)) +#loc112 = loc("tmp13"(#loc35)) +#loc113 = loc("tmp14"(#loc36)) +#loc114 = loc("tmp14"(#loc37)) +#loc115 = loc("tmp14"(#loc38)) +#loc116 = loc("tmp16"(#loc39)) +#loc117 = loc("tmp20"(#loc40)) +#loc118 = loc("tmp23"(#loc41)) +#loc119 = loc("tmp23"(#loc42)) +#loc120 = loc("tmp23"(#loc43)) +#loc121 = loc("tmp23"(#loc44)) +#loc122 = loc("tmp23"(#loc45)) +#loc123 = loc("tmp23"(#loc46)) +#loc124 = loc("tmp23"(#loc47)) +#loc125 = loc("tmp23"(#loc48)) +#loc126 = loc("tmp25"(#loc49)) +#loc127 = loc("tmp25"(#loc50)) +#loc128 = loc("tmp25"(#loc51)) +#loc129 = loc("tmp25"(#loc52)) +#loc130 = loc("tmp27"(#loc53)) +#loc131 = loc("tmp29"(#loc54)) +#loc132 = loc("tmp30"(#loc55)) +#loc133 = loc("tmp31"(#loc56)) +#loc134 = loc("tmp32"(#loc57)) +#loc135 = loc("tmp32"(#loc58)) +#loc136 = loc("tmp32"(#loc59)) +#loc137 = loc("tmp34"(#loc60)) +#loc138 = loc("tmp37"(#loc61)) +#loc139 = loc("tmp38"(#loc62)) +#loc140 = loc("tmp19"(#loc63)) +#loc141 = loc(fused[#loc139, #loc140]) diff --git a/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.ttir b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..c52775319a63d328fe709f5d56118eb499b143d7 --- /dev/null +++ b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.ttir @@ -0,0 +1,256 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0) +#loc72 = loc("in_ptr0"(#loc)) +#loc73 = loc("in_ptr1"(#loc)) +#loc74 = loc("in_ptr2"(#loc)) +#loc75 = loc("in_ptr3"(#loc)) +#loc76 = loc("in_ptr4"(#loc)) +#loc77 = loc("in_ptr5"(#loc)) +#loc78 = loc("out_ptr0"(#loc)) +#loc79 = loc("ynumel"(#loc)) +#loc80 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<256x16xbf16> loc(#loc1) + %cst_0 = arith.constant dense<-256> : tensor<256x1xi32> loc(#loc1) + %cst_1 = arith.constant dense<9.99999997E-7> : tensor<256x16xf32> loc(#loc1) + %cst_2 = arith.constant dense<1.280000e+02> : tensor<256x16xf32> loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<256x16xf32> loc(#loc1) + %cst_4 = arith.constant dense<12288> : tensor<256x1xi32> loc(#loc1) + %cst_5 = arith.constant dense<128> : tensor<256x1xi32> loc(#loc1) + %cst_6 = arith.constant dense<256> : tensor<256x1xi64> loc(#loc1) + %cst_7 = arith.constant dense<32> : tensor<256x1xi32> loc(#loc1) + %xmask = arith.constant dense<128> : tensor<1x16xi32> loc(#loc81) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %ymask = arith.constant dense<73728> : tensor<256x1xi32> loc(#loc82) + %c256_i32 = arith.constant 256 : i32 loc(#loc1) + %yoffset = tt.get_program_id y : i32 loc(#loc83) + %yoffset_8 = tt.get_program_id z : i32 loc(#loc84) + %yoffset_9 = tt.get_num_programs y : i32 loc(#loc85) + %yoffset_10 = arith.muli %yoffset_8, %yoffset_9 : i32 loc(#loc86) + %yoffset_11 = arith.addi %yoffset, %yoffset_10 : i32 loc(#loc87) + %yoffset_12 = arith.muli %yoffset_11, %c256_i32 : i32 loc(#loc88) + %yindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc89) + %yindex_13 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<256xi32> -> tensor<256x1xi32> loc(#loc90) + %yindex_14 = tt.splat %yoffset_12 : i32 -> tensor<256x1xi32> loc(#loc91) + %yindex_15 = arith.addi %yindex_14, %yindex_13 : tensor<256x1xi32> loc(#loc91) + %ymask_16 = arith.cmpi slt, %yindex_15, %ymask : tensor<256x1xi32> loc(#loc82) + %xoffset = tt.get_program_id x : i32 loc(#loc92) + %xoffset_17 = arith.muli %xoffset, %c16_i32 : i32 loc(#loc93) + %xindex = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc94) + %xindex_18 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc95) + %xindex_19 = tt.splat %xoffset_17 : i32 -> tensor<1x16xi32> loc(#loc96) + %xindex_20 = arith.addi %xindex_19, %xindex_18 : tensor<1x16xi32> loc(#loc96) + %xmask_21 = arith.cmpi slt, %xindex_20, %xmask : tensor<1x16xi32> loc(#loc81) + %y1 = arith.divsi %yindex_15, %cst_7 : tensor<256x1xi32> loc(#loc97) + %y0 = arith.remsi %yindex_15, %cst_7 : tensor<256x1xi32> loc(#loc98) + %tmp4 = arith.extsi %y1 : tensor<256x1xi32> to tensor<256x1xi64> loc(#loc99) + %tmp4_22 = arith.cmpi slt, %tmp4, %cst_6 : tensor<256x1xi64> loc(#loc99) + %tmp5 = arith.muli %y0, %cst_5 : tensor<256x1xi32> loc(#loc100) + %tmp5_23 = tt.broadcast %xindex_20 : tensor<1x16xi32> -> tensor<256x16xi32> loc(#loc101) + %tmp5_24 = tt.broadcast %tmp5 : tensor<256x1xi32> -> tensor<256x16xi32> loc(#loc101) + %tmp5_25 = arith.addi %tmp5_23, %tmp5_24 : tensor<256x16xi32> loc(#loc101) + %tmp5_26 = arith.muli %y1, %cst_4 : tensor<256x1xi32> loc(#loc102) + %tmp5_27 = tt.broadcast %tmp5_26 : tensor<256x1xi32> -> tensor<256x16xi32> loc(#loc103) + %tmp5_28 = arith.addi %tmp5_25, %tmp5_27 : tensor<256x16xi32> loc(#loc103) + %tmp5_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<256x16x!tt.ptr> loc(#loc104) + %tmp5_30 = tt.addptr %tmp5_29, %tmp5_28 : tensor<256x16x!tt.ptr>, tensor<256x16xi32> loc(#loc104) + %tmp5_31 = tt.broadcast %tmp4_22 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc105) + %tmp5_32 = tt.broadcast %xmask_21 : tensor<1x16xi1> -> tensor<256x16xi1> loc(#loc105) + %tmp5_33 = arith.andi %tmp5_31, %tmp5_32 : tensor<256x16xi1> loc(#loc105) + %tmp5_34 = tt.broadcast %ymask_16 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc106) + %tmp5_35 = arith.andi %tmp5_33, %tmp5_34 : tensor<256x16xi1> loc(#loc106) + %tmp5_36 = tt.load %tmp5_30, %tmp5_35, %cst evictionPolicy = evict_last : tensor<256x16x!tt.ptr> loc(#loc107) + %tmp5_37 = arith.extf %tmp5_36 : tensor<256x16xbf16> to tensor<256x16xf32> loc(#loc108) + %tmp7 = arith.muli %y1, %cst_7 : tensor<256x1xi32> loc(#loc109) + %tmp7_38 = arith.addi %y0, %tmp7 : tensor<256x1xi32> loc(#loc110) + %tmp7_39 = tt.splat %in_ptr1 : !tt.ptr -> tensor<256x1x!tt.ptr> loc(#loc111) + %tmp7_40 = tt.addptr %tmp7_39, %tmp7_38 : tensor<256x1x!tt.ptr>, tensor<256x1xi32> loc(#loc111) + %tmp7_41 = tt.broadcast %tmp7_40 : tensor<256x1x!tt.ptr> -> tensor<256x16x!tt.ptr> loc(#loc111) + %tmp7_42 = tt.load %tmp7_41, %tmp5_35, %cst_3 evictionPolicy = evict_last : tensor<256x16x!tt.ptr> loc(#loc112) + %tmp9 = arith.divf %tmp7_42, %cst_2 : tensor<256x16xf32> loc(#loc113) + %tmp11 = arith.addf %tmp9, %cst_1 : tensor<256x16xf32> loc(#loc114) + %tmp12 = tt.extern_elementwise %tmp11 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<256x16xf32>) -> tensor<256x16xf32> loc(#loc115) + %tmp13 = arith.mulf %tmp5_37, %tmp12 : tensor<256x16xf32> loc(#loc116) + %tmp14 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc117) + %tmp14_43 = tt.addptr %tmp14, %xindex_20 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc117) + %tmp14_44 = tt.broadcast %tmp14_43 : tensor<1x16x!tt.ptr> -> tensor<256x16x!tt.ptr> loc(#loc117) + %tmp14_45 = tt.load %tmp14_44, %tmp5_35, %cst evictionPolicy = evict_last : tensor<256x16x!tt.ptr> loc(#loc118) + %tmp14_46 = arith.extf %tmp14_45 : tensor<256x16xbf16> to tensor<256x16xf32> loc(#loc119) + %tmp16 = arith.mulf %tmp13, %tmp14_46 : tensor<256x16xf32> loc(#loc120) + %tmp19 = arith.select %tmp5_31, %tmp16, %cst_3 : tensor<256x16xi1>, tensor<256x16xf32> loc(#loc121) + %tmp20 = arith.cmpi sge, %tmp4, %cst_6 : tensor<256x1xi64> loc(#loc122) + %tmp23 = arith.addi %y1, %cst_0 : tensor<256x1xi32> loc(#loc123) + %tmp23_47 = arith.muli %tmp23, %cst_4 : tensor<256x1xi32> loc(#loc124) + %tmp23_48 = tt.broadcast %tmp23_47 : tensor<256x1xi32> -> tensor<256x16xi32> loc(#loc125) + %tmp23_49 = arith.addi %tmp5_25, %tmp23_48 : tensor<256x16xi32> loc(#loc125) + %tmp23_50 = tt.splat %in_ptr3 : !tt.ptr -> tensor<256x16x!tt.ptr> loc(#loc126) + %tmp23_51 = tt.addptr %tmp23_50, %tmp23_49 : tensor<256x16x!tt.ptr>, tensor<256x16xi32> loc(#loc126) + %tmp23_52 = tt.broadcast %tmp20 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc127) + %tmp23_53 = arith.andi %tmp23_52, %tmp5_32 : tensor<256x16xi1> loc(#loc127) + %tmp23_54 = arith.andi %tmp23_53, %tmp5_34 : tensor<256x16xi1> loc(#loc128) + %tmp23_55 = tt.load %tmp23_51, %tmp23_54, %cst evictionPolicy = evict_last : tensor<256x16x!tt.ptr> loc(#loc129) + %tmp23_56 = arith.extf %tmp23_55 : tensor<256x16xbf16> to tensor<256x16xf32> loc(#loc130) + %tmp25 = arith.muli %tmp23, %cst_7 : tensor<256x1xi32> loc(#loc131) + %tmp25_57 = arith.addi %y0, %tmp25 : tensor<256x1xi32> loc(#loc132) + %tmp25_58 = tt.splat %in_ptr4 : !tt.ptr -> tensor<256x1x!tt.ptr> loc(#loc133) + %tmp25_59 = tt.addptr %tmp25_58, %tmp25_57 : tensor<256x1x!tt.ptr>, tensor<256x1xi32> loc(#loc133) + %tmp25_60 = tt.broadcast %tmp25_59 : tensor<256x1x!tt.ptr> -> tensor<256x16x!tt.ptr> loc(#loc133) + %tmp25_61 = tt.load %tmp25_60, %tmp23_54, %cst_3 evictionPolicy = evict_last : tensor<256x16x!tt.ptr> loc(#loc134) + %tmp27 = arith.divf %tmp25_61, %cst_2 : tensor<256x16xf32> loc(#loc135) + %tmp29 = arith.addf %tmp27, %cst_1 : tensor<256x16xf32> loc(#loc136) + %tmp30 = tt.extern_elementwise %tmp29 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<256x16xf32>) -> tensor<256x16xf32> loc(#loc137) + %tmp31 = arith.mulf %tmp23_56, %tmp30 : tensor<256x16xf32> loc(#loc138) + %tmp32 = tt.splat %in_ptr5 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc139) + %tmp32_62 = tt.addptr %tmp32, %xindex_20 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc139) + %tmp32_63 = tt.broadcast %tmp32_62 : tensor<1x16x!tt.ptr> -> tensor<256x16x!tt.ptr> loc(#loc139) + %tmp32_64 = tt.load %tmp32_63, %tmp23_54, %cst evictionPolicy = evict_last : tensor<256x16x!tt.ptr> loc(#loc140) + %tmp32_65 = arith.extf %tmp32_64 : tensor<256x16xbf16> to tensor<256x16xf32> loc(#loc141) + %tmp34 = arith.mulf %tmp31, %tmp32_65 : tensor<256x16xf32> loc(#loc142) + %tmp37 = arith.select %tmp23_52, %tmp34, %cst_3 : tensor<256x16xi1>, tensor<256x16xf32> loc(#loc143) + %tmp38 = arith.select %tmp5_31, %tmp19, %tmp37 : tensor<256x16xi1>, tensor<256x16xf32> loc(#loc144) + %0 = arith.muli %yindex_15, %cst_5 : tensor<256x1xi32> loc(#loc66) + %1 = tt.broadcast %0 : tensor<256x1xi32> -> tensor<256x16xi32> loc(#loc67) + %2 = arith.addi %tmp5_23, %1 : tensor<256x16xi32> loc(#loc67) + %3 = tt.splat %out_ptr0 : !tt.ptr -> tensor<256x16x!tt.ptr> loc(#loc68) + %4 = tt.addptr %3, %2 : tensor<256x16x!tt.ptr>, tensor<256x16xi32> loc(#loc68) + %5 = arith.andi %tmp5_32, %tmp5_34 : tensor<256x16xi1> loc(#loc69) + %6 = arith.truncf %tmp38 : tensor<256x16xf32> to tensor<256x16xbf16> loc(#loc70) + tt.store %4, %6, %5 : tensor<256x16x!tt.ptr> loc(#loc70) + tt.return loc(#loc71) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:36) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:36) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4) +#loc81 = loc("xmask"(#loc2)) +#loc82 = loc("ymask"(#loc3)) +#loc83 = loc("yoffset"(#loc4)) +#loc84 = loc("yoffset"(#loc5)) +#loc85 = loc("yoffset"(#loc6)) +#loc86 = loc("yoffset"(#loc7)) +#loc87 = loc("yoffset"(#loc8)) +#loc88 = loc("yoffset"(#loc9)) +#loc89 = loc("yindex"(#loc10)) +#loc90 = loc("yindex"(#loc11)) +#loc91 = loc("yindex"(#loc12)) +#loc92 = loc("xoffset"(#loc13)) +#loc93 = loc("xoffset"(#loc14)) +#loc94 = loc("xindex"(#loc15)) +#loc95 = loc("xindex"(#loc16)) +#loc96 = loc("xindex"(#loc17)) +#loc97 = loc("y1"(#loc18)) +#loc98 = loc("y0"(#loc19)) +#loc99 = loc("tmp4"(#loc20)) +#loc100 = loc("tmp5"(#loc21)) +#loc101 = loc("tmp5"(#loc22)) +#loc102 = loc("tmp5"(#loc23)) +#loc103 = loc("tmp5"(#loc24)) +#loc104 = loc("tmp5"(#loc25)) +#loc105 = loc("tmp5"(#loc26)) +#loc106 = loc("tmp5"(#loc27)) +#loc107 = loc("tmp5"(#loc28)) +#loc108 = loc("tmp5"(#loc29)) +#loc109 = loc("tmp7"(#loc30)) +#loc110 = loc("tmp7"(#loc31)) +#loc111 = loc("tmp7"(#loc32)) +#loc112 = loc("tmp7"(#loc33)) +#loc113 = loc("tmp9"(#loc34)) +#loc114 = loc("tmp11"(#loc35)) +#loc115 = loc("tmp12"(#loc36)) +#loc116 = loc("tmp13"(#loc37)) +#loc117 = loc("tmp14"(#loc38)) +#loc118 = loc("tmp14"(#loc39)) +#loc119 = loc("tmp14"(#loc40)) +#loc120 = loc("tmp16"(#loc41)) +#loc121 = loc("tmp19"(#loc42)) +#loc122 = loc("tmp20"(#loc43)) +#loc123 = loc("tmp23"(#loc44)) +#loc124 = loc("tmp23"(#loc45)) +#loc125 = loc("tmp23"(#loc46)) +#loc126 = loc("tmp23"(#loc47)) +#loc127 = loc("tmp23"(#loc48)) +#loc128 = loc("tmp23"(#loc49)) +#loc129 = loc("tmp23"(#loc50)) +#loc130 = loc("tmp23"(#loc51)) +#loc131 = loc("tmp25"(#loc52)) +#loc132 = loc("tmp25"(#loc53)) +#loc133 = loc("tmp25"(#loc54)) +#loc134 = loc("tmp25"(#loc55)) +#loc135 = loc("tmp27"(#loc56)) +#loc136 = loc("tmp29"(#loc57)) +#loc137 = loc("tmp30"(#loc58)) +#loc138 = loc("tmp31"(#loc59)) +#loc139 = loc("tmp32"(#loc60)) +#loc140 = loc("tmp32"(#loc61)) +#loc141 = loc("tmp32"(#loc62)) +#loc142 = loc("tmp34"(#loc63)) +#loc143 = loc("tmp37"(#loc64)) +#loc144 = loc("tmp38"(#loc65)) diff --git a/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e22017651a3036bf0f04930d296f26ec643ab728 --- /dev/null +++ b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused__fused_rms_norm_cat_view_2.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.source", "triton_poi_fused__fused_rms_norm_cat_view_2.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttir", "triton_poi_fused__fused_rms_norm_cat_view_2.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir", "triton_poi_fused__fused_rms_norm_cat_view_2.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.llir", "triton_poi_fused__fused_rms_norm_cat_view_2.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.ptx", "triton_poi_fused__fused_rms_norm_cat_view_2.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.cubin", "triton_poi_fused__fused_rms_norm_cat_view_2.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.json"}} \ No newline at end of file diff --git a/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.cubin b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..95e3feb2b7bdda770a4870af710475a3948cc107 Binary files /dev/null and b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.cubin differ diff --git a/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.json b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.json new file mode 100644 index 0000000000000000000000000000000000000000..07e0d71041b3544e9f92b9122350fd767b89530a --- /dev/null +++ b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.json @@ -0,0 +1 @@ +{"hash": "aa70639f48f1d7d1f0b2054f16f663ad1882039e7134792fe6b6f1f467575fe7", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 16384, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__fused_rms_norm_cat_view_2"} \ No newline at end of file diff --git a/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.llir b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..caaaf465ea07ddd0e7ac483af4ab4cab285b2271 --- /dev/null +++ b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.llir @@ -0,0 +1,1346 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused__fused_rms_norm_cat_view_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !dbg !8 + %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z(), !dbg !9 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y(), !dbg !10 + %15 = mul nuw i32 %13, %14, !dbg !11 + %16 = add nuw i32 %15, %12, !dbg !12 + %17 = shl i32 %16, 5, !dbg !13 + %18 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !14 + %19 = lshr i32 %18, 4, !dbg !14 + %20 = and i32 %19, 15, !dbg !14 + %21 = and i32 %18, 7, !dbg !14 + %22 = shl nuw nsw i32 %21, 2, !dbg !14 + %23 = or disjoint i32 %17, %20, !dbg !15 + %24 = or disjoint i32 %23, 16, !dbg !15 + %25 = or disjoint i32 %17, %22, !dbg !15 + %26 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !16 + %27 = shl i32 %26, 7, !dbg !17 + %28 = and i32 %18, 15, !dbg !18 + %29 = shl nuw nsw i32 %28, 3, !dbg !18 + %30 = lshr i32 %18, 3, !dbg !18 + %31 = and i32 %30, 31, !dbg !18 + %32 = or disjoint i32 %29, %27, !dbg !19 + %33 = or disjoint i32 %31, %27, !dbg !19 + %34 = icmp slt i32 %32, 128, !dbg !20 + %35 = icmp slt i32 %33, 128, !dbg !20 + %36 = sdiv i32 %23, 32, !dbg !21 + %37 = sdiv i32 %25, 32, !dbg !21 + %38 = mul i32 %36, 32, !dbg !22 + %.decomposed = sub i32 %23, %38, !dbg !22 + %39 = srem i32 %24, 32, !dbg !22 + %40 = mul i32 %37, 32, !dbg !22 + %.decomposed109 = sub i32 %25, %40, !dbg !22 + %41 = icmp slt i32 %23, 8192, !dbg !23 + %42 = icmp slt i32 %25, 8192, !dbg !23 + %43 = shl nsw i32 %.decomposed, 7, !dbg !24 + %44 = shl nsw i32 %39, 7, !dbg !24 + %45 = add i32 %43, %32, !dbg !25 + %46 = add i32 %44, %32, !dbg !25 + %47 = mul i32 %36, 12288, !dbg !26 + %48 = add i32 %45, %47, !dbg !27 + %49 = add i32 %46, %47, !dbg !27 + %50 = sext i32 %48 to i64, !dbg !28 + %51 = getelementptr bfloat, ptr addrspace(1) %0, i64 %50, !dbg !28 + %52 = sext i32 %49 to i64, !dbg !28 + %53 = getelementptr bfloat, ptr addrspace(1) %0, i64 %52, !dbg !28 + %54 = and i1 %34, %41, !dbg !29 + %55 = and i1 %35, %42, !dbg !29 + %56 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !30 + %57 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %51, i64 %56, i1 %54) #6, !dbg !30 + %58 = extractvalue { i32, i32, i32, i32 } %57, 0, !dbg !30 + %59 = extractvalue { i32, i32, i32, i32 } %57, 1, !dbg !30 + %60 = extractvalue { i32, i32, i32, i32 } %57, 2, !dbg !30 + %61 = extractvalue { i32, i32, i32, i32 } %57, 3, !dbg !30 + %62 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !30 + %63 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %53, i64 %62, i1 %54) #6, !dbg !30 + %64 = extractvalue { i32, i32, i32, i32 } %63, 0, !dbg !30 + %65 = extractvalue { i32, i32, i32, i32 } %63, 1, !dbg !30 + %66 = extractvalue { i32, i32, i32, i32 } %63, 2, !dbg !30 + %67 = extractvalue { i32, i32, i32, i32 } %63, 3, !dbg !30 + %68 = insertelement <2 x i32> poison, i32 %58, i64 0, !dbg !30 + %69 = insertelement <2 x i32> %68, i32 %64, i64 1, !dbg !30 + %70 = lshr <2 x i32> %69, splat (i32 16), !dbg !30 + %71 = trunc nuw <2 x i32> %70 to <2 x i16>, !dbg !30 + %72 = insertelement <2 x i32> poison, i32 %59, i64 0, !dbg !30 + %73 = insertelement <2 x i32> %72, i32 %65, i64 1, !dbg !30 + %74 = lshr <2 x i32> %73, splat (i32 16), !dbg !30 + %75 = trunc nuw <2 x i32> %74 to <2 x i16>, !dbg !30 + %76 = insertelement <2 x i32> poison, i32 %60, i64 0, !dbg !30 + %77 = insertelement <2 x i32> %76, i32 %66, i64 1, !dbg !30 + %78 = lshr <2 x i32> %77, splat (i32 16), !dbg !30 + %79 = trunc nuw <2 x i32> %78 to <2 x i16>, !dbg !30 + %80 = insertelement <2 x i32> poison, i32 %61, i64 0, !dbg !30 + %81 = insertelement <2 x i32> %80, i32 %67, i64 1, !dbg !30 + %82 = lshr <2 x i32> %81, splat (i32 16), !dbg !30 + %83 = trunc nuw <2 x i32> %82 to <2 x i16>, !dbg !30 + %84 = and i32 %18, 192, !dbg !31 + %85 = shl nuw nsw i32 %84, 5, !dbg !31 + %86 = shl nuw nsw i32 %21, 4, !dbg !31 + %87 = lshr exact i32 %84, 1, !dbg !31 + %88 = shl nuw nsw i32 %18, 6, !dbg !31 + %89 = and i32 %88, 512, !dbg !31 + %90 = and i32 %18, 16, !dbg !31 + %91 = icmp eq i32 %90, 0, !dbg !31 + %92 = select i1 %91, i32 0, i32 1040, !dbg !31 + %93 = and i32 %18, 32, !dbg !31 + %94 = shl nuw nsw i32 %93, 2, !dbg !31 + %95 = or disjoint i32 %85, %86, !dbg !31 + %96 = or disjoint i32 %92, %87, !dbg !31 + %97 = xor i32 %96, %95, !dbg !31 + %98 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %89, !dbg !31 + %99 = getelementptr inbounds nuw i8, ptr addrspace(3) %98, i32 %97, !dbg !31 + %100 = getelementptr inbounds nuw i8, ptr addrspace(3) %99, i32 %94, !dbg !31 + %101 = trunc i32 %58 to i16, !dbg !31 + %102 = trunc i32 %64 to i16, !dbg !31 + %103 = trunc i32 %59 to i16, !dbg !31 + %104 = trunc i32 %65 to i16, !dbg !31 + %105 = insertelement <2 x i16> poison, i16 %101, i64 0, !dbg !31 + %106 = insertelement <2 x i16> %105, i16 %102, i64 1, !dbg !31 + %107 = bitcast <2 x i16> %106 to i32, !dbg !31 + %108 = bitcast <2 x i16> %71 to i32, !dbg !31 + %109 = insertelement <2 x i16> poison, i16 %103, i64 0, !dbg !31 + %110 = insertelement <2 x i16> %109, i16 %104, i64 1, !dbg !31 + %111 = bitcast <2 x i16> %110 to i32, !dbg !31 + %112 = bitcast <2 x i16> %75 to i32, !dbg !31 + %113 = insertelement <4 x i32> poison, i32 %107, i64 0, !dbg !31 + %114 = insertelement <4 x i32> %113, i32 %108, i64 1, !dbg !31 + %115 = insertelement <4 x i32> %114, i32 %111, i64 2, !dbg !31 + %116 = insertelement <4 x i32> %115, i32 %112, i64 3, !dbg !31 + store <4 x i32> %116, ptr addrspace(3) %100, align 16, !dbg !31 + %117 = getelementptr inbounds nuw i8, ptr addrspace(3) %100, i32 256, !dbg !31 + %118 = trunc i32 %60 to i16, !dbg !31 + %119 = trunc i32 %66 to i16, !dbg !31 + %120 = trunc i32 %61 to i16, !dbg !31 + %121 = trunc i32 %67 to i16, !dbg !31 + %122 = insertelement <2 x i16> poison, i16 %118, i64 0, !dbg !31 + %123 = insertelement <2 x i16> %122, i16 %119, i64 1, !dbg !31 + %124 = bitcast <2 x i16> %123 to i32, !dbg !31 + %125 = bitcast <2 x i16> %79 to i32, !dbg !31 + %126 = insertelement <2 x i16> poison, i16 %120, i64 0, !dbg !31 + %127 = insertelement <2 x i16> %126, i16 %121, i64 1, !dbg !31 + %128 = bitcast <2 x i16> %127 to i32, !dbg !31 + %129 = bitcast <2 x i16> %83 to i32, !dbg !31 + %130 = insertelement <4 x i32> poison, i32 %124, i64 0, !dbg !31 + %131 = insertelement <4 x i32> %130, i32 %125, i64 1, !dbg !31 + %132 = insertelement <4 x i32> %131, i32 %128, i64 2, !dbg !31 + %133 = insertelement <4 x i32> %132, i32 %129, i64 3, !dbg !31 + store <4 x i32> %133, ptr addrspace(3) %117, align 16, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !31 + %134 = shl nuw nsw i32 %21, 10, !dbg !31 + %135 = shl nuw nsw i32 %28, 4, !dbg !31 + %136 = lshr exact i32 %84, 2, !dbg !31 + %137 = shl nuw nsw i32 %90, 2, !dbg !31 + %138 = shl nuw nsw i32 %93, 3, !dbg !31 + %139 = xor i32 %135, %136, !dbg !31 + %140 = xor i32 %139, %137, !dbg !31 + %141 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %134, !dbg !31 + %142 = getelementptr inbounds nuw i8, ptr addrspace(3) %141, i32 %140, !dbg !31 + %143 = getelementptr inbounds nuw i8, ptr addrspace(3) %142, i32 %138, !dbg !31 + %144 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.trans.b16.p3(ptr addrspace(3) %143), !dbg !31 + %145 = extractvalue { i32, i32, i32, i32 } %144, 0, !dbg !31 + %146 = bitcast i32 %145 to <2 x bfloat>, !dbg !31 + %147 = extractelement <2 x bfloat> %146, i64 0, !dbg !31 + %148 = extractelement <2 x bfloat> %146, i64 1, !dbg !31 + %149 = extractvalue { i32, i32, i32, i32 } %144, 1, !dbg !31 + %150 = bitcast i32 %149 to <2 x bfloat>, !dbg !31 + %151 = extractelement <2 x bfloat> %150, i64 0, !dbg !31 + %152 = extractelement <2 x bfloat> %150, i64 1, !dbg !31 + %153 = extractvalue { i32, i32, i32, i32 } %144, 2, !dbg !31 + %154 = bitcast i32 %153 to <2 x bfloat>, !dbg !31 + %155 = extractelement <2 x bfloat> %154, i64 0, !dbg !31 + %156 = extractelement <2 x bfloat> %154, i64 1, !dbg !31 + %157 = extractvalue { i32, i32, i32, i32 } %144, 3, !dbg !31 + %158 = bitcast i32 %157 to <2 x bfloat>, !dbg !31 + %159 = extractelement <2 x bfloat> %158, i64 0, !dbg !31 + %160 = extractelement <2 x bfloat> %158, i64 1, !dbg !31 + %161 = getelementptr inbounds nuw i8, ptr addrspace(3) %143, i32 512, !dbg !31 + %162 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.trans.b16.p3(ptr addrspace(3) nonnull %161), !dbg !31 + %163 = extractvalue { i32, i32, i32, i32 } %162, 0, !dbg !31 + %164 = bitcast i32 %163 to <2 x bfloat>, !dbg !31 + %165 = extractelement <2 x bfloat> %164, i64 0, !dbg !31 + %166 = extractelement <2 x bfloat> %164, i64 1, !dbg !31 + %167 = extractvalue { i32, i32, i32, i32 } %162, 1, !dbg !31 + %168 = bitcast i32 %167 to <2 x bfloat>, !dbg !31 + %169 = extractelement <2 x bfloat> %168, i64 0, !dbg !31 + %170 = extractelement <2 x bfloat> %168, i64 1, !dbg !31 + %171 = extractvalue { i32, i32, i32, i32 } %162, 2, !dbg !31 + %172 = bitcast i32 %171 to <2 x bfloat>, !dbg !31 + %173 = extractelement <2 x bfloat> %172, i64 0, !dbg !31 + %174 = extractelement <2 x bfloat> %172, i64 1, !dbg !31 + %175 = extractvalue { i32, i32, i32, i32 } %162, 3, !dbg !31 + %176 = bitcast i32 %175 to <2 x bfloat>, !dbg !31 + %177 = extractelement <2 x bfloat> %176, i64 0, !dbg !31 + %178 = extractelement <2 x bfloat> %176, i64 1, !dbg !31 + %179 = fpext bfloat %147 to float, !dbg !31 + %180 = fpext bfloat %148 to float, !dbg !31 + %181 = fpext bfloat %151 to float, !dbg !31 + %182 = fpext bfloat %152 to float, !dbg !31 + %183 = fpext bfloat %155 to float, !dbg !31 + %184 = fpext bfloat %156 to float, !dbg !31 + %185 = fpext bfloat %159 to float, !dbg !31 + %186 = fpext bfloat %160 to float, !dbg !31 + %187 = fpext bfloat %165 to float, !dbg !31 + %188 = fpext bfloat %166 to float, !dbg !31 + %189 = fpext bfloat %169 to float, !dbg !31 + %190 = fpext bfloat %170 to float, !dbg !31 + %191 = fpext bfloat %173 to float, !dbg !31 + %192 = fpext bfloat %174 to float, !dbg !31 + %193 = fpext bfloat %177 to float, !dbg !31 + %194 = fpext bfloat %178 to float, !dbg !31 + %195 = sext i32 %25 to i64, !dbg !32 + %196 = getelementptr float, ptr addrspace(1) %1, i64 %195, !dbg !32 + %197 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !33 + %198 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %196, i64 %197, i1 %55) #6, !dbg !33 + %199 = extractvalue { i32, i32, i32, i32 } %198, 0, !dbg !33 + %200 = extractvalue { i32, i32, i32, i32 } %198, 1, !dbg !33 + %201 = extractvalue { i32, i32, i32, i32 } %198, 2, !dbg !33 + %202 = extractvalue { i32, i32, i32, i32 } %198, 3, !dbg !33 + %203 = bitcast i32 %199 to float, !dbg !33 + %204 = bitcast i32 %200 to float, !dbg !33 + %205 = bitcast i32 %201 to float, !dbg !33 + %206 = bitcast i32 %202 to float, !dbg !33 + %207 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !33 + %208 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %196, i64 %207, i1 %55) #6, !dbg !33 + %209 = extractvalue { i32, i32, i32, i32 } %208, 0, !dbg !33 + %210 = extractvalue { i32, i32, i32, i32 } %208, 1, !dbg !33 + %211 = extractvalue { i32, i32, i32, i32 } %208, 2, !dbg !33 + %212 = extractvalue { i32, i32, i32, i32 } %208, 3, !dbg !33 + %213 = bitcast i32 %209 to float, !dbg !33 + %214 = bitcast i32 %210 to float, !dbg !33 + %215 = bitcast i32 %211 to float, !dbg !33 + %216 = bitcast i32 %212 to float, !dbg !33 + %217 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !33 + %218 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %196, i64 %217, i1 %55) #6, !dbg !33 + %219 = extractvalue { i32, i32, i32, i32 } %218, 0, !dbg !33 + %220 = extractvalue { i32, i32, i32, i32 } %218, 1, !dbg !33 + %221 = extractvalue { i32, i32, i32, i32 } %218, 2, !dbg !33 + %222 = extractvalue { i32, i32, i32, i32 } %218, 3, !dbg !33 + %223 = bitcast i32 %219 to float, !dbg !33 + %224 = bitcast i32 %220 to float, !dbg !33 + %225 = bitcast i32 %221 to float, !dbg !33 + %226 = bitcast i32 %222 to float, !dbg !33 + %227 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !33 + %228 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %196, i64 %227, i1 %55) #6, !dbg !33 + %229 = extractvalue { i32, i32, i32, i32 } %228, 0, !dbg !33 + %230 = extractvalue { i32, i32, i32, i32 } %228, 1, !dbg !33 + %231 = extractvalue { i32, i32, i32, i32 } %228, 2, !dbg !33 + %232 = extractvalue { i32, i32, i32, i32 } %228, 3, !dbg !33 + %233 = bitcast i32 %229 to float, !dbg !33 + %234 = bitcast i32 %230 to float, !dbg !33 + %235 = bitcast i32 %231 to float, !dbg !33 + %236 = bitcast i32 %232 to float, !dbg !33 + %237 = tail call float @llvm.nvvm.div.full(float %203, float 1.280000e+02), !dbg !34 + %238 = tail call float @llvm.nvvm.div.full(float %204, float 1.280000e+02), !dbg !34 + %239 = tail call float @llvm.nvvm.div.full(float %205, float 1.280000e+02), !dbg !34 + %240 = tail call float @llvm.nvvm.div.full(float %206, float 1.280000e+02), !dbg !34 + %241 = tail call float @llvm.nvvm.div.full(float %213, float 1.280000e+02), !dbg !34 + %242 = tail call float @llvm.nvvm.div.full(float %214, float 1.280000e+02), !dbg !34 + %243 = tail call float @llvm.nvvm.div.full(float %215, float 1.280000e+02), !dbg !34 + %244 = tail call float @llvm.nvvm.div.full(float %216, float 1.280000e+02), !dbg !34 + %245 = tail call float @llvm.nvvm.div.full(float %223, float 1.280000e+02), !dbg !34 + %246 = tail call float @llvm.nvvm.div.full(float %224, float 1.280000e+02), !dbg !34 + %247 = tail call float @llvm.nvvm.div.full(float %225, float 1.280000e+02), !dbg !34 + %248 = tail call float @llvm.nvvm.div.full(float %226, float 1.280000e+02), !dbg !34 + %249 = tail call float @llvm.nvvm.div.full(float %233, float 1.280000e+02), !dbg !34 + %250 = tail call float @llvm.nvvm.div.full(float %234, float 1.280000e+02), !dbg !34 + %251 = tail call float @llvm.nvvm.div.full(float %235, float 1.280000e+02), !dbg !34 + %252 = tail call float @llvm.nvvm.div.full(float %236, float 1.280000e+02), !dbg !34 + %253 = fadd float %237, 0x3EB0C6F7A0000000, !dbg !35 + %254 = fadd float %238, 0x3EB0C6F7A0000000, !dbg !35 + %255 = fadd float %239, 0x3EB0C6F7A0000000, !dbg !35 + %256 = fadd float %240, 0x3EB0C6F7A0000000, !dbg !35 + %257 = fadd float %241, 0x3EB0C6F7A0000000, !dbg !35 + %258 = fadd float %242, 0x3EB0C6F7A0000000, !dbg !35 + %259 = fadd float %243, 0x3EB0C6F7A0000000, !dbg !35 + %260 = fadd float %244, 0x3EB0C6F7A0000000, !dbg !35 + %261 = fadd float %245, 0x3EB0C6F7A0000000, !dbg !35 + %262 = fadd float %246, 0x3EB0C6F7A0000000, !dbg !35 + %263 = fadd float %247, 0x3EB0C6F7A0000000, !dbg !35 + %264 = fadd float %248, 0x3EB0C6F7A0000000, !dbg !35 + %265 = fadd float %249, 0x3EB0C6F7A0000000, !dbg !35 + %266 = fadd float %250, 0x3EB0C6F7A0000000, !dbg !35 + %267 = fadd float %251, 0x3EB0C6F7A0000000, !dbg !35 + %268 = fadd float %252, 0x3EB0C6F7A0000000, !dbg !35 + %269 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i = icmp eq i32 %269, 0, !dbg !36 + br i1 %.not.i, label %272, label %270, !dbg !36 + +270: ; preds = %11 + %271 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %253), !dbg !36 + br label %__nv_rsqrtf.exit, !dbg !36 + +272: ; preds = %11 + %273 = tail call float @llvm.nvvm.rsqrt.approx.f(float %253), !dbg !36 + br label %__nv_rsqrtf.exit, !dbg !36 + +__nv_rsqrtf.exit: ; preds = %270, %272 + %.0.i = phi float [ %271, %270 ], [ %273, %272 ], !dbg !36 + %274 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i16 = icmp eq i32 %274, 0, !dbg !36 + br i1 %.not.i16, label %277, label %275, !dbg !36 + +275: ; preds = %__nv_rsqrtf.exit + %276 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %254), !dbg !36 + br label %__nv_rsqrtf.exit18, !dbg !36 + +277: ; preds = %__nv_rsqrtf.exit + %278 = tail call float @llvm.nvvm.rsqrt.approx.f(float %254), !dbg !36 + br label %__nv_rsqrtf.exit18, !dbg !36 + +__nv_rsqrtf.exit18: ; preds = %275, %277 + %.0.i17 = phi float [ %276, %275 ], [ %278, %277 ], !dbg !36 + %279 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i19 = icmp eq i32 %279, 0, !dbg !36 + br i1 %.not.i19, label %282, label %280, !dbg !36 + +280: ; preds = %__nv_rsqrtf.exit18 + %281 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %255), !dbg !36 + br label %__nv_rsqrtf.exit21, !dbg !36 + +282: ; preds = %__nv_rsqrtf.exit18 + %283 = tail call float @llvm.nvvm.rsqrt.approx.f(float %255), !dbg !36 + br label %__nv_rsqrtf.exit21, !dbg !36 + +__nv_rsqrtf.exit21: ; preds = %280, %282 + %.0.i20 = phi float [ %281, %280 ], [ %283, %282 ], !dbg !36 + %284 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i22 = icmp eq i32 %284, 0, !dbg !36 + br i1 %.not.i22, label %287, label %285, !dbg !36 + +285: ; preds = %__nv_rsqrtf.exit21 + %286 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %256), !dbg !36 + br label %__nv_rsqrtf.exit24, !dbg !36 + +287: ; preds = %__nv_rsqrtf.exit21 + %288 = tail call float @llvm.nvvm.rsqrt.approx.f(float %256), !dbg !36 + br label %__nv_rsqrtf.exit24, !dbg !36 + +__nv_rsqrtf.exit24: ; preds = %285, %287 + %.0.i23 = phi float [ %286, %285 ], [ %288, %287 ], !dbg !36 + %289 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i25 = icmp eq i32 %289, 0, !dbg !36 + br i1 %.not.i25, label %292, label %290, !dbg !36 + +290: ; preds = %__nv_rsqrtf.exit24 + %291 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %257), !dbg !36 + br label %__nv_rsqrtf.exit27, !dbg !36 + +292: ; preds = %__nv_rsqrtf.exit24 + %293 = tail call float @llvm.nvvm.rsqrt.approx.f(float %257), !dbg !36 + br label %__nv_rsqrtf.exit27, !dbg !36 + +__nv_rsqrtf.exit27: ; preds = %290, %292 + %.0.i26 = phi float [ %291, %290 ], [ %293, %292 ], !dbg !36 + %294 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i28 = icmp eq i32 %294, 0, !dbg !36 + br i1 %.not.i28, label %297, label %295, !dbg !36 + +295: ; preds = %__nv_rsqrtf.exit27 + %296 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %258), !dbg !36 + br label %__nv_rsqrtf.exit30, !dbg !36 + +297: ; preds = %__nv_rsqrtf.exit27 + %298 = tail call float @llvm.nvvm.rsqrt.approx.f(float %258), !dbg !36 + br label %__nv_rsqrtf.exit30, !dbg !36 + +__nv_rsqrtf.exit30: ; preds = %295, %297 + %.0.i29 = phi float [ %296, %295 ], [ %298, %297 ], !dbg !36 + %299 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i31 = icmp eq i32 %299, 0, !dbg !36 + br i1 %.not.i31, label %302, label %300, !dbg !36 + +300: ; preds = %__nv_rsqrtf.exit30 + %301 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %259), !dbg !36 + br label %__nv_rsqrtf.exit33, !dbg !36 + +302: ; preds = %__nv_rsqrtf.exit30 + %303 = tail call float @llvm.nvvm.rsqrt.approx.f(float %259), !dbg !36 + br label %__nv_rsqrtf.exit33, !dbg !36 + +__nv_rsqrtf.exit33: ; preds = %300, %302 + %.0.i32 = phi float [ %301, %300 ], [ %303, %302 ], !dbg !36 + %304 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i34 = icmp eq i32 %304, 0, !dbg !36 + br i1 %.not.i34, label %307, label %305, !dbg !36 + +305: ; preds = %__nv_rsqrtf.exit33 + %306 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %260), !dbg !36 + br label %__nv_rsqrtf.exit36, !dbg !36 + +307: ; preds = %__nv_rsqrtf.exit33 + %308 = tail call float @llvm.nvvm.rsqrt.approx.f(float %260), !dbg !36 + br label %__nv_rsqrtf.exit36, !dbg !36 + +__nv_rsqrtf.exit36: ; preds = %305, %307 + %.0.i35 = phi float [ %306, %305 ], [ %308, %307 ], !dbg !36 + %309 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i37 = icmp eq i32 %309, 0, !dbg !36 + br i1 %.not.i37, label %312, label %310, !dbg !36 + +310: ; preds = %__nv_rsqrtf.exit36 + %311 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %261), !dbg !36 + br label %__nv_rsqrtf.exit39, !dbg !36 + +312: ; preds = %__nv_rsqrtf.exit36 + %313 = tail call float @llvm.nvvm.rsqrt.approx.f(float %261), !dbg !36 + br label %__nv_rsqrtf.exit39, !dbg !36 + +__nv_rsqrtf.exit39: ; preds = %310, %312 + %.0.i38 = phi float [ %311, %310 ], [ %313, %312 ], !dbg !36 + %314 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i40 = icmp eq i32 %314, 0, !dbg !36 + br i1 %.not.i40, label %317, label %315, !dbg !36 + +315: ; preds = %__nv_rsqrtf.exit39 + %316 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %262), !dbg !36 + br label %__nv_rsqrtf.exit42, !dbg !36 + +317: ; preds = %__nv_rsqrtf.exit39 + %318 = tail call float @llvm.nvvm.rsqrt.approx.f(float %262), !dbg !36 + br label %__nv_rsqrtf.exit42, !dbg !36 + +__nv_rsqrtf.exit42: ; preds = %315, %317 + %.0.i41 = phi float [ %316, %315 ], [ %318, %317 ], !dbg !36 + %319 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i43 = icmp eq i32 %319, 0, !dbg !36 + br i1 %.not.i43, label %322, label %320, !dbg !36 + +320: ; preds = %__nv_rsqrtf.exit42 + %321 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %263), !dbg !36 + br label %__nv_rsqrtf.exit45, !dbg !36 + +322: ; preds = %__nv_rsqrtf.exit42 + %323 = tail call float @llvm.nvvm.rsqrt.approx.f(float %263), !dbg !36 + br label %__nv_rsqrtf.exit45, !dbg !36 + +__nv_rsqrtf.exit45: ; preds = %320, %322 + %.0.i44 = phi float [ %321, %320 ], [ %323, %322 ], !dbg !36 + %324 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i46 = icmp eq i32 %324, 0, !dbg !36 + br i1 %.not.i46, label %327, label %325, !dbg !36 + +325: ; preds = %__nv_rsqrtf.exit45 + %326 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %264), !dbg !36 + br label %__nv_rsqrtf.exit48, !dbg !36 + +327: ; preds = %__nv_rsqrtf.exit45 + %328 = tail call float @llvm.nvvm.rsqrt.approx.f(float %264), !dbg !36 + br label %__nv_rsqrtf.exit48, !dbg !36 + +__nv_rsqrtf.exit48: ; preds = %325, %327 + %.0.i47 = phi float [ %326, %325 ], [ %328, %327 ], !dbg !36 + %329 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i49 = icmp eq i32 %329, 0, !dbg !36 + br i1 %.not.i49, label %332, label %330, !dbg !36 + +330: ; preds = %__nv_rsqrtf.exit48 + %331 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %265), !dbg !36 + br label %__nv_rsqrtf.exit51, !dbg !36 + +332: ; preds = %__nv_rsqrtf.exit48 + %333 = tail call float @llvm.nvvm.rsqrt.approx.f(float %265), !dbg !36 + br label %__nv_rsqrtf.exit51, !dbg !36 + +__nv_rsqrtf.exit51: ; preds = %330, %332 + %.0.i50 = phi float [ %331, %330 ], [ %333, %332 ], !dbg !36 + %334 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i52 = icmp eq i32 %334, 0, !dbg !36 + br i1 %.not.i52, label %337, label %335, !dbg !36 + +335: ; preds = %__nv_rsqrtf.exit51 + %336 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %266), !dbg !36 + br label %__nv_rsqrtf.exit54, !dbg !36 + +337: ; preds = %__nv_rsqrtf.exit51 + %338 = tail call float @llvm.nvvm.rsqrt.approx.f(float %266), !dbg !36 + br label %__nv_rsqrtf.exit54, !dbg !36 + +__nv_rsqrtf.exit54: ; preds = %335, %337 + %.0.i53 = phi float [ %336, %335 ], [ %338, %337 ], !dbg !36 + %339 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i55 = icmp eq i32 %339, 0, !dbg !36 + br i1 %.not.i55, label %342, label %340, !dbg !36 + +340: ; preds = %__nv_rsqrtf.exit54 + %341 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %267), !dbg !36 + br label %__nv_rsqrtf.exit57, !dbg !36 + +342: ; preds = %__nv_rsqrtf.exit54 + %343 = tail call float @llvm.nvvm.rsqrt.approx.f(float %267), !dbg !36 + br label %__nv_rsqrtf.exit57, !dbg !36 + +__nv_rsqrtf.exit57: ; preds = %340, %342 + %.0.i56 = phi float [ %341, %340 ], [ %343, %342 ], !dbg !36 + %344 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i58 = icmp eq i32 %344, 0, !dbg !36 + br i1 %.not.i58, label %347, label %345, !dbg !36 + +345: ; preds = %__nv_rsqrtf.exit57 + %346 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %268), !dbg !36 + br label %__nv_rsqrtf.exit60, !dbg !36 + +347: ; preds = %__nv_rsqrtf.exit57 + %348 = tail call float @llvm.nvvm.rsqrt.approx.f(float %268), !dbg !36 + br label %__nv_rsqrtf.exit60, !dbg !36 + +__nv_rsqrtf.exit60: ; preds = %345, %347 + %.0.i59 = phi float [ %346, %345 ], [ %348, %347 ], !dbg !36 + %349 = fmul float %.0.i, %179, !dbg !37 + %350 = fmul float %.0.i17, %180, !dbg !37 + %351 = fmul float %.0.i20, %181, !dbg !37 + %352 = fmul float %.0.i23, %182, !dbg !37 + %353 = fmul float %.0.i26, %183, !dbg !37 + %354 = fmul float %.0.i29, %184, !dbg !37 + %355 = fmul float %.0.i32, %185, !dbg !37 + %356 = fmul float %.0.i35, %186, !dbg !37 + %357 = fmul float %.0.i38, %187, !dbg !37 + %358 = fmul float %.0.i41, %188, !dbg !37 + %359 = fmul float %.0.i44, %189, !dbg !37 + %360 = fmul float %.0.i47, %190, !dbg !37 + %361 = fmul float %.0.i50, %191, !dbg !37 + %362 = fmul float %.0.i53, %192, !dbg !37 + %363 = fmul float %.0.i56, %193, !dbg !37 + %364 = fmul float %.0.i59, %194, !dbg !37 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37 + %365 = shl nuw nsw i32 %18, 9, !dbg !37 + %366 = and i32 %365, 15360, !dbg !37 + %367 = lshr i32 %18, 1, !dbg !37 + %368 = and i32 %367, 108, !dbg !37 + %369 = or disjoint i32 %366, %86, !dbg !37 + %370 = xor i32 %369, %368, !dbg !37 + %371 = or disjoint i32 %370, %138, !dbg !37 + %372 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %371, !dbg !37 + store float %349, ptr addrspace(3) %372, align 4, !dbg !37 + %373 = getelementptr inbounds nuw i8, ptr addrspace(3) %372, i32 128, !dbg !37 + store float %351, ptr addrspace(3) %373, align 4, !dbg !37 + %374 = xor i32 %371, 528, !dbg !37 + %375 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %374, !dbg !37 + store float %350, ptr addrspace(3) %375, align 4, !dbg !37 + %376 = getelementptr inbounds nuw i8, ptr addrspace(3) %375, i32 128, !dbg !37 + store float %352, ptr addrspace(3) %376, align 4, !dbg !37 + %377 = xor i32 %371, 4, !dbg !37 + %378 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %377, !dbg !37 + store float %353, ptr addrspace(3) %378, align 4, !dbg !37 + %379 = getelementptr inbounds nuw i8, ptr addrspace(3) %378, i32 128, !dbg !37 + store float %355, ptr addrspace(3) %379, align 4, !dbg !37 + %380 = xor i32 %371, 532, !dbg !37 + %381 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %380, !dbg !37 + store float %354, ptr addrspace(3) %381, align 4, !dbg !37 + %382 = getelementptr inbounds nuw i8, ptr addrspace(3) %381, i32 128, !dbg !37 + store float %356, ptr addrspace(3) %382, align 4, !dbg !37 + %383 = xor i32 %371, 8, !dbg !37 + %384 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %383, !dbg !37 + store float %357, ptr addrspace(3) %384, align 4, !dbg !37 + %385 = getelementptr inbounds nuw i8, ptr addrspace(3) %384, i32 128, !dbg !37 + store float %359, ptr addrspace(3) %385, align 4, !dbg !37 + %386 = xor i32 %371, 536, !dbg !37 + %387 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %386, !dbg !37 + store float %358, ptr addrspace(3) %387, align 4, !dbg !37 + %388 = getelementptr inbounds nuw i8, ptr addrspace(3) %387, i32 128, !dbg !37 + store float %360, ptr addrspace(3) %388, align 4, !dbg !37 + %389 = xor i32 %371, 12, !dbg !37 + %390 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %389, !dbg !37 + store float %361, ptr addrspace(3) %390, align 4, !dbg !37 + %391 = getelementptr inbounds nuw i8, ptr addrspace(3) %390, i32 128, !dbg !37 + store float %363, ptr addrspace(3) %391, align 4, !dbg !37 + %392 = xor i32 %371, 540, !dbg !37 + %393 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %392, !dbg !37 + store float %362, ptr addrspace(3) %393, align 4, !dbg !37 + %394 = getelementptr inbounds nuw i8, ptr addrspace(3) %393, i32 128, !dbg !37 + store float %364, ptr addrspace(3) %394, align 4, !dbg !37 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37 + %395 = shl nuw nsw i32 %18, 5, !dbg !37 + %396 = and i32 %395, 608, !dbg !37 + %397 = and i32 %18, 28, !dbg !37 + %398 = lshr i32 %18, 2, !dbg !37 + %399 = and i32 %398, 16, !dbg !37 + %400 = and i32 %18, 128, !dbg !37 + %401 = icmp eq i32 %400, 0, !dbg !37 + %402 = select i1 %401, i32 0, i32 1056, !dbg !37 + %403 = or disjoint i32 %396, %397, !dbg !37 + %404 = or disjoint i32 %402, %399, !dbg !37 + %405 = xor i32 %404, %403, !dbg !37 + %406 = or disjoint i32 %405, %94, !dbg !37 + %407 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %406, !dbg !37 + %408 = load float, ptr addrspace(3) %407, align 4, !dbg !37 + %409 = getelementptr inbounds nuw i8, ptr addrspace(3) %407, i32 256, !dbg !37 + %410 = load float, ptr addrspace(3) %409, align 4, !dbg !37 + %411 = xor i32 %406, 4100, !dbg !37 + %412 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %411, !dbg !37 + %413 = load float, ptr addrspace(3) %412, align 4, !dbg !37 + %414 = getelementptr inbounds nuw i8, ptr addrspace(3) %412, i32 256, !dbg !37 + %415 = load float, ptr addrspace(3) %414, align 4, !dbg !37 + %416 = xor i32 %406, 8200, !dbg !37 + %417 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %416, !dbg !37 + %418 = load float, ptr addrspace(3) %417, align 4, !dbg !37 + %419 = getelementptr inbounds nuw i8, ptr addrspace(3) %417, i32 256, !dbg !37 + %420 = load float, ptr addrspace(3) %419, align 4, !dbg !37 + %421 = xor i32 %406, 12300, !dbg !37 + %422 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %421, !dbg !37 + %423 = load float, ptr addrspace(3) %422, align 4, !dbg !37 + %424 = getelementptr inbounds nuw i8, ptr addrspace(3) %422, i32 256, !dbg !37 + %425 = load float, ptr addrspace(3) %424, align 4, !dbg !37 + %426 = xor i32 %406, 2112, !dbg !37 + %427 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %426, !dbg !37 + %428 = load float, ptr addrspace(3) %427, align 4, !dbg !37 + %429 = getelementptr inbounds nuw i8, ptr addrspace(3) %427, i32 256, !dbg !37 + %430 = load float, ptr addrspace(3) %429, align 4, !dbg !37 + %431 = xor i32 %406, 6212, !dbg !37 + %432 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %431, !dbg !37 + %433 = load float, ptr addrspace(3) %432, align 4, !dbg !37 + %434 = getelementptr inbounds nuw i8, ptr addrspace(3) %432, i32 256, !dbg !37 + %435 = load float, ptr addrspace(3) %434, align 4, !dbg !37 + %436 = xor i32 %406, 10312, !dbg !37 + %437 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %436, !dbg !37 + %438 = load float, ptr addrspace(3) %437, align 4, !dbg !37 + %439 = getelementptr inbounds nuw i8, ptr addrspace(3) %437, i32 256, !dbg !37 + %440 = load float, ptr addrspace(3) %439, align 4, !dbg !37 + %441 = xor i32 %406, 14412, !dbg !37 + %442 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %441, !dbg !37 + %443 = load float, ptr addrspace(3) %442, align 4, !dbg !37 + %444 = getelementptr inbounds nuw i8, ptr addrspace(3) %442, i32 256, !dbg !37 + %445 = load float, ptr addrspace(3) %444, align 4, !dbg !37 + %446 = sext i32 %32 to i64, !dbg !38 + %447 = getelementptr bfloat, ptr addrspace(1) %2, i64 %446, !dbg !38 + %448 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !39 + %449 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %447, i64 %448, i1 %54) #6, !dbg !39 + %450 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !39 + %451 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %447, i64 %450, i1 %54) #6, !dbg !39 + %452 = add i32 %47, -3145728, !dbg !40 + %453 = add i32 %45, %452, !dbg !41 + %454 = add i32 %46, %452, !dbg !41 + %455 = sext i32 %453 to i64, !dbg !42 + %456 = getelementptr bfloat, ptr addrspace(1) %3, i64 %455, !dbg !42 + %457 = sext i32 %454 to i64, !dbg !42 + %458 = getelementptr bfloat, ptr addrspace(1) %3, i64 %457, !dbg !42 + %459 = add i32 %17, -8192, !dbg !43 + %460 = icmp ult i32 %459, 65536, !dbg !43 + %461 = and i1 %34, %460, !dbg !43 + %462 = and i1 %35, %460, !dbg !43 + %463 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !44 + %464 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %456, i64 %463, i1 %461) #6, !dbg !44 + %465 = extractvalue { i32, i32, i32, i32 } %464, 0, !dbg !44 + %466 = extractvalue { i32, i32, i32, i32 } %464, 1, !dbg !44 + %467 = extractvalue { i32, i32, i32, i32 } %464, 2, !dbg !44 + %468 = extractvalue { i32, i32, i32, i32 } %464, 3, !dbg !44 + %469 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !44 + %470 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %458, i64 %469, i1 %461) #6, !dbg !44 + %471 = extractvalue { i32, i32, i32, i32 } %470, 0, !dbg !44 + %472 = extractvalue { i32, i32, i32, i32 } %470, 1, !dbg !44 + %473 = extractvalue { i32, i32, i32, i32 } %470, 2, !dbg !44 + %474 = extractvalue { i32, i32, i32, i32 } %470, 3, !dbg !44 + %475 = insertelement <2 x i32> poison, i32 %465, i64 0, !dbg !44 + %476 = insertelement <2 x i32> %475, i32 %471, i64 1, !dbg !44 + %477 = lshr <2 x i32> %476, splat (i32 16), !dbg !44 + %478 = trunc nuw <2 x i32> %477 to <2 x i16>, !dbg !44 + %479 = insertelement <2 x i32> poison, i32 %466, i64 0, !dbg !44 + %480 = insertelement <2 x i32> %479, i32 %472, i64 1, !dbg !44 + %481 = lshr <2 x i32> %480, splat (i32 16), !dbg !44 + %482 = trunc nuw <2 x i32> %481 to <2 x i16>, !dbg !44 + %483 = insertelement <2 x i32> poison, i32 %467, i64 0, !dbg !44 + %484 = insertelement <2 x i32> %483, i32 %473, i64 1, !dbg !44 + %485 = lshr <2 x i32> %484, splat (i32 16), !dbg !44 + %486 = trunc nuw <2 x i32> %485 to <2 x i16>, !dbg !44 + %487 = insertelement <2 x i32> poison, i32 %468, i64 0, !dbg !44 + %488 = insertelement <2 x i32> %487, i32 %474, i64 1, !dbg !44 + %489 = lshr <2 x i32> %488, splat (i32 16), !dbg !44 + %490 = trunc nuw <2 x i32> %489 to <2 x i16>, !dbg !44 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45 + %491 = trunc i32 %465 to i16, !dbg !45 + %492 = trunc i32 %471 to i16, !dbg !45 + %493 = trunc i32 %466 to i16, !dbg !45 + %494 = trunc i32 %472 to i16, !dbg !45 + %495 = insertelement <2 x i16> poison, i16 %491, i64 0, !dbg !45 + %496 = insertelement <2 x i16> %495, i16 %492, i64 1, !dbg !45 + %497 = bitcast <2 x i16> %496 to i32, !dbg !45 + %498 = bitcast <2 x i16> %478 to i32, !dbg !45 + %499 = insertelement <2 x i16> poison, i16 %493, i64 0, !dbg !45 + %500 = insertelement <2 x i16> %499, i16 %494, i64 1, !dbg !45 + %501 = bitcast <2 x i16> %500 to i32, !dbg !45 + %502 = bitcast <2 x i16> %482 to i32, !dbg !45 + %503 = insertelement <4 x i32> poison, i32 %497, i64 0, !dbg !45 + %504 = insertelement <4 x i32> %503, i32 %498, i64 1, !dbg !45 + %505 = insertelement <4 x i32> %504, i32 %501, i64 2, !dbg !45 + %506 = insertelement <4 x i32> %505, i32 %502, i64 3, !dbg !45 + store <4 x i32> %506, ptr addrspace(3) %100, align 16, !dbg !45 + %507 = trunc i32 %467 to i16, !dbg !45 + %508 = trunc i32 %473 to i16, !dbg !45 + %509 = trunc i32 %468 to i16, !dbg !45 + %510 = trunc i32 %474 to i16, !dbg !45 + %511 = insertelement <2 x i16> poison, i16 %507, i64 0, !dbg !45 + %512 = insertelement <2 x i16> %511, i16 %508, i64 1, !dbg !45 + %513 = bitcast <2 x i16> %512 to i32, !dbg !45 + %514 = bitcast <2 x i16> %486 to i32, !dbg !45 + %515 = insertelement <2 x i16> poison, i16 %509, i64 0, !dbg !45 + %516 = insertelement <2 x i16> %515, i16 %510, i64 1, !dbg !45 + %517 = bitcast <2 x i16> %516 to i32, !dbg !45 + %518 = bitcast <2 x i16> %490 to i32, !dbg !45 + %519 = insertelement <4 x i32> poison, i32 %513, i64 0, !dbg !45 + %520 = insertelement <4 x i32> %519, i32 %514, i64 1, !dbg !45 + %521 = insertelement <4 x i32> %520, i32 %517, i64 2, !dbg !45 + %522 = insertelement <4 x i32> %521, i32 %518, i64 3, !dbg !45 + store <4 x i32> %522, ptr addrspace(3) %117, align 16, !dbg !45 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45 + %523 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.trans.b16.p3(ptr addrspace(3) %143), !dbg !45 + %524 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.trans.b16.p3(ptr addrspace(3) nonnull %161), !dbg !45 + %525 = shl nsw i32 %37, 5, !dbg !46 + %526 = add nsw i32 %.decomposed109, -8192, !dbg !46 + %527 = add i32 %526, %525, !dbg !47 + %528 = sext i32 %527 to i64, !dbg !48 + %529 = getelementptr float, ptr addrspace(1) %4, i64 %528, !dbg !48 + %530 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !49 + %531 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %529, i64 %530, i1 %462) #6, !dbg !49 + %532 = extractvalue { i32, i32, i32, i32 } %531, 0, !dbg !49 + %533 = extractvalue { i32, i32, i32, i32 } %531, 1, !dbg !49 + %534 = extractvalue { i32, i32, i32, i32 } %531, 2, !dbg !49 + %535 = extractvalue { i32, i32, i32, i32 } %531, 3, !dbg !49 + %536 = bitcast i32 %532 to float, !dbg !49 + %537 = bitcast i32 %533 to float, !dbg !49 + %538 = bitcast i32 %534 to float, !dbg !49 + %539 = bitcast i32 %535 to float, !dbg !49 + %540 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !49 + %541 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %529, i64 %540, i1 %462) #6, !dbg !49 + %542 = extractvalue { i32, i32, i32, i32 } %541, 0, !dbg !49 + %543 = extractvalue { i32, i32, i32, i32 } %541, 1, !dbg !49 + %544 = extractvalue { i32, i32, i32, i32 } %541, 2, !dbg !49 + %545 = extractvalue { i32, i32, i32, i32 } %541, 3, !dbg !49 + %546 = bitcast i32 %542 to float, !dbg !49 + %547 = bitcast i32 %543 to float, !dbg !49 + %548 = bitcast i32 %544 to float, !dbg !49 + %549 = bitcast i32 %545 to float, !dbg !49 + %550 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !49 + %551 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %529, i64 %550, i1 %462) #6, !dbg !49 + %552 = extractvalue { i32, i32, i32, i32 } %551, 0, !dbg !49 + %553 = extractvalue { i32, i32, i32, i32 } %551, 1, !dbg !49 + %554 = extractvalue { i32, i32, i32, i32 } %551, 2, !dbg !49 + %555 = extractvalue { i32, i32, i32, i32 } %551, 3, !dbg !49 + %556 = bitcast i32 %552 to float, !dbg !49 + %557 = bitcast i32 %553 to float, !dbg !49 + %558 = bitcast i32 %554 to float, !dbg !49 + %559 = bitcast i32 %555 to float, !dbg !49 + %560 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !49 + %561 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %529, i64 %560, i1 %462) #6, !dbg !49 + %562 = extractvalue { i32, i32, i32, i32 } %561, 0, !dbg !49 + %563 = extractvalue { i32, i32, i32, i32 } %561, 1, !dbg !49 + %564 = extractvalue { i32, i32, i32, i32 } %561, 2, !dbg !49 + %565 = extractvalue { i32, i32, i32, i32 } %561, 3, !dbg !49 + %566 = bitcast i32 %562 to float, !dbg !49 + %567 = bitcast i32 %563 to float, !dbg !49 + %568 = bitcast i32 %564 to float, !dbg !49 + %569 = bitcast i32 %565 to float, !dbg !49 + %570 = tail call float @llvm.nvvm.div.full(float %536, float 1.280000e+02), !dbg !50 + %571 = tail call float @llvm.nvvm.div.full(float %537, float 1.280000e+02), !dbg !50 + %572 = tail call float @llvm.nvvm.div.full(float %538, float 1.280000e+02), !dbg !50 + %573 = tail call float @llvm.nvvm.div.full(float %539, float 1.280000e+02), !dbg !50 + %574 = tail call float @llvm.nvvm.div.full(float %546, float 1.280000e+02), !dbg !50 + %575 = tail call float @llvm.nvvm.div.full(float %547, float 1.280000e+02), !dbg !50 + %576 = tail call float @llvm.nvvm.div.full(float %548, float 1.280000e+02), !dbg !50 + %577 = tail call float @llvm.nvvm.div.full(float %549, float 1.280000e+02), !dbg !50 + %578 = tail call float @llvm.nvvm.div.full(float %556, float 1.280000e+02), !dbg !50 + %579 = tail call float @llvm.nvvm.div.full(float %557, float 1.280000e+02), !dbg !50 + %580 = tail call float @llvm.nvvm.div.full(float %558, float 1.280000e+02), !dbg !50 + %581 = tail call float @llvm.nvvm.div.full(float %559, float 1.280000e+02), !dbg !50 + %582 = tail call float @llvm.nvvm.div.full(float %566, float 1.280000e+02), !dbg !50 + %583 = tail call float @llvm.nvvm.div.full(float %567, float 1.280000e+02), !dbg !50 + %584 = tail call float @llvm.nvvm.div.full(float %568, float 1.280000e+02), !dbg !50 + %585 = tail call float @llvm.nvvm.div.full(float %569, float 1.280000e+02), !dbg !50 + %586 = fadd float %570, 0x3EB0C6F7A0000000, !dbg !51 + %587 = fadd float %571, 0x3EB0C6F7A0000000, !dbg !51 + %588 = fadd float %572, 0x3EB0C6F7A0000000, !dbg !51 + %589 = fadd float %573, 0x3EB0C6F7A0000000, !dbg !51 + %590 = fadd float %574, 0x3EB0C6F7A0000000, !dbg !51 + %591 = fadd float %575, 0x3EB0C6F7A0000000, !dbg !51 + %592 = fadd float %576, 0x3EB0C6F7A0000000, !dbg !51 + %593 = fadd float %577, 0x3EB0C6F7A0000000, !dbg !51 + %594 = fadd float %578, 0x3EB0C6F7A0000000, !dbg !51 + %595 = fadd float %579, 0x3EB0C6F7A0000000, !dbg !51 + %596 = fadd float %580, 0x3EB0C6F7A0000000, !dbg !51 + %597 = fadd float %581, 0x3EB0C6F7A0000000, !dbg !51 + %598 = fadd float %582, 0x3EB0C6F7A0000000, !dbg !51 + %599 = fadd float %583, 0x3EB0C6F7A0000000, !dbg !51 + %600 = fadd float %584, 0x3EB0C6F7A0000000, !dbg !51 + %601 = fadd float %585, 0x3EB0C6F7A0000000, !dbg !51 + %602 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i61 = icmp eq i32 %602, 0, !dbg !52 + br i1 %.not.i61, label %605, label %603, !dbg !52 + +603: ; preds = %__nv_rsqrtf.exit60 + %604 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %586), !dbg !52 + br label %__nv_rsqrtf.exit63, !dbg !52 + +605: ; preds = %__nv_rsqrtf.exit60 + %606 = tail call float @llvm.nvvm.rsqrt.approx.f(float %586), !dbg !52 + br label %__nv_rsqrtf.exit63, !dbg !52 + +__nv_rsqrtf.exit63: ; preds = %603, %605 + %.0.i62 = phi float [ %604, %603 ], [ %606, %605 ], !dbg !52 + %607 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i64 = icmp eq i32 %607, 0, !dbg !52 + br i1 %.not.i64, label %610, label %608, !dbg !52 + +608: ; preds = %__nv_rsqrtf.exit63 + %609 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %587), !dbg !52 + br label %__nv_rsqrtf.exit66, !dbg !52 + +610: ; preds = %__nv_rsqrtf.exit63 + %611 = tail call float @llvm.nvvm.rsqrt.approx.f(float %587), !dbg !52 + br label %__nv_rsqrtf.exit66, !dbg !52 + +__nv_rsqrtf.exit66: ; preds = %608, %610 + %.0.i65 = phi float [ %609, %608 ], [ %611, %610 ], !dbg !52 + %612 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i67 = icmp eq i32 %612, 0, !dbg !52 + br i1 %.not.i67, label %615, label %613, !dbg !52 + +613: ; preds = %__nv_rsqrtf.exit66 + %614 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %588), !dbg !52 + br label %__nv_rsqrtf.exit69, !dbg !52 + +615: ; preds = %__nv_rsqrtf.exit66 + %616 = tail call float @llvm.nvvm.rsqrt.approx.f(float %588), !dbg !52 + br label %__nv_rsqrtf.exit69, !dbg !52 + +__nv_rsqrtf.exit69: ; preds = %613, %615 + %.0.i68 = phi float [ %614, %613 ], [ %616, %615 ], !dbg !52 + %617 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i70 = icmp eq i32 %617, 0, !dbg !52 + br i1 %.not.i70, label %620, label %618, !dbg !52 + +618: ; preds = %__nv_rsqrtf.exit69 + %619 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %589), !dbg !52 + br label %__nv_rsqrtf.exit72, !dbg !52 + +620: ; preds = %__nv_rsqrtf.exit69 + %621 = tail call float @llvm.nvvm.rsqrt.approx.f(float %589), !dbg !52 + br label %__nv_rsqrtf.exit72, !dbg !52 + +__nv_rsqrtf.exit72: ; preds = %618, %620 + %.0.i71 = phi float [ %619, %618 ], [ %621, %620 ], !dbg !52 + %622 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i73 = icmp eq i32 %622, 0, !dbg !52 + br i1 %.not.i73, label %625, label %623, !dbg !52 + +623: ; preds = %__nv_rsqrtf.exit72 + %624 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %590), !dbg !52 + br label %__nv_rsqrtf.exit75, !dbg !52 + +625: ; preds = %__nv_rsqrtf.exit72 + %626 = tail call float @llvm.nvvm.rsqrt.approx.f(float %590), !dbg !52 + br label %__nv_rsqrtf.exit75, !dbg !52 + +__nv_rsqrtf.exit75: ; preds = %623, %625 + %.0.i74 = phi float [ %624, %623 ], [ %626, %625 ], !dbg !52 + %627 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i76 = icmp eq i32 %627, 0, !dbg !52 + br i1 %.not.i76, label %630, label %628, !dbg !52 + +628: ; preds = %__nv_rsqrtf.exit75 + %629 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %591), !dbg !52 + br label %__nv_rsqrtf.exit78, !dbg !52 + +630: ; preds = %__nv_rsqrtf.exit75 + %631 = tail call float @llvm.nvvm.rsqrt.approx.f(float %591), !dbg !52 + br label %__nv_rsqrtf.exit78, !dbg !52 + +__nv_rsqrtf.exit78: ; preds = %628, %630 + %.0.i77 = phi float [ %629, %628 ], [ %631, %630 ], !dbg !52 + %632 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i79 = icmp eq i32 %632, 0, !dbg !52 + br i1 %.not.i79, label %635, label %633, !dbg !52 + +633: ; preds = %__nv_rsqrtf.exit78 + %634 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %592), !dbg !52 + br label %__nv_rsqrtf.exit81, !dbg !52 + +635: ; preds = %__nv_rsqrtf.exit78 + %636 = tail call float @llvm.nvvm.rsqrt.approx.f(float %592), !dbg !52 + br label %__nv_rsqrtf.exit81, !dbg !52 + +__nv_rsqrtf.exit81: ; preds = %633, %635 + %.0.i80 = phi float [ %634, %633 ], [ %636, %635 ], !dbg !52 + %637 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i82 = icmp eq i32 %637, 0, !dbg !52 + br i1 %.not.i82, label %640, label %638, !dbg !52 + +638: ; preds = %__nv_rsqrtf.exit81 + %639 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %593), !dbg !52 + br label %__nv_rsqrtf.exit84, !dbg !52 + +640: ; preds = %__nv_rsqrtf.exit81 + %641 = tail call float @llvm.nvvm.rsqrt.approx.f(float %593), !dbg !52 + br label %__nv_rsqrtf.exit84, !dbg !52 + +__nv_rsqrtf.exit84: ; preds = %638, %640 + %.0.i83 = phi float [ %639, %638 ], [ %641, %640 ], !dbg !52 + %642 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i85 = icmp eq i32 %642, 0, !dbg !52 + br i1 %.not.i85, label %645, label %643, !dbg !52 + +643: ; preds = %__nv_rsqrtf.exit84 + %644 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %594), !dbg !52 + br label %__nv_rsqrtf.exit87, !dbg !52 + +645: ; preds = %__nv_rsqrtf.exit84 + %646 = tail call float @llvm.nvvm.rsqrt.approx.f(float %594), !dbg !52 + br label %__nv_rsqrtf.exit87, !dbg !52 + +__nv_rsqrtf.exit87: ; preds = %643, %645 + %.0.i86 = phi float [ %644, %643 ], [ %646, %645 ], !dbg !52 + %647 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i88 = icmp eq i32 %647, 0, !dbg !52 + br i1 %.not.i88, label %650, label %648, !dbg !52 + +648: ; preds = %__nv_rsqrtf.exit87 + %649 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %595), !dbg !52 + br label %__nv_rsqrtf.exit90, !dbg !52 + +650: ; preds = %__nv_rsqrtf.exit87 + %651 = tail call float @llvm.nvvm.rsqrt.approx.f(float %595), !dbg !52 + br label %__nv_rsqrtf.exit90, !dbg !52 + +__nv_rsqrtf.exit90: ; preds = %648, %650 + %.0.i89 = phi float [ %649, %648 ], [ %651, %650 ], !dbg !52 + %652 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i91 = icmp eq i32 %652, 0, !dbg !52 + br i1 %.not.i91, label %655, label %653, !dbg !52 + +653: ; preds = %__nv_rsqrtf.exit90 + %654 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %596), !dbg !52 + br label %__nv_rsqrtf.exit93, !dbg !52 + +655: ; preds = %__nv_rsqrtf.exit90 + %656 = tail call float @llvm.nvvm.rsqrt.approx.f(float %596), !dbg !52 + br label %__nv_rsqrtf.exit93, !dbg !52 + +__nv_rsqrtf.exit93: ; preds = %653, %655 + %.0.i92 = phi float [ %654, %653 ], [ %656, %655 ], !dbg !52 + %657 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i94 = icmp eq i32 %657, 0, !dbg !52 + br i1 %.not.i94, label %660, label %658, !dbg !52 + +658: ; preds = %__nv_rsqrtf.exit93 + %659 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %597), !dbg !52 + br label %__nv_rsqrtf.exit96, !dbg !52 + +660: ; preds = %__nv_rsqrtf.exit93 + %661 = tail call float @llvm.nvvm.rsqrt.approx.f(float %597), !dbg !52 + br label %__nv_rsqrtf.exit96, !dbg !52 + +__nv_rsqrtf.exit96: ; preds = %658, %660 + %.0.i95 = phi float [ %659, %658 ], [ %661, %660 ], !dbg !52 + %662 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i97 = icmp eq i32 %662, 0, !dbg !52 + br i1 %.not.i97, label %665, label %663, !dbg !52 + +663: ; preds = %__nv_rsqrtf.exit96 + %664 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %598), !dbg !52 + br label %__nv_rsqrtf.exit99, !dbg !52 + +665: ; preds = %__nv_rsqrtf.exit96 + %666 = tail call float @llvm.nvvm.rsqrt.approx.f(float %598), !dbg !52 + br label %__nv_rsqrtf.exit99, !dbg !52 + +__nv_rsqrtf.exit99: ; preds = %663, %665 + %.0.i98 = phi float [ %664, %663 ], [ %666, %665 ], !dbg !52 + %667 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i100 = icmp eq i32 %667, 0, !dbg !52 + br i1 %.not.i100, label %670, label %668, !dbg !52 + +668: ; preds = %__nv_rsqrtf.exit99 + %669 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %599), !dbg !52 + br label %__nv_rsqrtf.exit102, !dbg !52 + +670: ; preds = %__nv_rsqrtf.exit99 + %671 = tail call float @llvm.nvvm.rsqrt.approx.f(float %599), !dbg !52 + br label %__nv_rsqrtf.exit102, !dbg !52 + +__nv_rsqrtf.exit102: ; preds = %668, %670 + %.0.i101 = phi float [ %669, %668 ], [ %671, %670 ], !dbg !52 + %672 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i103 = icmp eq i32 %672, 0, !dbg !52 + br i1 %.not.i103, label %675, label %673, !dbg !52 + +673: ; preds = %__nv_rsqrtf.exit102 + %674 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %600), !dbg !52 + br label %__nv_rsqrtf.exit105, !dbg !52 + +675: ; preds = %__nv_rsqrtf.exit102 + %676 = tail call float @llvm.nvvm.rsqrt.approx.f(float %600), !dbg !52 + br label %__nv_rsqrtf.exit105, !dbg !52 + +__nv_rsqrtf.exit105: ; preds = %673, %675 + %.0.i104 = phi float [ %674, %673 ], [ %676, %675 ], !dbg !52 + %677 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52 + %.not.i106 = icmp eq i32 %677, 0, !dbg !52 + br i1 %.not.i106, label %680, label %678, !dbg !52 + +678: ; preds = %__nv_rsqrtf.exit105 + %679 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %601), !dbg !52 + br label %__nv_rsqrtf.exit108, !dbg !52 + +680: ; preds = %__nv_rsqrtf.exit105 + %681 = tail call float @llvm.nvvm.rsqrt.approx.f(float %601), !dbg !52 + br label %__nv_rsqrtf.exit108, !dbg !52 + +__nv_rsqrtf.exit108: ; preds = %678, %680 + %.0.i107 = phi float [ %679, %678 ], [ %681, %680 ], !dbg !52 + %682 = extractvalue { i32, i32, i32, i32 } %524, 3, !dbg !45 + %683 = bitcast i32 %682 to <2 x bfloat>, !dbg !45 + %684 = extractelement <2 x bfloat> %683, i64 1, !dbg !45 + %685 = fpext bfloat %684 to float, !dbg !45 + %686 = extractelement <2 x bfloat> %683, i64 0, !dbg !45 + %687 = fpext bfloat %686 to float, !dbg !45 + %688 = extractvalue { i32, i32, i32, i32 } %524, 2, !dbg !45 + %689 = bitcast i32 %688 to <2 x bfloat>, !dbg !45 + %690 = extractelement <2 x bfloat> %689, i64 1, !dbg !45 + %691 = fpext bfloat %690 to float, !dbg !45 + %692 = extractelement <2 x bfloat> %689, i64 0, !dbg !45 + %693 = fpext bfloat %692 to float, !dbg !45 + %694 = extractvalue { i32, i32, i32, i32 } %524, 1, !dbg !45 + %695 = bitcast i32 %694 to <2 x bfloat>, !dbg !45 + %696 = extractelement <2 x bfloat> %695, i64 1, !dbg !45 + %697 = fpext bfloat %696 to float, !dbg !45 + %698 = extractelement <2 x bfloat> %695, i64 0, !dbg !45 + %699 = fpext bfloat %698 to float, !dbg !45 + %700 = extractvalue { i32, i32, i32, i32 } %524, 0, !dbg !45 + %701 = bitcast i32 %700 to <2 x bfloat>, !dbg !45 + %702 = extractelement <2 x bfloat> %701, i64 1, !dbg !45 + %703 = fpext bfloat %702 to float, !dbg !45 + %704 = extractelement <2 x bfloat> %701, i64 0, !dbg !45 + %705 = fpext bfloat %704 to float, !dbg !45 + %706 = extractvalue { i32, i32, i32, i32 } %523, 3, !dbg !45 + %707 = bitcast i32 %706 to <2 x bfloat>, !dbg !45 + %708 = extractelement <2 x bfloat> %707, i64 1, !dbg !45 + %709 = fpext bfloat %708 to float, !dbg !45 + %710 = extractelement <2 x bfloat> %707, i64 0, !dbg !45 + %711 = fpext bfloat %710 to float, !dbg !45 + %712 = extractvalue { i32, i32, i32, i32 } %523, 2, !dbg !45 + %713 = bitcast i32 %712 to <2 x bfloat>, !dbg !45 + %714 = extractelement <2 x bfloat> %713, i64 1, !dbg !45 + %715 = fpext bfloat %714 to float, !dbg !45 + %716 = extractelement <2 x bfloat> %713, i64 0, !dbg !45 + %717 = fpext bfloat %716 to float, !dbg !45 + %718 = extractvalue { i32, i32, i32, i32 } %523, 1, !dbg !45 + %719 = bitcast i32 %718 to <2 x bfloat>, !dbg !45 + %720 = extractelement <2 x bfloat> %719, i64 1, !dbg !45 + %721 = fpext bfloat %720 to float, !dbg !45 + %722 = extractelement <2 x bfloat> %719, i64 0, !dbg !45 + %723 = fpext bfloat %722 to float, !dbg !45 + %724 = extractvalue { i32, i32, i32, i32 } %523, 0, !dbg !45 + %725 = bitcast i32 %724 to <2 x bfloat>, !dbg !45 + %726 = extractelement <2 x bfloat> %725, i64 1, !dbg !45 + %727 = fpext bfloat %726 to float, !dbg !45 + %728 = extractelement <2 x bfloat> %725, i64 0, !dbg !45 + %729 = fpext bfloat %728 to float, !dbg !45 + %730 = extractvalue { i32, i32, i32, i32 } %451, 3, !dbg !39 + %731 = bitcast i32 %730 to <2 x bfloat>, !dbg !39 + %732 = extractvalue { i32, i32, i32, i32 } %451, 2, !dbg !39 + %733 = bitcast i32 %732 to <2 x bfloat>, !dbg !39 + %734 = extractvalue { i32, i32, i32, i32 } %451, 1, !dbg !39 + %735 = bitcast i32 %734 to <2 x bfloat>, !dbg !39 + %736 = extractvalue { i32, i32, i32, i32 } %451, 0, !dbg !39 + %737 = bitcast i32 %736 to <2 x bfloat>, !dbg !39 + %738 = extractvalue { i32, i32, i32, i32 } %449, 3, !dbg !39 + %739 = bitcast i32 %738 to <2 x bfloat>, !dbg !39 + %740 = extractvalue { i32, i32, i32, i32 } %449, 2, !dbg !39 + %741 = bitcast i32 %740 to <2 x bfloat>, !dbg !39 + %742 = extractvalue { i32, i32, i32, i32 } %449, 1, !dbg !39 + %743 = bitcast i32 %742 to <2 x bfloat>, !dbg !39 + %744 = extractvalue { i32, i32, i32, i32 } %449, 0, !dbg !39 + %745 = bitcast i32 %744 to <2 x bfloat>, !dbg !39 + %746 = icmp slt i32 %23, 73728, !dbg !53 + %747 = fmul float %.0.i62, %729, !dbg !54 + %748 = fmul float %.0.i65, %727, !dbg !54 + %749 = fmul float %.0.i68, %723, !dbg !54 + %750 = fmul float %.0.i71, %721, !dbg !54 + %751 = fmul float %.0.i74, %717, !dbg !54 + %752 = fmul float %.0.i77, %715, !dbg !54 + %753 = fmul float %.0.i80, %711, !dbg !54 + %754 = fmul float %.0.i83, %709, !dbg !54 + %755 = fmul float %.0.i86, %705, !dbg !54 + %756 = fmul float %.0.i89, %703, !dbg !54 + %757 = fmul float %.0.i92, %699, !dbg !54 + %758 = fmul float %.0.i95, %697, !dbg !54 + %759 = fmul float %.0.i98, %693, !dbg !54 + %760 = fmul float %.0.i101, %691, !dbg !54 + %761 = fmul float %.0.i104, %687, !dbg !54 + %762 = fmul float %.0.i107, %685, !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54 + store float %747, ptr addrspace(3) %372, align 4, !dbg !54 + store float %749, ptr addrspace(3) %373, align 4, !dbg !54 + store float %748, ptr addrspace(3) %375, align 4, !dbg !54 + store float %750, ptr addrspace(3) %376, align 4, !dbg !54 + store float %751, ptr addrspace(3) %378, align 4, !dbg !54 + store float %753, ptr addrspace(3) %379, align 4, !dbg !54 + store float %752, ptr addrspace(3) %381, align 4, !dbg !54 + store float %754, ptr addrspace(3) %382, align 4, !dbg !54 + store float %755, ptr addrspace(3) %384, align 4, !dbg !54 + store float %757, ptr addrspace(3) %385, align 4, !dbg !54 + store float %756, ptr addrspace(3) %387, align 4, !dbg !54 + store float %758, ptr addrspace(3) %388, align 4, !dbg !54 + store float %759, ptr addrspace(3) %390, align 4, !dbg !54 + store float %761, ptr addrspace(3) %391, align 4, !dbg !54 + store float %760, ptr addrspace(3) %393, align 4, !dbg !54 + store float %762, ptr addrspace(3) %394, align 4, !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54 + %763 = load float, ptr addrspace(3) %407, align 4, !dbg !54 + %764 = load float, ptr addrspace(3) %409, align 4, !dbg !54 + %765 = load float, ptr addrspace(3) %412, align 4, !dbg !54 + %766 = load float, ptr addrspace(3) %414, align 4, !dbg !54 + %767 = load float, ptr addrspace(3) %417, align 4, !dbg !54 + %768 = load float, ptr addrspace(3) %419, align 4, !dbg !54 + %769 = load float, ptr addrspace(3) %422, align 4, !dbg !54 + %770 = load float, ptr addrspace(3) %424, align 4, !dbg !54 + %771 = load float, ptr addrspace(3) %427, align 4, !dbg !54 + %772 = load float, ptr addrspace(3) %429, align 4, !dbg !54 + %773 = load float, ptr addrspace(3) %432, align 4, !dbg !54 + %774 = load float, ptr addrspace(3) %434, align 4, !dbg !54 + %775 = load float, ptr addrspace(3) %437, align 4, !dbg !54 + %776 = load float, ptr addrspace(3) %439, align 4, !dbg !54 + %777 = load float, ptr addrspace(3) %442, align 4, !dbg !54 + %778 = load float, ptr addrspace(3) %444, align 4, !dbg !54 + %779 = getelementptr bfloat, ptr addrspace(1) %5, i64 %446, !dbg !55 + %780 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !56 + %781 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %779, i64 %780, i1 %461) #6, !dbg !56 + %782 = extractvalue { i32, i32, i32, i32 } %781, 0, !dbg !56 + %783 = bitcast i32 %782 to <2 x bfloat>, !dbg !56 + %784 = extractvalue { i32, i32, i32, i32 } %781, 1, !dbg !56 + %785 = bitcast i32 %784 to <2 x bfloat>, !dbg !56 + %786 = extractvalue { i32, i32, i32, i32 } %781, 2, !dbg !56 + %787 = bitcast i32 %786 to <2 x bfloat>, !dbg !56 + %788 = extractvalue { i32, i32, i32, i32 } %781, 3, !dbg !56 + %789 = bitcast i32 %788 to <2 x bfloat>, !dbg !56 + %790 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !56 + %791 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %779, i64 %790, i1 %461) #6, !dbg !56 + %792 = extractvalue { i32, i32, i32, i32 } %791, 0, !dbg !56 + %793 = bitcast i32 %792 to <2 x bfloat>, !dbg !56 + %794 = extractvalue { i32, i32, i32, i32 } %791, 1, !dbg !56 + %795 = bitcast i32 %794 to <2 x bfloat>, !dbg !56 + %796 = extractvalue { i32, i32, i32, i32 } %791, 2, !dbg !56 + %797 = bitcast i32 %796 to <2 x bfloat>, !dbg !56 + %798 = extractvalue { i32, i32, i32, i32 } %791, 3, !dbg !56 + %799 = bitcast i32 %798 to <2 x bfloat>, !dbg !56 + %800 = shl i32 %23, 7, !dbg !57 + %801 = shl i32 %24, 7, !dbg !57 + %802 = add i32 %800, %32, !dbg !58 + %803 = add i32 %801, %32, !dbg !58 + %804 = sext i32 %802 to i64, !dbg !59 + %805 = getelementptr bfloat, ptr addrspace(1) %6, i64 %804, !dbg !59 + %806 = sext i32 %803 to i64, !dbg !59 + %807 = getelementptr bfloat, ptr addrspace(1) %6, i64 %806, !dbg !59 + %808 = and i1 %34, %746, !dbg !60 + %809 = fpext <2 x bfloat> %745 to <2 x float>, !dbg !61 + %810 = insertelement <2 x float> poison, float %408, i64 0, !dbg !62 + %811 = insertelement <2 x float> %810, float %413, i64 1, !dbg !62 + %812 = fmul <2 x float> %811, %809, !dbg !62 + %813 = fpext <2 x bfloat> %783 to <2 x float>, !dbg !63 + %814 = insertelement <2 x float> poison, float %763, i64 0, !dbg !64 + %815 = insertelement <2 x float> %814, float %765, i64 1, !dbg !64 + %816 = fmul <2 x float> %815, %813, !dbg !64 + %817 = insertelement <2 x i1> poison, i1 %41, i64 0, !dbg !65 + %818 = shufflevector <2 x i1> %817, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !65 + %819 = select <2 x i1> %818, <2 x float> %812, <2 x float> %816, !dbg !65 + %820 = fptrunc <2 x float> %819 to <2 x bfloat>, !dbg !66 + %821 = fpext <2 x bfloat> %743 to <2 x float>, !dbg !61 + %822 = insertelement <2 x float> poison, float %418, i64 0, !dbg !62 + %823 = insertelement <2 x float> %822, float %423, i64 1, !dbg !62 + %824 = fmul <2 x float> %823, %821, !dbg !62 + %825 = fpext <2 x bfloat> %785 to <2 x float>, !dbg !63 + %826 = insertelement <2 x float> poison, float %767, i64 0, !dbg !64 + %827 = insertelement <2 x float> %826, float %769, i64 1, !dbg !64 + %828 = fmul <2 x float> %827, %825, !dbg !64 + %829 = select <2 x i1> %818, <2 x float> %824, <2 x float> %828, !dbg !65 + %830 = fptrunc <2 x float> %829 to <2 x bfloat>, !dbg !66 + %831 = fpext <2 x bfloat> %741 to <2 x float>, !dbg !61 + %832 = insertelement <2 x float> poison, float %410, i64 0, !dbg !62 + %833 = insertelement <2 x float> %832, float %415, i64 1, !dbg !62 + %834 = fmul <2 x float> %833, %831, !dbg !62 + %835 = fpext <2 x bfloat> %787 to <2 x float>, !dbg !63 + %836 = insertelement <2 x float> poison, float %764, i64 0, !dbg !64 + %837 = insertelement <2 x float> %836, float %766, i64 1, !dbg !64 + %838 = fmul <2 x float> %837, %835, !dbg !64 + %839 = select <2 x i1> %818, <2 x float> %834, <2 x float> %838, !dbg !65 + %840 = fptrunc <2 x float> %839 to <2 x bfloat>, !dbg !66 + %841 = fpext <2 x bfloat> %739 to <2 x float>, !dbg !61 + %842 = insertelement <2 x float> poison, float %420, i64 0, !dbg !62 + %843 = insertelement <2 x float> %842, float %425, i64 1, !dbg !62 + %844 = fmul <2 x float> %843, %841, !dbg !62 + %845 = fpext <2 x bfloat> %789 to <2 x float>, !dbg !63 + %846 = insertelement <2 x float> poison, float %768, i64 0, !dbg !64 + %847 = insertelement <2 x float> %846, float %770, i64 1, !dbg !64 + %848 = fmul <2 x float> %847, %845, !dbg !64 + %849 = select <2 x i1> %818, <2 x float> %844, <2 x float> %848, !dbg !65 + %850 = fptrunc <2 x float> %849 to <2 x bfloat>, !dbg !66 + %851 = fpext <2 x bfloat> %737 to <2 x float>, !dbg !61 + %852 = insertelement <2 x float> poison, float %428, i64 0, !dbg !62 + %853 = insertelement <2 x float> %852, float %433, i64 1, !dbg !62 + %854 = fmul <2 x float> %853, %851, !dbg !62 + %855 = fpext <2 x bfloat> %793 to <2 x float>, !dbg !63 + %856 = insertelement <2 x float> poison, float %771, i64 0, !dbg !64 + %857 = insertelement <2 x float> %856, float %773, i64 1, !dbg !64 + %858 = fmul <2 x float> %857, %855, !dbg !64 + %859 = select <2 x i1> %818, <2 x float> %854, <2 x float> %858, !dbg !65 + %860 = fptrunc <2 x float> %859 to <2 x bfloat>, !dbg !66 + %861 = fpext <2 x bfloat> %735 to <2 x float>, !dbg !61 + %862 = insertelement <2 x float> poison, float %438, i64 0, !dbg !62 + %863 = insertelement <2 x float> %862, float %443, i64 1, !dbg !62 + %864 = fmul <2 x float> %863, %861, !dbg !62 + %865 = fpext <2 x bfloat> %795 to <2 x float>, !dbg !63 + %866 = insertelement <2 x float> poison, float %775, i64 0, !dbg !64 + %867 = insertelement <2 x float> %866, float %777, i64 1, !dbg !64 + %868 = fmul <2 x float> %867, %865, !dbg !64 + %869 = select <2 x i1> %818, <2 x float> %864, <2 x float> %868, !dbg !65 + %870 = fptrunc <2 x float> %869 to <2 x bfloat>, !dbg !66 + %871 = fpext <2 x bfloat> %733 to <2 x float>, !dbg !61 + %872 = insertelement <2 x float> poison, float %430, i64 0, !dbg !62 + %873 = insertelement <2 x float> %872, float %435, i64 1, !dbg !62 + %874 = fmul <2 x float> %873, %871, !dbg !62 + %875 = fpext <2 x bfloat> %797 to <2 x float>, !dbg !63 + %876 = insertelement <2 x float> poison, float %772, i64 0, !dbg !64 + %877 = insertelement <2 x float> %876, float %774, i64 1, !dbg !64 + %878 = fmul <2 x float> %877, %875, !dbg !64 + %879 = select <2 x i1> %818, <2 x float> %874, <2 x float> %878, !dbg !65 + %880 = fptrunc <2 x float> %879 to <2 x bfloat>, !dbg !66 + %881 = fpext <2 x bfloat> %731 to <2 x float>, !dbg !61 + %882 = insertelement <2 x float> poison, float %440, i64 0, !dbg !62 + %883 = insertelement <2 x float> %882, float %445, i64 1, !dbg !62 + %884 = fmul <2 x float> %883, %881, !dbg !62 + %885 = fpext <2 x bfloat> %799 to <2 x float>, !dbg !63 + %886 = insertelement <2 x float> poison, float %776, i64 0, !dbg !64 + %887 = insertelement <2 x float> %886, float %778, i64 1, !dbg !64 + %888 = fmul <2 x float> %887, %885, !dbg !64 + %889 = select <2 x i1> %818, <2 x float> %884, <2 x float> %888, !dbg !65 + %890 = fptrunc <2 x float> %889 to <2 x bfloat>, !dbg !66 + %891 = bitcast <2 x bfloat> %820 to i32, !dbg !66 + %892 = bitcast <2 x bfloat> %830 to i32, !dbg !66 + %893 = bitcast <2 x bfloat> %840 to i32, !dbg !66 + %894 = bitcast <2 x bfloat> %850 to i32, !dbg !66 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %891, i32 %892, i32 %893, i32 %894, ptr addrspace(1) %805, i1 %808) #6, !dbg !66 + %895 = bitcast <2 x bfloat> %860 to i32, !dbg !66 + %896 = bitcast <2 x bfloat> %870 to i32, !dbg !66 + %897 = bitcast <2 x bfloat> %880 to i32, !dbg !66 + %898 = bitcast <2 x bfloat> %890 to i32, !dbg !66 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %895, i32 %896, i32 %897, i32 %898, ptr addrspace(1) %807, i1 %808) #6, !dbg !66 + ret void, !dbg !67 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2 + +; Function Attrs: nocallback nofree nounwind memory(argmem: read) +declare { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.trans.b16.p3(ptr addrspace(3) readonly captures(none)) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nocallback nofree nounwind memory(argmem: read) } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_poi_fused__fused_rms_norm_cat_view_2", linkageName: "triton_poi_fused__fused_rms_norm_cat_view_2", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 21, column: 29, scope: !5) +!9 = !DILocation(line: 21, column: 48, scope: !5) +!10 = !DILocation(line: 21, column: 69, scope: !5) +!11 = !DILocation(line: 21, column: 53, scope: !5) +!12 = !DILocation(line: 21, column: 34, scope: !5) +!13 = !DILocation(line: 21, column: 75, scope: !5) +!14 = !DILocation(line: 22, column: 44, scope: !5) +!15 = !DILocation(line: 22, column: 23, scope: !5) +!16 = !DILocation(line: 24, column: 28, scope: !5) +!17 = !DILocation(line: 24, column: 33, scope: !5) +!18 = !DILocation(line: 25, column: 44, scope: !5) +!19 = !DILocation(line: 25, column: 23, scope: !5) +!20 = !DILocation(line: 26, column: 21, scope: !5) +!21 = !DILocation(line: 27, column: 19, scope: !5) +!22 = !DILocation(line: 29, column: 19, scope: !5) +!23 = !DILocation(line: 35, column: 18, scope: !5) +!24 = !DILocation(line: 36, column: 39, scope: !5) +!25 = !DILocation(line: 36, column: 35, scope: !5) +!26 = !DILocation(line: 36, column: 51, scope: !5) +!27 = !DILocation(line: 36, column: 44, scope: !5) +!28 = !DILocation(line: 36, column: 30, scope: !5) +!29 = !DILocation(line: 36, column: 64, scope: !5) +!30 = !DILocation(line: 36, column: 57, scope: !5) +!31 = !DILocation(line: 36, column: 123, scope: !5) +!32 = !DILocation(line: 38, column: 30, scope: !5) +!33 = !DILocation(line: 38, column: 80, scope: !5) +!34 = !DILocation(line: 40, column: 19, scope: !5) +!35 = !DILocation(line: 42, column: 19, scope: !5) +!36 = !DILocation(line: 43, column: 28, scope: !5) +!37 = !DILocation(line: 44, column: 19, scope: !5) +!38 = !DILocation(line: 45, column: 31, scope: !5) +!39 = !DILocation(line: 45, column: 71, scope: !5) +!40 = !DILocation(line: 54, column: 52, scope: !5) +!41 = !DILocation(line: 54, column: 45, scope: !5) +!42 = !DILocation(line: 54, column: 31, scope: !5) +!43 = !DILocation(line: 54, column: 83, scope: !5) +!44 = !DILocation(line: 54, column: 67, scope: !5) +!45 = !DILocation(line: 54, column: 134, scope: !5) +!46 = !DILocation(line: 56, column: 56, scope: !5) +!47 = !DILocation(line: 56, column: 52, scope: !5) +!48 = !DILocation(line: 56, column: 31, scope: !5) +!49 = !DILocation(line: 56, column: 90, scope: !5) +!50 = !DILocation(line: 58, column: 21, scope: !5) +!51 = !DILocation(line: 60, column: 20, scope: !5) +!52 = !DILocation(line: 61, column: 28, scope: !5) +!53 = !DILocation(line: 23, column: 21, scope: !5) +!54 = !DILocation(line: 62, column: 20, scope: !5) +!55 = !DILocation(line: 63, column: 31, scope: !5) +!56 = !DILocation(line: 63, column: 71, scope: !5) +!57 = !DILocation(line: 70, column: 34, scope: !5) +!58 = !DILocation(line: 70, column: 30, scope: !5) +!59 = !DILocation(line: 70, column: 25, scope: !5) +!60 = !DILocation(line: 70, column: 54, scope: !5) +!61 = !DILocation(line: 45, column: 137, scope: !5) +!62 = !DILocation(line: 47, column: 20, scope: !5) +!63 = !DILocation(line: 63, column: 138, scope: !5) +!64 = !DILocation(line: 65, column: 20, scope: !5) +!65 = !DILocation(line: 0, scope: !5) +!66 = !DILocation(line: 70, column: 46, scope: !5) +!67 = !DILocation(line: 70, column: 4, scope: !5) diff --git a/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.ptx b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..ca658b515880c2ca77001f4f0b40acdb60efe741 --- /dev/null +++ b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.ptx @@ -0,0 +1,1096 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused__fused_rms_norm_cat_view_2 // -- Begin function triton_poi_fused__fused_rms_norm_cat_view_2 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_poi_fused__fused_rms_norm_cat_view_2 +.visible .entry triton_poi_fused__fused_rms_norm_cat_view_2( + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_4, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_5, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_6, + .param .u32 triton_poi_fused__fused_rms_norm_cat_view_2_param_7, + .param .u32 triton_poi_fused__fused_rms_norm_cat_view_2_param_8, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_9, + .param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_10 +) +.reqntid 256 +{ + .reg .pred %p<12>; + .reg .b16 %rs<65>; + .reg .b32 %r<499>; + .reg .b64 %rd<35>; + .loc 1 18 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:18:0 + +// %bb.0: // %__nv_rsqrtf.exit + ld.param.b64 %rd27, [triton_poi_fused__fused_rms_norm_cat_view_2_param_0]; + ld.param.b64 %rd28, [triton_poi_fused__fused_rms_norm_cat_view_2_param_1]; +$L__tmp0: + .loc 1 21 29 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:29 + mov.u32 %r74, %ctaid.y; + ld.param.b64 %rd29, [triton_poi_fused__fused_rms_norm_cat_view_2_param_2]; + .loc 1 21 48 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:48 + mov.u32 %r75, %ctaid.z; + ld.param.b64 %rd30, [triton_poi_fused__fused_rms_norm_cat_view_2_param_3]; + .loc 1 21 69 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:69 + mov.u32 %r76, %nctaid.y; + ld.param.b64 %rd31, [triton_poi_fused__fused_rms_norm_cat_view_2_param_4]; + .loc 1 21 34 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:34 + mad.lo.s32 %r77, %r75, %r76, %r74; + ld.param.b64 %rd32, [triton_poi_fused__fused_rms_norm_cat_view_2_param_5]; + .loc 1 21 75 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:75 + shl.b32 %r78, %r77, 5; + ld.param.b64 %rd33, [triton_poi_fused__fused_rms_norm_cat_view_2_param_6]; + .loc 1 22 44 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:44 + mov.u32 %r79, %tid.x; + bfe.u32 %r80, %r79, 4, 4; + and.b32 %r81, %r79, 7; + shl.b32 %r82, %r81, 2; + .loc 1 22 23 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:23 + or.b32 %r83, %r78, %r80; + or.b32 %r84, %r83, 16; + or.b32 %r85, %r78, %r82; + .loc 1 24 28 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:24:28 + mov.u32 %r86, %ctaid.x; + .loc 1 24 33 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:24:33 + shl.b32 %r87, %r86, 7; + .loc 1 25 44 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:25:44 + and.b32 %r88, %r79, 15; + shl.b32 %r89, %r88, 3; + bfe.u32 %r90, %r79, 3, 5; + .loc 1 25 23 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:25:23 + or.b32 %r91, %r89, %r87; + or.b32 %r92, %r90, %r87; + .loc 1 26 21 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:26:21 + setp.lt.s32 %p6, %r91, 128; + setp.lt.s32 %p7, %r92, 128; + .loc 1 27 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:27:19 + bfe.s32 %r93, %r77, 26, 1; + .loc 1 29 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:29:19 + shr.u32 %r94, %r93, 27; + .loc 1 27 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:27:19 + add.s32 %r95, %r83, %r94; + shr.u32 %r96, %r95, 5; + .loc 1 29 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:29:19 + and.b32 %r97, %r95, 33554400; + sub.s32 %r98, %r83, %r97; + add.s32 %r99, %r84, %r94; + and.b32 %r100, %r99, 33554400; + sub.s32 %r101, %r84, %r100; + .loc 1 35 18 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:35:18 + setp.lt.s32 %p8, %r83, 8192; + setp.lt.s32 %p9, %r85, 8192; + .loc 1 36 39 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:39 + shl.b32 %r102, %r98, 7; + shl.b32 %r103, %r101, 7; + .loc 1 36 35 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:35 + add.s32 %r104, %r102, %r91; + add.s32 %r105, %r103, %r91; + .loc 1 36 51 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:51 + mul.lo.s32 %r106, %r96, 12288; + .loc 1 36 44 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:44 + add.s32 %r107, %r104, %r106; + add.s32 %r108, %r105, %r106; + .loc 1 36 30 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:30 + mad.wide.s32 %rd1, %r107, 2, %rd27; + mad.wide.s32 %rd3, %r108, 2, %rd27; + .loc 1 36 64 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:64 + and.pred %p1, %p6, %p8; + and.pred %p2, %p7, %p9; + .loc 1 36 57 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:57 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r5, 0; + // begin inline asm + mov.u32 %r1, %r5; + mov.u32 %r2, %r5; + mov.u32 %r3, %r5; + mov.u32 %r4, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2; + // end inline asm + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r6, %r5; + mov.u32 %r7, %r5; + mov.u32 %r8, %r5; + mov.u32 %r9, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd3 + 0 ], %rd4; + // end inline asm + prmt.b32 %r109, %r1, %r6, 0x7632U; + prmt.b32 %r110, %r2, %r7, 0x7632U; + prmt.b32 %r111, %r3, %r8, 0x7632U; + prmt.b32 %r112, %r4, %r9, 0x7632U; + .loc 1 36 123 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:123 + and.b32 %r113, %r79, 192; + shl.b32 %r114, %r113, 5; + shl.b32 %r115, %r81, 4; + shr.u32 %r116, %r113, 1; + shl.b32 %r117, %r79, 6; + and.b32 %r118, %r117, 512; + and.b32 %r119, %r79, 16; + bfe.s32 %r120, %r79, 4, 1; + and.b32 %r121, %r120, 1040; + and.b32 %r122, %r79, 32; + shl.b32 %r123, %r122, 2; + or.b32 %r124, %r114, %r115; + or.b32 %r125, %r121, %r116; + xor.b32 %r126, %r125, %r124; + mov.b32 %r127, global_smem; + add.s32 %r128, %r127, %r118; + add.s32 %r129, %r128, %r126; + add.s32 %r130, %r129, %r123; + prmt.b32 %r131, %r1, %r6, 0x5410U; + prmt.b32 %r132, %r2, %r7, 0x5410U; + st.shared.v4.b32 [%r130], {%r131, %r109, %r132, %r110}; + prmt.b32 %r133, %r3, %r8, 0x5410U; + prmt.b32 %r134, %r4, %r9, 0x5410U; + st.shared.v4.b32 [%r130+256], {%r133, %r111, %r134, %r112}; + bar.sync 0; + shl.b32 %r135, %r81, 10; + shl.b32 %r136, %r88, 4; + shr.u32 %r137, %r113, 2; + shl.b32 %r138, %r119, 2; + shl.b32 %r139, %r122, 3; + xor.b32 %r140, %r136, %r137; + xor.b32 %r141, %r140, %r138; + add.s32 %r142, %r127, %r135; + add.s32 %r143, %r142, %r141; + add.s32 %r144, %r143, %r139; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r145, %r146, %r147, %r148}, [%r144]; + mov.b32 {%rs1, %rs2}, %r145; + mov.b32 {%rs3, %rs4}, %r146; + mov.b32 {%rs5, %rs6}, %r147; + mov.b32 {%rs7, %rs8}, %r148; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r149, %r150, %r151, %r152}, [%r144+512]; + mov.b32 {%rs9, %rs10}, %r149; + mov.b32 {%rs11, %rs12}, %r150; + mov.b32 {%rs13, %rs14}, %r151; + mov.b32 {%rs15, %rs16}, %r152; + cvt.f32.bf16 %r153, %rs1; + cvt.f32.bf16 %r154, %rs2; + cvt.f32.bf16 %r155, %rs3; + cvt.f32.bf16 %r156, %rs4; + cvt.f32.bf16 %r157, %rs5; + cvt.f32.bf16 %r158, %rs6; + cvt.f32.bf16 %r159, %rs7; + cvt.f32.bf16 %r160, %rs8; + cvt.f32.bf16 %r161, %rs9; + cvt.f32.bf16 %r162, %rs10; + cvt.f32.bf16 %r163, %rs11; + cvt.f32.bf16 %r164, %rs12; + cvt.f32.bf16 %r165, %rs13; + cvt.f32.bf16 %r166, %rs14; + cvt.f32.bf16 %r167, %rs15; + cvt.f32.bf16 %r168, %rs16; + .loc 1 38 30 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:38:30 + mad.wide.s32 %rd5, %r85, 4, %rd28; + .loc 1 38 80 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:38:80 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r10, %r5; + mov.u32 %r11, %r5; + mov.u32 %r12, %r5; + mov.u32 %r13, %r5; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd5 + 0 ], %rd6; + // end inline asm + // begin inline asm + mov.u64 %rd7, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd7, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r14, %r5; + mov.u32 %r15, %r5; + mov.u32 %r16, %r5; + mov.u32 %r17, %r5; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd5 + 0 ], %rd7; + // end inline asm + // begin inline asm + mov.u64 %rd8, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd8, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r18, %r5; + mov.u32 %r19, %r5; + mov.u32 %r20, %r5; + mov.u32 %r21, %r5; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd5 + 0 ], %rd8; + // end inline asm + // begin inline asm + mov.u64 %rd9, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd9, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r22, %r5; + mov.u32 %r23, %r5; + mov.u32 %r24, %r5; + mov.u32 %r25, %r5; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r22, %r23, %r24, %r25 }, [ %rd5 + 0 ], %rd9; + // end inline asm + mov.b32 %r169, 0f43000000; + .loc 1 40 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:40:19 + div.full.f32 %r170, %r10, %r169; + div.full.f32 %r171, %r11, %r169; + div.full.f32 %r172, %r12, %r169; + div.full.f32 %r173, %r13, %r169; + div.full.f32 %r174, %r14, %r169; + div.full.f32 %r175, %r15, %r169; + div.full.f32 %r176, %r16, %r169; + div.full.f32 %r177, %r17, %r169; + div.full.f32 %r178, %r18, %r169; + div.full.f32 %r179, %r19, %r169; + div.full.f32 %r180, %r20, %r169; + div.full.f32 %r181, %r21, %r169; + div.full.f32 %r182, %r22, %r169; + div.full.f32 %r183, %r23, %r169; + div.full.f32 %r184, %r24, %r169; + div.full.f32 %r185, %r25, %r169; + .loc 1 42 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:42:19 + add.f32 %r186, %r170, 0f358637BD; + add.f32 %r187, %r171, 0f358637BD; + add.f32 %r188, %r172, 0f358637BD; + add.f32 %r189, %r173, 0f358637BD; + add.f32 %r190, %r174, 0f358637BD; + add.f32 %r191, %r175, 0f358637BD; + add.f32 %r192, %r176, 0f358637BD; + add.f32 %r193, %r177, 0f358637BD; + add.f32 %r194, %r178, 0f358637BD; + add.f32 %r195, %r179, 0f358637BD; + add.f32 %r196, %r180, 0f358637BD; + add.f32 %r197, %r181, 0f358637BD; + add.f32 %r198, %r182, 0f358637BD; + add.f32 %r199, %r183, 0f358637BD; + add.f32 %r200, %r184, 0f358637BD; + add.f32 %r201, %r185, 0f358637BD; + .loc 1 43 28 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:43:28 + rsqrt.approx.ftz.f32 %r202, %r186; + rsqrt.approx.ftz.f32 %r203, %r187; + rsqrt.approx.ftz.f32 %r204, %r188; + rsqrt.approx.ftz.f32 %r205, %r189; + rsqrt.approx.ftz.f32 %r206, %r190; + rsqrt.approx.ftz.f32 %r207, %r191; + rsqrt.approx.ftz.f32 %r208, %r192; + rsqrt.approx.ftz.f32 %r209, %r193; + rsqrt.approx.ftz.f32 %r210, %r194; + rsqrt.approx.ftz.f32 %r211, %r195; + rsqrt.approx.ftz.f32 %r212, %r196; + rsqrt.approx.ftz.f32 %r213, %r197; + rsqrt.approx.ftz.f32 %r214, %r198; + rsqrt.approx.ftz.f32 %r215, %r199; + rsqrt.approx.ftz.f32 %r216, %r200; + rsqrt.approx.ftz.f32 %r217, %r201; + .loc 1 44 19 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:44:19 + mul.f32 %r218, %r202, %r153; + mul.f32 %r219, %r203, %r154; + mul.f32 %r220, %r204, %r155; + mul.f32 %r221, %r205, %r156; + mul.f32 %r222, %r206, %r157; + mul.f32 %r223, %r207, %r158; + mul.f32 %r224, %r208, %r159; + mul.f32 %r225, %r209, %r160; + mul.f32 %r226, %r210, %r161; + mul.f32 %r227, %r211, %r162; + mul.f32 %r228, %r212, %r163; + mul.f32 %r229, %r213, %r164; + mul.f32 %r230, %r214, %r165; + mul.f32 %r231, %r215, %r166; + mul.f32 %r232, %r216, %r167; + mul.f32 %r233, %r217, %r168; + bar.sync 0; + shl.b32 %r234, %r79, 9; + and.b32 %r235, %r234, 15360; + shr.u32 %r236, %r79, 1; + and.b32 %r237, %r236, 108; + or.b32 %r238, %r235, %r115; + xor.b32 %r239, %r238, %r237; + or.b32 %r240, %r239, %r139; + add.s32 %r241, %r127, %r240; + st.shared.b32 [%r241], %r218; + st.shared.b32 [%r241+128], %r220; + xor.b32 %r242, %r240, 16; + add.s32 %r243, %r127, %r242; + st.shared.b32 [%r243+512], %r219; + st.shared.b32 [%r243+640], %r221; + xor.b32 %r244, %r240, 4; + add.s32 %r245, %r127, %r244; + st.shared.b32 [%r245], %r222; + st.shared.b32 [%r245+128], %r224; + xor.b32 %r246, %r240, 20; + add.s32 %r247, %r127, %r246; + st.shared.b32 [%r247+512], %r223; + st.shared.b32 [%r247+640], %r225; + xor.b32 %r248, %r240, 8; + add.s32 %r249, %r127, %r248; + st.shared.b32 [%r249], %r226; + st.shared.b32 [%r249+128], %r228; + xor.b32 %r250, %r240, 24; + add.s32 %r251, %r127, %r250; + st.shared.b32 [%r251+512], %r227; + st.shared.b32 [%r251+640], %r229; + xor.b32 %r252, %r240, 12; + add.s32 %r253, %r127, %r252; + st.shared.b32 [%r253], %r230; + st.shared.b32 [%r253+128], %r232; + xor.b32 %r254, %r240, 28; + add.s32 %r255, %r127, %r254; + st.shared.b32 [%r255+512], %r231; + st.shared.b32 [%r255+640], %r233; + bar.sync 0; + shl.b32 %r256, %r79, 5; + and.b32 %r257, %r256, 608; + and.b32 %r258, %r79, 28; + shr.u32 %r259, %r79, 2; + and.b32 %r260, %r259, 16; + bfe.s32 %r261, %r79, 7, 1; + and.b32 %r262, %r261, 1056; + or.b32 %r263, %r257, %r258; + or.b32 %r264, %r262, %r260; + xor.b32 %r265, %r264, %r263; + or.b32 %r266, %r265, %r123; + add.s32 %r267, %r127, %r266; + ld.shared.b32 %r268, [%r267]; + ld.shared.b32 %r269, [%r267+256]; + xor.b32 %r270, %r266, 4; + add.s32 %r271, %r127, %r270; + ld.shared.b32 %r272, [%r271+4096]; + ld.shared.b32 %r273, [%r271+4352]; + xor.b32 %r274, %r266, 8; + add.s32 %r275, %r127, %r274; + ld.shared.b32 %r276, [%r275+8192]; + ld.shared.b32 %r277, [%r275+8448]; + xor.b32 %r278, %r266, 12; + add.s32 %r279, %r127, %r278; + ld.shared.b32 %r280, [%r279+12288]; + ld.shared.b32 %r281, [%r279+12544]; + xor.b32 %r282, %r266, 64; + add.s32 %r283, %r127, %r282; + ld.shared.b32 %r284, [%r283+2048]; + ld.shared.b32 %r285, [%r283+2304]; + xor.b32 %r286, %r266, 68; + add.s32 %r287, %r127, %r286; + ld.shared.b32 %r288, [%r287+6144]; + ld.shared.b32 %r289, [%r287+6400]; + xor.b32 %r290, %r266, 72; + add.s32 %r291, %r127, %r290; + ld.shared.b32 %r292, [%r291+10240]; + ld.shared.b32 %r293, [%r291+10496]; + xor.b32 %r294, %r266, 76; + add.s32 %r295, %r127, %r294; + ld.shared.b32 %r296, [%r295+14336]; + ld.shared.b32 %r297, [%r295+14592]; + .loc 1 45 31 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:31 + mul.wide.s32 %rd34, %r91, 2; + add.s64 %rd10, %rd29, %rd34; + .loc 1 45 71 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:71 + // begin inline asm + mov.u64 %rd11, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd11, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r26, %r5; + mov.u32 %r27, %r5; + mov.u32 %r28, %r5; + mov.u32 %r29, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd10 + 0 ], %rd11; + // end inline asm + // begin inline asm + mov.u64 %rd12, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd12, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r30, %r5; + mov.u32 %r31, %r5; + mov.u32 %r32, %r5; + mov.u32 %r33, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r30, %r31, %r32, %r33 }, [ %rd10 + 0 ], %rd12; + // end inline asm + .loc 1 54 52 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:52 + add.s32 %r298, %r106, -3145728; + .loc 1 54 45 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:45 + add.s32 %r299, %r104, %r298; + add.s32 %r300, %r105, %r298; + .loc 1 54 31 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:31 + mad.wide.s32 %rd13, %r299, 2, %rd30; + mad.wide.s32 %rd15, %r300, 2, %rd30; + .loc 1 54 83 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:83 + add.s32 %r301, %r78, -8192; + setp.lt.u32 %p10, %r301, 65536; + and.pred %p3, %p6, %p10; + and.pred %p4, %p7, %p10; + .loc 1 54 67 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:67 + // begin inline asm + mov.u64 %rd14, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd14, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r34, %r5; + mov.u32 %r35, %r5; + mov.u32 %r36, %r5; + mov.u32 %r37, %r5; + @%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd13 + 0 ], %rd14; + // end inline asm + // begin inline asm + mov.u64 %rd16, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd16, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r38, %r5; + mov.u32 %r39, %r5; + mov.u32 %r40, %r5; + mov.u32 %r41, %r5; + @%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r38, %r39, %r40, %r41 }, [ %rd15 + 0 ], %rd16; + // end inline asm + prmt.b32 %r302, %r34, %r38, 0x7632U; + prmt.b32 %r303, %r35, %r39, 0x7632U; + prmt.b32 %r304, %r36, %r40, 0x7632U; + prmt.b32 %r305, %r37, %r41, 0x7632U; + .loc 1 54 134 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134 + bar.sync 0; + prmt.b32 %r306, %r34, %r38, 0x5410U; + prmt.b32 %r307, %r35, %r39, 0x5410U; + st.shared.v4.b32 [%r130], {%r306, %r302, %r307, %r303}; + prmt.b32 %r308, %r36, %r40, 0x5410U; + prmt.b32 %r309, %r37, %r41, 0x5410U; + st.shared.v4.b32 [%r130+256], {%r308, %r304, %r309, %r305}; + bar.sync 0; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r310, %r311, %r312, %r313}, [%r144]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r314, %r315, %r316, %r317}, [%r144+512]; + .loc 1 56 52 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:52 + add.s32 %r318, %r85, -8192; + .loc 1 56 31 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:31 + mad.wide.s32 %rd17, %r318, 4, %rd31; + .loc 1 56 90 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:90 + // begin inline asm + mov.u64 %rd18, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd18, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r42, %r5; + mov.u32 %r43, %r5; + mov.u32 %r44, %r5; + mov.u32 %r45, %r5; + @%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd17 + 0 ], %rd18; + // end inline asm + // begin inline asm + mov.u64 %rd19, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd19, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r46, %r5; + mov.u32 %r47, %r5; + mov.u32 %r48, %r5; + mov.u32 %r49, %r5; + @%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r46, %r47, %r48, %r49 }, [ %rd17 + 0 ], %rd19; + // end inline asm + // begin inline asm + mov.u64 %rd20, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd20, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r50, %r5; + mov.u32 %r51, %r5; + mov.u32 %r52, %r5; + mov.u32 %r53, %r5; + @%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r50, %r51, %r52, %r53 }, [ %rd17 + 0 ], %rd20; + // end inline asm + // begin inline asm + mov.u64 %rd21, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd21, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r54, %r5; + mov.u32 %r55, %r5; + mov.u32 %r56, %r5; + mov.u32 %r57, %r5; + @%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r54, %r55, %r56, %r57 }, [ %rd17 + 0 ], %rd21; + // end inline asm + .loc 1 58 21 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:58:21 + div.full.f32 %r319, %r42, %r169; + div.full.f32 %r320, %r43, %r169; + div.full.f32 %r321, %r44, %r169; + div.full.f32 %r322, %r45, %r169; + div.full.f32 %r323, %r46, %r169; + div.full.f32 %r324, %r47, %r169; + div.full.f32 %r325, %r48, %r169; + div.full.f32 %r326, %r49, %r169; + div.full.f32 %r327, %r50, %r169; + div.full.f32 %r328, %r51, %r169; + div.full.f32 %r329, %r52, %r169; + div.full.f32 %r330, %r53, %r169; + div.full.f32 %r331, %r54, %r169; + div.full.f32 %r332, %r55, %r169; + div.full.f32 %r333, %r56, %r169; + div.full.f32 %r334, %r57, %r169; + .loc 1 60 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:60:20 + add.f32 %r335, %r319, 0f358637BD; + add.f32 %r336, %r320, 0f358637BD; + add.f32 %r337, %r321, 0f358637BD; + add.f32 %r338, %r322, 0f358637BD; + add.f32 %r339, %r323, 0f358637BD; + add.f32 %r340, %r324, 0f358637BD; + add.f32 %r341, %r325, 0f358637BD; + add.f32 %r342, %r326, 0f358637BD; + add.f32 %r343, %r327, 0f358637BD; + add.f32 %r344, %r328, 0f358637BD; + add.f32 %r345, %r329, 0f358637BD; + add.f32 %r346, %r330, 0f358637BD; + add.f32 %r347, %r331, 0f358637BD; + add.f32 %r348, %r332, 0f358637BD; + add.f32 %r349, %r333, 0f358637BD; + add.f32 %r350, %r334, 0f358637BD; + .loc 1 61 28 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:61:28 + rsqrt.approx.ftz.f32 %r351, %r335; + rsqrt.approx.ftz.f32 %r352, %r336; + rsqrt.approx.ftz.f32 %r353, %r337; + rsqrt.approx.ftz.f32 %r354, %r338; + rsqrt.approx.ftz.f32 %r355, %r339; + rsqrt.approx.ftz.f32 %r356, %r340; + rsqrt.approx.ftz.f32 %r357, %r341; + rsqrt.approx.ftz.f32 %r358, %r342; + rsqrt.approx.ftz.f32 %r359, %r343; + rsqrt.approx.ftz.f32 %r360, %r344; + rsqrt.approx.ftz.f32 %r361, %r345; + rsqrt.approx.ftz.f32 %r362, %r346; + rsqrt.approx.ftz.f32 %r363, %r347; + rsqrt.approx.ftz.f32 %r364, %r348; + rsqrt.approx.ftz.f32 %r365, %r349; + rsqrt.approx.ftz.f32 %r366, %r350; + .loc 1 54 134 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134 + mov.b32 {%rs17, %rs18}, %r317; + cvt.f32.bf16 %r367, %rs18; + cvt.f32.bf16 %r368, %rs17; + mov.b32 {%rs19, %rs20}, %r316; + cvt.f32.bf16 %r369, %rs20; + cvt.f32.bf16 %r370, %rs19; + mov.b32 {%rs21, %rs22}, %r315; + cvt.f32.bf16 %r371, %rs22; + cvt.f32.bf16 %r372, %rs21; + mov.b32 {%rs23, %rs24}, %r314; + cvt.f32.bf16 %r373, %rs24; + cvt.f32.bf16 %r374, %rs23; + mov.b32 {%rs25, %rs26}, %r313; + cvt.f32.bf16 %r375, %rs26; + cvt.f32.bf16 %r376, %rs25; + mov.b32 {%rs27, %rs28}, %r312; + cvt.f32.bf16 %r377, %rs28; + cvt.f32.bf16 %r378, %rs27; + mov.b32 {%rs29, %rs30}, %r311; + cvt.f32.bf16 %r379, %rs30; + cvt.f32.bf16 %r380, %rs29; + mov.b32 {%rs31, %rs32}, %r310; + cvt.f32.bf16 %r381, %rs32; + cvt.f32.bf16 %r382, %rs31; + .loc 1 23 21 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:23:21 + setp.lt.s32 %p11, %r83, 73728; + .loc 1 62 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:62:20 + mul.f32 %r383, %r351, %r382; + mul.f32 %r384, %r352, %r381; + mul.f32 %r385, %r353, %r380; + mul.f32 %r386, %r354, %r379; + mul.f32 %r387, %r355, %r378; + mul.f32 %r388, %r356, %r377; + mul.f32 %r389, %r357, %r376; + mul.f32 %r390, %r358, %r375; + mul.f32 %r391, %r359, %r374; + mul.f32 %r392, %r360, %r373; + mul.f32 %r393, %r361, %r372; + mul.f32 %r394, %r362, %r371; + mul.f32 %r395, %r363, %r370; + mul.f32 %r396, %r364, %r369; + mul.f32 %r397, %r365, %r368; + mul.f32 %r398, %r366, %r367; + bar.sync 0; + st.shared.b32 [%r241], %r383; + st.shared.b32 [%r241+128], %r385; + st.shared.b32 [%r243+512], %r384; + st.shared.b32 [%r243+640], %r386; + st.shared.b32 [%r245], %r387; + st.shared.b32 [%r245+128], %r389; + st.shared.b32 [%r247+512], %r388; + st.shared.b32 [%r247+640], %r390; + st.shared.b32 [%r249], %r391; + st.shared.b32 [%r249+128], %r393; + st.shared.b32 [%r251+512], %r392; + st.shared.b32 [%r251+640], %r394; + st.shared.b32 [%r253], %r395; + st.shared.b32 [%r253+128], %r397; + st.shared.b32 [%r255+512], %r396; + st.shared.b32 [%r255+640], %r398; + bar.sync 0; + ld.shared.b32 %r399, [%r267]; + ld.shared.b32 %r400, [%r267+256]; + ld.shared.b32 %r401, [%r271+4096]; + ld.shared.b32 %r402, [%r271+4352]; + ld.shared.b32 %r403, [%r275+8192]; + ld.shared.b32 %r404, [%r275+8448]; + ld.shared.b32 %r405, [%r279+12288]; + ld.shared.b32 %r406, [%r279+12544]; + ld.shared.b32 %r407, [%r283+2048]; + ld.shared.b32 %r408, [%r283+2304]; + ld.shared.b32 %r409, [%r287+6144]; + ld.shared.b32 %r410, [%r287+6400]; + ld.shared.b32 %r411, [%r291+10240]; + ld.shared.b32 %r412, [%r291+10496]; + ld.shared.b32 %r413, [%r295+14336]; + ld.shared.b32 %r414, [%r295+14592]; + .loc 1 63 31 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:31 + add.s64 %rd22, %rd32, %rd34; + .loc 1 63 71 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:71 + // begin inline asm + mov.u64 %rd23, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd23, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r58, %r5; + mov.u32 %r59, %r5; + mov.u32 %r60, %r5; + mov.u32 %r61, %r5; + @%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r58, %r59, %r60, %r61 }, [ %rd22 + 0 ], %rd23; + // end inline asm + // begin inline asm + mov.u64 %rd24, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd24, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r62, %r5; + mov.u32 %r63, %r5; + mov.u32 %r64, %r5; + mov.u32 %r65, %r5; + @%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r62, %r63, %r64, %r65 }, [ %rd22 + 0 ], %rd24; + // end inline asm + .loc 1 70 34 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:34 + shl.b32 %r415, %r83, 7; + shl.b32 %r416, %r84, 7; + .loc 1 70 30 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:30 + add.s32 %r417, %r415, %r91; + add.s32 %r418, %r416, %r91; + .loc 1 70 25 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:25 + mad.wide.s32 %rd25, %r417, 2, %rd33; + mad.wide.s32 %rd26, %r418, 2, %rd33; + .loc 1 70 54 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:54 + and.pred %p5, %p6, %p11; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs33, %rs34}, %r26; + cvt.f32.bf16 %r419, %rs33; + cvt.f32.bf16 %r420, %rs34; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r421, %r272, %r420; + mul.f32 %r422, %r268, %r419; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs35, %rs36}, %r58; + cvt.f32.bf16 %r423, %rs35; + cvt.f32.bf16 %r424, %rs36; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r425, %r401, %r424; + mul.f32 %r426, %r399, %r423; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r427, %r422, %r426, %p8; + selp.f32 %r428, %r421, %r425, %p8; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r66, %r428, %r427; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs37, %rs38}, %r27; + cvt.f32.bf16 %r429, %rs37; + cvt.f32.bf16 %r430, %rs38; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r431, %r280, %r430; + mul.f32 %r432, %r276, %r429; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs39, %rs40}, %r59; + cvt.f32.bf16 %r433, %rs39; + cvt.f32.bf16 %r434, %rs40; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r435, %r405, %r434; + mul.f32 %r436, %r403, %r433; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r437, %r432, %r436, %p8; + selp.f32 %r438, %r431, %r435, %p8; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r67, %r438, %r437; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs41, %rs42}, %r28; + cvt.f32.bf16 %r439, %rs41; + cvt.f32.bf16 %r440, %rs42; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r441, %r273, %r440; + mul.f32 %r442, %r269, %r439; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs43, %rs44}, %r60; + cvt.f32.bf16 %r443, %rs43; + cvt.f32.bf16 %r444, %rs44; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r445, %r402, %r444; + mul.f32 %r446, %r400, %r443; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r447, %r442, %r446, %p8; + selp.f32 %r448, %r441, %r445, %p8; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r68, %r448, %r447; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs45, %rs46}, %r29; + cvt.f32.bf16 %r449, %rs45; + cvt.f32.bf16 %r450, %rs46; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r451, %r281, %r450; + mul.f32 %r452, %r277, %r449; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs47, %rs48}, %r61; + cvt.f32.bf16 %r453, %rs47; + cvt.f32.bf16 %r454, %rs48; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r455, %r406, %r454; + mul.f32 %r456, %r404, %r453; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r457, %r452, %r456, %p8; + selp.f32 %r458, %r451, %r455, %p8; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r69, %r458, %r457; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs49, %rs50}, %r30; + cvt.f32.bf16 %r459, %rs49; + cvt.f32.bf16 %r460, %rs50; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r461, %r288, %r460; + mul.f32 %r462, %r284, %r459; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs51, %rs52}, %r62; + cvt.f32.bf16 %r463, %rs51; + cvt.f32.bf16 %r464, %rs52; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r465, %r409, %r464; + mul.f32 %r466, %r407, %r463; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r467, %r462, %r466, %p8; + selp.f32 %r468, %r461, %r465, %p8; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r70, %r468, %r467; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs53, %rs54}, %r31; + cvt.f32.bf16 %r469, %rs53; + cvt.f32.bf16 %r470, %rs54; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r471, %r296, %r470; + mul.f32 %r472, %r292, %r469; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs55, %rs56}, %r63; + cvt.f32.bf16 %r473, %rs55; + cvt.f32.bf16 %r474, %rs56; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r475, %r413, %r474; + mul.f32 %r476, %r411, %r473; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r477, %r472, %r476, %p8; + selp.f32 %r478, %r471, %r475, %p8; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r71, %r478, %r477; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs57, %rs58}, %r32; + cvt.f32.bf16 %r479, %rs57; + cvt.f32.bf16 %r480, %rs58; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r481, %r289, %r480; + mul.f32 %r482, %r285, %r479; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs59, %rs60}, %r64; + cvt.f32.bf16 %r483, %rs59; + cvt.f32.bf16 %r484, %rs60; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r485, %r410, %r484; + mul.f32 %r486, %r408, %r483; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r487, %r482, %r486, %p8; + selp.f32 %r488, %r481, %r485, %p8; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r72, %r488, %r487; + .loc 1 45 137 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137 + mov.b32 {%rs61, %rs62}, %r33; + cvt.f32.bf16 %r489, %rs61; + cvt.f32.bf16 %r490, %rs62; + .loc 1 47 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20 + mul.f32 %r491, %r297, %r490; + mul.f32 %r492, %r293, %r489; + .loc 1 63 138 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138 + mov.b32 {%rs63, %rs64}, %r65; + cvt.f32.bf16 %r493, %rs63; + cvt.f32.bf16 %r494, %rs64; + .loc 1 65 20 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20 + mul.f32 %r495, %r414, %r494; + mul.f32 %r496, %r412, %r493; + .loc 1 0 0 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0 + selp.f32 %r497, %r492, %r496, %p8; + selp.f32 %r498, %r491, %r495, %p8; + .loc 1 70 46 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46 + cvt.rn.bf16x2.f32 %r73, %r498, %r497; + // begin inline asm + @%p5 st.global.v4.b32 [ %rd25 + 0 ], { %r66, %r67, %r68, %r69 }; + // end inline asm + // begin inline asm + @%p5 st.global.v4.b32 [ %rd26 + 0 ], { %r70, %r71, %r72, %r73 }; + // end inline asm + .loc 1 70 4 // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 50 +.b8 104 +.b8 105 +.b8 106 +.b8 51 +.b8 104 +.b8 109 +.b8 108 +.b8 111 +.b8 117 +.b8 109 +.b8 120 +.b8 100 +.b8 109 +.b8 104 +.b8 117 +.b8 101 +.b8 122 +.b8 115 +.b8 121 +.b8 104 +.b8 107 +.b8 109 +.b8 110 +.b8 113 +.b8 103 +.b8 110 +.b8 102 +.b8 97 +.b8 53 +.b8 105 +.b8 118 +.b8 114 +.b8 101 +.b8 50 +.b8 55 +.b8 117 +.b8 111 +.b8 115 +.b8 121 +.b8 109 +.b8 97 +.b8 109 +.b8 51 +.b8 100 +.b8 114 +.b8 55 +.b8 97 +.b8 53 +.b8 120 +.b8 98 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 50 +.b8 104 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.source b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.source new file mode 100644 index 0000000000000000000000000000000000000000..8267d145edfd1392500dd03aa3358250d2fb2971 --- /dev/null +++ b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.source @@ -0,0 +1,415 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0) +#loc99 = loc("in_ptr0"(#loc)) +#loc100 = loc("in_ptr1"(#loc)) +#loc101 = loc("in_ptr2"(#loc)) +#loc102 = loc("in_ptr3"(#loc)) +#loc103 = loc("in_ptr4"(#loc)) +#loc104 = loc("in_ptr5"(#loc)) +#loc105 = loc("out_ptr0"(#loc)) +#loc106 = loc("ynumel"(#loc)) +#loc107 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %ynumel_0 = arith.constant 73728 : i32 loc(#loc108) + %xnumel_1 = arith.constant 128 : i32 loc(#loc109) + %yoffset = tt.get_program_id y : i32 loc(#loc110) + %yoffset_2 = tt.get_program_id z : i32 loc(#loc111) + %yoffset_3 = tt.get_num_programs y : i32 loc(#loc112) + %yoffset_4 = arith.muli %yoffset_2, %yoffset_3 : i32 loc(#loc113) + %yoffset_5 = arith.addi %yoffset, %yoffset_4 : i32 loc(#loc114) + %yoffset_6 = arith.constant 32 : i32 loc(#loc115) + %yoffset_7 = arith.constant 32 : i32 loc(#loc115) + %yoffset_8 = arith.muli %yoffset_5, %yoffset_7 : i32 loc(#loc115) + %yindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc116) + %yindex_9 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc117) + %yindex_10 = tt.splat %yoffset_8 : i32 -> tensor<32x1xi32> loc(#loc118) + %yindex_11 = arith.addi %yindex_10, %yindex_9 : tensor<32x1xi32> loc(#loc118) + %ymask = arith.constant dense<73728> : tensor<32x1xi32> loc(#loc119) + %ymask_12 = arith.cmpi slt, %yindex_11, %ymask : tensor<32x1xi32> loc(#loc119) + %xoffset = tt.get_program_id x : i32 loc(#loc120) + %xoffset_13 = arith.constant 128 : i32 loc(#loc121) + %xoffset_14 = arith.constant 128 : i32 loc(#loc121) + %xoffset_15 = arith.muli %xoffset, %xoffset_14 : i32 loc(#loc121) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc122) + %xindex_16 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc123) + %xindex_17 = tt.splat %xoffset_15 : i32 -> tensor<1x128xi32> loc(#loc124) + %xindex_18 = arith.addi %xindex_17, %xindex_16 : tensor<1x128xi32> loc(#loc124) + %xmask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc125) + %xmask_19 = arith.cmpi slt, %xindex_18, %xmask : tensor<1x128xi32> loc(#loc125) + %y1 = arith.constant 32 : i32 loc(#loc126) + %y1_20 = arith.constant 32 : i32 loc(#loc126) + %y1_21 = arith.constant dense<32> : tensor<32x1xi32> loc(#loc126) + %y1_22 = arith.divsi %yindex_11, %y1_21 : tensor<32x1xi32> loc(#loc126) + %y0 = arith.constant 32 : i32 loc(#loc127) + %y0_23 = arith.constant 32 : i32 loc(#loc127) + %y0_24 = arith.constant dense<32> : tensor<32x1xi32> loc(#loc127) + %y0_25 = arith.remsi %yindex_11, %y0_24 : tensor<32x1xi32> loc(#loc127) + %tmp1 = arith.constant 0 : i64 loc(#loc128) + %tmp1_26 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc128) + %tmp2 = arith.extsi %y1_22 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc129) + %tmp2_27 = arith.constant dense<0> : tensor<32x1xi64> loc(#loc129) + %tmp2_28 = arith.cmpi sge, %tmp2, %tmp2_27 : tensor<32x1xi64> loc(#loc129) + %tmp3 = arith.constant 256 : i64 loc(#loc130) + %tmp3_29 = arith.constant dense<256> : tensor<1x1xi64> loc(#loc130) + %tmp4 = arith.extsi %y1_22 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc131) + %tmp4_30 = arith.constant dense<256> : tensor<32x1xi64> loc(#loc131) + %tmp4_31 = arith.cmpi slt, %tmp4, %tmp4_30 : tensor<32x1xi64> loc(#loc131) + %tmp5 = arith.constant 128 : i32 loc(#loc132) + %tmp5_32 = arith.constant 128 : i32 loc(#loc132) + %tmp5_33 = arith.constant dense<128> : tensor<32x1xi32> loc(#loc132) + %tmp5_34 = arith.muli %tmp5_33, %y0_25 : tensor<32x1xi32> loc(#loc132) + %tmp5_35 = tt.broadcast %xindex_18 : tensor<1x128xi32> -> tensor<32x128xi32> loc(#loc133) + %tmp5_36 = tt.broadcast %tmp5_34 : tensor<32x1xi32> -> tensor<32x128xi32> loc(#loc133) + %tmp5_37 = arith.addi %tmp5_35, %tmp5_36 : tensor<32x128xi32> loc(#loc133) + %tmp5_38 = arith.constant 12288 : i32 loc(#loc134) + %tmp5_39 = arith.constant 12288 : i32 loc(#loc134) + %tmp5_40 = arith.constant dense<12288> : tensor<32x1xi32> loc(#loc134) + %tmp5_41 = arith.muli %tmp5_40, %y1_22 : tensor<32x1xi32> loc(#loc134) + %tmp5_42 = tt.broadcast %tmp5_41 : tensor<32x1xi32> -> tensor<32x128xi32> loc(#loc135) + %tmp5_43 = arith.addi %tmp5_37, %tmp5_42 : tensor<32x128xi32> loc(#loc135) + %tmp5_44 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x128x!tt.ptr> loc(#loc136) + %tmp5_45 = tt.addptr %tmp5_44, %tmp5_43 : tensor<32x128x!tt.ptr>, tensor<32x128xi32> loc(#loc136) + %tmp5_46 = tt.broadcast %tmp4_31 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc137) + %tmp5_47 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<32x128xi1> loc(#loc137) + %tmp5_48 = arith.andi %tmp5_46, %tmp5_47 : tensor<32x128xi1> loc(#loc137) + %tmp5_49 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc138) + %tmp5_50 = arith.andi %tmp5_48, %tmp5_49 : tensor<32x128xi1> loc(#loc138) + %tmp5_51 = arith.constant 0.000000e+00 : f32 loc(#loc139) + %tmp5_52 = arith.constant dense<0.000000e+00> : tensor<32x128xf32> loc(#loc139) + %tmp5_53 = arith.truncf %tmp5_52 : tensor<32x128xf32> to tensor<32x128xbf16> loc(#loc139) + %tmp5_54 = tt.load %tmp5_45, %tmp5_50, %tmp5_53 evictionPolicy = evict_last : tensor<32x128x!tt.ptr> loc(#loc139) + %tmp5_55 = arith.extf %tmp5_54 : tensor<32x128xbf16> to tensor<32x128xf32> loc(#loc140) + %tmp7 = arith.constant 32 : i32 loc(#loc141) + %tmp7_56 = arith.constant 32 : i32 loc(#loc141) + %tmp7_57 = arith.constant dense<32> : tensor<32x1xi32> loc(#loc141) + %tmp7_58 = arith.muli %tmp7_57, %y1_22 : tensor<32x1xi32> loc(#loc141) + %tmp7_59 = arith.addi %y0_25, %tmp7_58 : tensor<32x1xi32> loc(#loc142) + %tmp7_60 = tt.broadcast %tmp7_59 : tensor<32x1xi32> -> tensor<32x128xi32> loc(#loc143) + %tmp7_61 = tt.splat %in_ptr1 : !tt.ptr -> tensor<32x128x!tt.ptr> loc(#loc144) + %tmp7_62 = tt.addptr %tmp7_61, %tmp7_60 : tensor<32x128x!tt.ptr>, tensor<32x128xi32> loc(#loc144) + %tmp7_63 = tt.broadcast %tmp4_31 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc145) + %tmp7_64 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<32x128xi1> loc(#loc145) + %tmp7_65 = arith.andi %tmp7_63, %tmp7_64 : tensor<32x128xi1> loc(#loc145) + %tmp7_66 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc146) + %tmp7_67 = arith.andi %tmp7_65, %tmp7_66 : tensor<32x128xi1> loc(#loc146) + %tmp7_68 = arith.constant 0.000000e+00 : f32 loc(#loc147) + %tmp7_69 = arith.constant dense<0.000000e+00> : tensor<32x128xf32> loc(#loc147) + %tmp7_70 = tt.load %tmp7_62, %tmp7_67, %tmp7_69 evictionPolicy = evict_last : tensor<32x128x!tt.ptr> loc(#loc147) + %tmp8 = arith.constant 1.280000e+02 : f32 loc(#loc148) + %tmp9 = arith.constant dense<1.280000e+02> : tensor<32x128xf32> loc(#loc149) + %tmp9_71 = arith.divf %tmp7_70, %tmp9 : tensor<32x128xf32> loc(#loc149) + %tmp10 = arith.constant 9.99999997E-7 : f32 loc(#loc150) + %tmp11 = arith.constant dense<9.99999997E-7> : tensor<32x128xf32> loc(#loc151) + %tmp11_72 = arith.addf %tmp9_71, %tmp11 : tensor<32x128xf32> loc(#loc151) + %tmp12 = tt.extern_elementwise %tmp11_72 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<32x128xf32>) -> tensor<32x128xf32> loc(#loc152) + %tmp13 = arith.mulf %tmp5_55, %tmp12 : tensor<32x128xf32> loc(#loc153) + %tmp14 = tt.broadcast %xindex_18 : tensor<1x128xi32> -> tensor<32x128xi32> loc(#loc154) + %tmp14_73 = tt.splat %in_ptr2 : !tt.ptr -> tensor<32x128x!tt.ptr> loc(#loc155) + %tmp14_74 = tt.addptr %tmp14_73, %tmp14 : tensor<32x128x!tt.ptr>, tensor<32x128xi32> loc(#loc155) + %tmp14_75 = tt.broadcast %tmp4_31 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc156) + %tmp14_76 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<32x128xi1> loc(#loc156) + %tmp14_77 = arith.andi %tmp14_75, %tmp14_76 : tensor<32x128xi1> loc(#loc156) + %tmp14_78 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc157) + %tmp14_79 = arith.andi %tmp14_77, %tmp14_78 : tensor<32x128xi1> loc(#loc157) + %tmp14_80 = arith.constant 0.000000e+00 : f32 loc(#loc158) + %tmp14_81 = arith.constant dense<0.000000e+00> : tensor<32x128xf32> loc(#loc158) + %tmp14_82 = arith.truncf %tmp14_81 : tensor<32x128xf32> to tensor<32x128xbf16> loc(#loc158) + %tmp14_83 = tt.load %tmp14_74, %tmp14_79, %tmp14_82 evictionPolicy = evict_last : tensor<32x128x!tt.ptr> loc(#loc158) + %tmp14_84 = arith.extf %tmp14_83 : tensor<32x128xbf16> to tensor<32x128xf32> loc(#loc159) + %tmp16 = arith.mulf %tmp13, %tmp14_84 : tensor<32x128xf32> loc(#loc160) + %tmp18 = arith.constant 0.000000e+00 : f32 loc(#loc161) + %tmp18_85 = arith.constant dense<0.000000e+00> : tensor<32x128xf32> loc(#loc161) + %tmp19 = tt.broadcast %tmp4_31 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc162) + %tmp19_86 = arith.select %tmp19, %tmp16, %tmp18_85 : tensor<32x128xi1>, tensor<32x128xf32> loc(#loc162) + %tmp20 = arith.extsi %y1_22 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc163) + %tmp20_87 = arith.constant dense<256> : tensor<32x1xi64> loc(#loc163) + %tmp20_88 = arith.cmpi sge, %tmp20, %tmp20_87 : tensor<32x1xi64> loc(#loc163) + %tmp21 = arith.constant 2304 : i64 loc(#loc164) + %tmp21_89 = arith.constant dense<2304> : tensor<1x1xi64> loc(#loc164) + %tmp22 = arith.extsi %y1_22 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc165) + %tmp22_90 = arith.constant dense<2304> : tensor<32x1xi64> loc(#loc165) + %tmp22_91 = arith.cmpi slt, %tmp22, %tmp22_90 : tensor<32x1xi64> loc(#loc165) + %tmp23 = arith.constant 128 : i32 loc(#loc166) + %tmp23_92 = arith.constant 128 : i32 loc(#loc166) + %tmp23_93 = arith.constant dense<128> : tensor<32x1xi32> loc(#loc166) + %tmp23_94 = arith.muli %tmp23_93, %y0_25 : tensor<32x1xi32> loc(#loc166) + %tmp23_95 = tt.broadcast %xindex_18 : tensor<1x128xi32> -> tensor<32x128xi32> loc(#loc167) + %tmp23_96 = tt.broadcast %tmp23_94 : tensor<32x1xi32> -> tensor<32x128xi32> loc(#loc167) + %tmp23_97 = arith.addi %tmp23_95, %tmp23_96 : tensor<32x128xi32> loc(#loc167) + %tmp23_98 = arith.constant -256 : i32 loc(#loc168) + %tmp23_99 = arith.constant -256 : i32 loc(#loc168) + %tmp23_100 = arith.constant dense<-256> : tensor<32x1xi32> loc(#loc168) + %tmp23_101 = arith.addi %tmp23_100, %y1_22 : tensor<32x1xi32> loc(#loc168) + %tmp23_102 = arith.constant 12288 : i32 loc(#loc169) + %tmp23_103 = arith.constant 12288 : i32 loc(#loc169) + %tmp23_104 = arith.constant dense<12288> : tensor<32x1xi32> loc(#loc169) + %tmp23_105 = arith.muli %tmp23_104, %tmp23_101 : tensor<32x1xi32> loc(#loc169) + %tmp23_106 = tt.broadcast %tmp23_105 : tensor<32x1xi32> -> tensor<32x128xi32> loc(#loc170) + %tmp23_107 = arith.addi %tmp23_97, %tmp23_106 : tensor<32x128xi32> loc(#loc170) + %tmp23_108 = tt.splat %in_ptr3 : !tt.ptr -> tensor<32x128x!tt.ptr> loc(#loc171) + %tmp23_109 = tt.addptr %tmp23_108, %tmp23_107 : tensor<32x128x!tt.ptr>, tensor<32x128xi32> loc(#loc171) + %tmp23_110 = tt.broadcast %tmp20_88 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc172) + %tmp23_111 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<32x128xi1> loc(#loc172) + %tmp23_112 = arith.andi %tmp23_110, %tmp23_111 : tensor<32x128xi1> loc(#loc172) + %tmp23_113 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc173) + %tmp23_114 = arith.andi %tmp23_112, %tmp23_113 : tensor<32x128xi1> loc(#loc173) + %tmp23_115 = arith.constant 0.000000e+00 : f32 loc(#loc174) + %tmp23_116 = arith.constant dense<0.000000e+00> : tensor<32x128xf32> loc(#loc174) + %tmp23_117 = arith.truncf %tmp23_116 : tensor<32x128xf32> to tensor<32x128xbf16> loc(#loc174) + %tmp23_118 = tt.load %tmp23_109, %tmp23_114, %tmp23_117 evictionPolicy = evict_last : tensor<32x128x!tt.ptr> loc(#loc174) + %tmp23_119 = arith.extf %tmp23_118 : tensor<32x128xbf16> to tensor<32x128xf32> loc(#loc175) + %tmp25 = arith.constant -256 : i32 loc(#loc176) + %tmp25_120 = arith.constant -256 : i32 loc(#loc176) + %tmp25_121 = arith.constant dense<-256> : tensor<32x1xi32> loc(#loc176) + %tmp25_122 = arith.addi %tmp25_121, %y1_22 : tensor<32x1xi32> loc(#loc176) + %tmp25_123 = arith.constant 32 : i32 loc(#loc177) + %tmp25_124 = arith.constant 32 : i32 loc(#loc177) + %tmp25_125 = arith.constant dense<32> : tensor<32x1xi32> loc(#loc177) + %tmp25_126 = arith.muli %tmp25_125, %tmp25_122 : tensor<32x1xi32> loc(#loc177) + %tmp25_127 = arith.addi %y0_25, %tmp25_126 : tensor<32x1xi32> loc(#loc178) + %tmp25_128 = tt.broadcast %tmp25_127 : tensor<32x1xi32> -> tensor<32x128xi32> loc(#loc179) + %tmp25_129 = tt.splat %in_ptr4 : !tt.ptr -> tensor<32x128x!tt.ptr> loc(#loc180) + %tmp25_130 = tt.addptr %tmp25_129, %tmp25_128 : tensor<32x128x!tt.ptr>, tensor<32x128xi32> loc(#loc180) + %tmp25_131 = tt.broadcast %tmp20_88 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc181) + %tmp25_132 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<32x128xi1> loc(#loc181) + %tmp25_133 = arith.andi %tmp25_131, %tmp25_132 : tensor<32x128xi1> loc(#loc181) + %tmp25_134 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc182) + %tmp25_135 = arith.andi %tmp25_133, %tmp25_134 : tensor<32x128xi1> loc(#loc182) + %tmp25_136 = arith.constant 0.000000e+00 : f32 loc(#loc183) + %tmp25_137 = arith.constant dense<0.000000e+00> : tensor<32x128xf32> loc(#loc183) + %tmp25_138 = tt.load %tmp25_130, %tmp25_135, %tmp25_137 evictionPolicy = evict_last : tensor<32x128x!tt.ptr> loc(#loc183) + %tmp26 = arith.constant 1.280000e+02 : f32 loc(#loc184) + %tmp27 = arith.constant dense<1.280000e+02> : tensor<32x128xf32> loc(#loc185) + %tmp27_139 = arith.divf %tmp25_138, %tmp27 : tensor<32x128xf32> loc(#loc185) + %tmp28 = arith.constant 9.99999997E-7 : f32 loc(#loc186) + %tmp29 = arith.constant dense<9.99999997E-7> : tensor<32x128xf32> loc(#loc187) + %tmp29_140 = arith.addf %tmp27_139, %tmp29 : tensor<32x128xf32> loc(#loc187) + %tmp30 = tt.extern_elementwise %tmp29_140 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<32x128xf32>) -> tensor<32x128xf32> loc(#loc188) + %tmp31 = arith.mulf %tmp23_119, %tmp30 : tensor<32x128xf32> loc(#loc189) + %tmp32 = tt.broadcast %xindex_18 : tensor<1x128xi32> -> tensor<32x128xi32> loc(#loc190) + %tmp32_141 = tt.splat %in_ptr5 : !tt.ptr -> tensor<32x128x!tt.ptr> loc(#loc191) + %tmp32_142 = tt.addptr %tmp32_141, %tmp32 : tensor<32x128x!tt.ptr>, tensor<32x128xi32> loc(#loc191) + %tmp32_143 = tt.broadcast %tmp20_88 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc192) + %tmp32_144 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<32x128xi1> loc(#loc192) + %tmp32_145 = arith.andi %tmp32_143, %tmp32_144 : tensor<32x128xi1> loc(#loc192) + %tmp32_146 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc193) + %tmp32_147 = arith.andi %tmp32_145, %tmp32_146 : tensor<32x128xi1> loc(#loc193) + %tmp32_148 = arith.constant 0.000000e+00 : f32 loc(#loc194) + %tmp32_149 = arith.constant dense<0.000000e+00> : tensor<32x128xf32> loc(#loc194) + %tmp32_150 = arith.truncf %tmp32_149 : tensor<32x128xf32> to tensor<32x128xbf16> loc(#loc194) + %tmp32_151 = tt.load %tmp32_142, %tmp32_147, %tmp32_150 evictionPolicy = evict_last : tensor<32x128x!tt.ptr> loc(#loc194) + %tmp32_152 = arith.extf %tmp32_151 : tensor<32x128xbf16> to tensor<32x128xf32> loc(#loc195) + %tmp34 = arith.mulf %tmp31, %tmp32_152 : tensor<32x128xf32> loc(#loc196) + %tmp36 = arith.constant 0.000000e+00 : f32 loc(#loc197) + %tmp36_153 = arith.constant dense<0.000000e+00> : tensor<32x128xf32> loc(#loc197) + %tmp37 = tt.broadcast %tmp20_88 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc198) + %tmp37_154 = arith.select %tmp37, %tmp34, %tmp36_153 : tensor<32x128xi1>, tensor<32x128xf32> loc(#loc198) + %tmp38 = tt.broadcast %tmp4_31 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc199) + %tmp38_155 = arith.select %tmp38, %tmp19_86, %tmp37_154 : tensor<32x128xi1>, tensor<32x128xf32> loc(#loc199) + %c128_i32 = arith.constant 128 : i32 loc(#loc93) + %c128_i32_156 = arith.constant 128 : i32 loc(#loc93) + %cst = arith.constant dense<128> : tensor<32x1xi32> loc(#loc93) + %0 = arith.muli %cst, %yindex_11 : tensor<32x1xi32> loc(#loc93) + %1 = tt.broadcast %xindex_18 : tensor<1x128xi32> -> tensor<32x128xi32> loc(#loc94) + %2 = tt.broadcast %0 : tensor<32x1xi32> -> tensor<32x128xi32> loc(#loc94) + %3 = arith.addi %1, %2 : tensor<32x128xi32> loc(#loc94) + %4 = tt.splat %out_ptr0 : !tt.ptr -> tensor<32x128x!tt.ptr> loc(#loc95) + %5 = tt.addptr %4, %3 : tensor<32x128x!tt.ptr>, tensor<32x128xi32> loc(#loc95) + %6 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<32x128xi1> loc(#loc96) + %7 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc96) + %8 = arith.andi %6, %7 : tensor<32x128xi1> loc(#loc96) + %9 = arith.truncf %tmp38_155 : tensor<32x128xf32> to tensor<32x128xbf16> loc(#loc97) + tt.store %5, %9, %8 : tensor<32x128x!tt.ptr> loc(#loc97) + tt.return loc(#loc98) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":20:13) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:36) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:36) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":32:30) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":33:19) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":34:32) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:60) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:87) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:95) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":39:11) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":41:12) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:51) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:78) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:86) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":49:38) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":52:34) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":53:19) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:40) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:36) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:65) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:70) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:98) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:106) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":57:12) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":59:12) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:51) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:79) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:87) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":67:38) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4) +#loc108 = loc("ynumel"(#loc1)) +#loc109 = loc("xnumel"(#loc2)) +#loc110 = loc("yoffset"(#loc3)) +#loc111 = loc("yoffset"(#loc4)) +#loc112 = loc("yoffset"(#loc5)) +#loc113 = loc("yoffset"(#loc6)) +#loc114 = loc("yoffset"(#loc7)) +#loc115 = loc("yoffset"(#loc8)) +#loc116 = loc("yindex"(#loc9)) +#loc117 = loc("yindex"(#loc10)) +#loc118 = loc("yindex"(#loc11)) +#loc119 = loc("ymask"(#loc12)) +#loc120 = loc("xoffset"(#loc13)) +#loc121 = loc("xoffset"(#loc14)) +#loc122 = loc("xindex"(#loc15)) +#loc123 = loc("xindex"(#loc16)) +#loc124 = loc("xindex"(#loc17)) +#loc125 = loc("xmask"(#loc18)) +#loc126 = loc("y1"(#loc19)) +#loc127 = loc("y0"(#loc20)) +#loc128 = loc("tmp1"(#loc21)) +#loc129 = loc("tmp2"(#loc22)) +#loc130 = loc("tmp3"(#loc23)) +#loc131 = loc("tmp4"(#loc24)) +#loc132 = loc("tmp5"(#loc25)) +#loc133 = loc("tmp5"(#loc26)) +#loc134 = loc("tmp5"(#loc27)) +#loc135 = loc("tmp5"(#loc28)) +#loc136 = loc("tmp5"(#loc29)) +#loc137 = loc("tmp5"(#loc30)) +#loc138 = loc("tmp5"(#loc31)) +#loc139 = loc("tmp5"(#loc32)) +#loc140 = loc("tmp5"(#loc33)) +#loc141 = loc("tmp7"(#loc34)) +#loc142 = loc("tmp7"(#loc35)) +#loc143 = loc("tmp7"(#loc36)) +#loc144 = loc("tmp7"(#loc37)) +#loc145 = loc("tmp7"(#loc38)) +#loc146 = loc("tmp7"(#loc39)) +#loc147 = loc("tmp7"(#loc40)) +#loc148 = loc("tmp8"(#loc41)) +#loc149 = loc("tmp9"(#loc42)) +#loc150 = loc("tmp10"(#loc43)) +#loc151 = loc("tmp11"(#loc44)) +#loc152 = loc("tmp12"(#loc45)) +#loc153 = loc("tmp13"(#loc46)) +#loc154 = loc("tmp14"(#loc47)) +#loc155 = loc("tmp14"(#loc48)) +#loc156 = loc("tmp14"(#loc49)) +#loc157 = loc("tmp14"(#loc50)) +#loc158 = loc("tmp14"(#loc51)) +#loc159 = loc("tmp14"(#loc52)) +#loc160 = loc("tmp16"(#loc53)) +#loc161 = loc("tmp18"(#loc54)) +#loc162 = loc("tmp19"(#loc55)) +#loc163 = loc("tmp20"(#loc56)) +#loc164 = loc("tmp21"(#loc57)) +#loc165 = loc("tmp22"(#loc58)) +#loc166 = loc("tmp23"(#loc59)) +#loc167 = loc("tmp23"(#loc60)) +#loc168 = loc("tmp23"(#loc61)) +#loc169 = loc("tmp23"(#loc62)) +#loc170 = loc("tmp23"(#loc63)) +#loc171 = loc("tmp23"(#loc64)) +#loc172 = loc("tmp23"(#loc65)) +#loc173 = loc("tmp23"(#loc66)) +#loc174 = loc("tmp23"(#loc67)) +#loc175 = loc("tmp23"(#loc68)) +#loc176 = loc("tmp25"(#loc69)) +#loc177 = loc("tmp25"(#loc70)) +#loc178 = loc("tmp25"(#loc71)) +#loc179 = loc("tmp25"(#loc72)) +#loc180 = loc("tmp25"(#loc73)) +#loc181 = loc("tmp25"(#loc74)) +#loc182 = loc("tmp25"(#loc75)) +#loc183 = loc("tmp25"(#loc76)) +#loc184 = loc("tmp26"(#loc77)) +#loc185 = loc("tmp27"(#loc78)) +#loc186 = loc("tmp28"(#loc79)) +#loc187 = loc("tmp29"(#loc80)) +#loc188 = loc("tmp30"(#loc81)) +#loc189 = loc("tmp31"(#loc82)) +#loc190 = loc("tmp32"(#loc83)) +#loc191 = loc("tmp32"(#loc84)) +#loc192 = loc("tmp32"(#loc85)) +#loc193 = loc("tmp32"(#loc86)) +#loc194 = loc("tmp32"(#loc87)) +#loc195 = loc("tmp32"(#loc88)) +#loc196 = loc("tmp34"(#loc89)) +#loc197 = loc("tmp36"(#loc90)) +#loc198 = loc("tmp37"(#loc91)) +#loc199 = loc("tmp38"(#loc92)) diff --git a/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..4d241fbddba3123622164ff35df2e35ea9a08d49 --- /dev/null +++ b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir @@ -0,0 +1,288 @@ +#blocked = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 8], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("in_ptr1"(#loc)) +#loc72 = loc("in_ptr2"(#loc)) +#loc73 = loc("in_ptr3"(#loc)) +#loc74 = loc("in_ptr4"(#loc)) +#loc75 = loc("in_ptr5"(#loc)) +#loc76 = loc("out_ptr0"(#loc)) +#loc77 = loc("ynumel"(#loc)) +#loc78 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<-256> : tensor<32x1xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<-256> : tensor<32x1xi32, #blocked1> loc(#loc1) + %cst_1 = arith.constant dense<12288> : tensor<32x1xi32, #blocked1> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<32x1xi32, #blocked1> loc(#loc1) + %cst_3 = arith.constant dense<256> : tensor<32x1xi64, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<256> : tensor<32x1xi64, #blocked1> loc(#loc1) + %cst_5 = arith.constant dense<32> : tensor<32x1xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<32> : tensor<32x1xi32, #blocked1> loc(#loc1) + %cst_7 = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_8 = arith.constant dense<128> : tensor<1x128xi32, #blocked1> loc(#loc1) + %cst_9 = arith.constant dense<73728> : tensor<32x1xi32, #blocked> loc(#loc1) + %cst_10 = arith.constant dense<73728> : tensor<32x1xi32, #blocked1> loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<32x128xbf16, #blocked1> loc(#loc1) + %cst_12 = arith.constant dense<0.000000e+00> : tensor<32x128xf32, #blocked> loc(#loc1) + %cst_13 = arith.constant dense<9.99999997E-7> : tensor<32x128xf32, #blocked> loc(#loc1) + %cst_14 = arith.constant dense<1.280000e+02> : tensor<32x128xf32, #blocked> loc(#loc1) + %cst_15 = arith.constant dense<0.000000e+00> : tensor<32x128xf32, #blocked1> loc(#loc1) + %yoffset = tt.get_program_id y : i32 loc(#loc79) + %yoffset_16 = tt.get_program_id z : i32 loc(#loc80) + %yoffset_17 = tt.get_num_programs y : i32 loc(#loc81) + %yoffset_18 = arith.muli %yoffset_16, %yoffset_17 : i32 loc(#loc82) + %yoffset_19 = arith.addi %yoffset, %yoffset_18 : i32 loc(#loc83) + %yoffset_20 = arith.muli %yoffset_19, %c32_i32 : i32 loc(#loc84) + %yindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc85) + %yindex_21 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc85) + %yindex_22 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1xi32, #blocked1> loc(#loc85) + %yindex_23 = tt.expand_dims %yindex_21 {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> loc(#loc85) + %yindex_24 = tt.splat %yoffset_20 : i32 -> tensor<32x1xi32, #blocked1> loc(#loc86) + %yindex_25 = tt.splat %yoffset_20 : i32 -> tensor<32x1xi32, #blocked> loc(#loc86) + %yindex_26 = arith.addi %yindex_24, %yindex_22 : tensor<32x1xi32, #blocked1> loc(#loc86) + %yindex_27 = arith.addi %yindex_25, %yindex_23 : tensor<32x1xi32, #blocked> loc(#loc86) + %ymask = arith.cmpi slt, %yindex_26, %cst_10 : tensor<32x1xi32, #blocked1> loc(#loc87) + %ymask_28 = arith.cmpi slt, %yindex_27, %cst_9 : tensor<32x1xi32, #blocked> loc(#loc87) + %xoffset = tt.get_program_id x : i32 loc(#loc88) + %xoffset_29 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc89) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc90) + %xindex_30 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc90) + %xindex_31 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x128xi32, #blocked1> loc(#loc90) + %xindex_32 = tt.expand_dims %xindex_30 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc90) + %xindex_33 = tt.splat %xoffset_29 : i32 -> tensor<1x128xi32, #blocked1> loc(#loc91) + %xindex_34 = tt.splat %xoffset_29 : i32 -> tensor<1x128xi32, #blocked> loc(#loc91) + %xindex_35 = arith.addi %xindex_33, %xindex_31 : tensor<1x128xi32, #blocked1> loc(#loc91) + %xindex_36 = arith.addi %xindex_34, %xindex_32 : tensor<1x128xi32, #blocked> loc(#loc91) + %xmask = arith.cmpi slt, %xindex_35, %cst_8 : tensor<1x128xi32, #blocked1> loc(#loc92) + %xmask_37 = arith.cmpi slt, %xindex_36, %cst_7 : tensor<1x128xi32, #blocked> loc(#loc92) + %y1 = arith.divsi %yindex_26, %cst_6 : tensor<32x1xi32, #blocked1> loc(#loc93) + %y1_38 = arith.divsi %yindex_27, %cst_5 : tensor<32x1xi32, #blocked> loc(#loc93) + %y0 = arith.remsi %yindex_26, %cst_6 : tensor<32x1xi32, #blocked1> loc(#loc94) + %y0_39 = arith.remsi %yindex_27, %cst_5 : tensor<32x1xi32, #blocked> loc(#loc94) + %tmp4 = arith.extsi %y1 : tensor<32x1xi32, #blocked1> to tensor<32x1xi64, #blocked1> loc(#loc95) + %tmp4_40 = arith.extsi %y1_38 : tensor<32x1xi32, #blocked> to tensor<32x1xi64, #blocked> loc(#loc95) + %tmp4_41 = arith.cmpi slt, %tmp4, %cst_4 : tensor<32x1xi64, #blocked1> loc(#loc95) + %tmp4_42 = arith.cmpi slt, %tmp4_40, %cst_3 : tensor<32x1xi64, #blocked> loc(#loc95) + %tmp5 = arith.muli %y0, %cst_2 : tensor<32x1xi32, #blocked1> loc(#loc96) + %tmp5_43 = tt.broadcast %xindex_35 : tensor<1x128xi32, #blocked1> -> tensor<32x128xi32, #blocked1> loc(#loc97) + %tmp5_44 = tt.broadcast %tmp5 : tensor<32x1xi32, #blocked1> -> tensor<32x128xi32, #blocked1> loc(#loc97) + %tmp5_45 = arith.addi %tmp5_43, %tmp5_44 : tensor<32x128xi32, #blocked1> loc(#loc97) + %tmp5_46 = arith.muli %y1, %cst_1 : tensor<32x1xi32, #blocked1> loc(#loc98) + %tmp5_47 = tt.broadcast %tmp5_46 : tensor<32x1xi32, #blocked1> -> tensor<32x128xi32, #blocked1> loc(#loc99) + %tmp5_48 = arith.addi %tmp5_45, %tmp5_47 : tensor<32x128xi32, #blocked1> loc(#loc99) + %tmp5_49 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x128x!tt.ptr, #blocked1> loc(#loc100) + %tmp5_50 = tt.addptr %tmp5_49, %tmp5_48 : tensor<32x128x!tt.ptr, #blocked1>, tensor<32x128xi32, #blocked1> loc(#loc100) + %tmp5_51 = tt.broadcast %tmp4_41 : tensor<32x1xi1, #blocked1> -> tensor<32x128xi1, #blocked1> loc(#loc101) + %tmp5_52 = tt.broadcast %tmp4_42 : tensor<32x1xi1, #blocked> -> tensor<32x128xi1, #blocked> loc(#loc101) + %tmp5_53 = tt.broadcast %xmask : tensor<1x128xi1, #blocked1> -> tensor<32x128xi1, #blocked1> loc(#loc101) + %tmp5_54 = tt.broadcast %xmask_37 : tensor<1x128xi1, #blocked> -> tensor<32x128xi1, #blocked> loc(#loc101) + %tmp5_55 = arith.andi %tmp5_51, %tmp5_53 : tensor<32x128xi1, #blocked1> loc(#loc101) + %tmp5_56 = arith.andi %tmp5_52, %tmp5_54 : tensor<32x128xi1, #blocked> loc(#loc101) + %tmp5_57 = tt.broadcast %ymask : tensor<32x1xi1, #blocked1> -> tensor<32x128xi1, #blocked1> loc(#loc102) + %tmp5_58 = tt.broadcast %ymask_28 : tensor<32x1xi1, #blocked> -> tensor<32x128xi1, #blocked> loc(#loc102) + %tmp5_59 = arith.andi %tmp5_55, %tmp5_57 : tensor<32x128xi1, #blocked1> loc(#loc102) + %tmp5_60 = arith.andi %tmp5_56, %tmp5_58 : tensor<32x128xi1, #blocked> loc(#loc102) + %tmp5_61 = tt.load %tmp5_50, %tmp5_59, %cst_11 evictionPolicy = evict_last : tensor<32x128x!tt.ptr, #blocked1> loc(#loc103) + %tmp5_62 = ttg.convert_layout %tmp5_61 : tensor<32x128xbf16, #blocked1> -> tensor<32x128xbf16, #blocked> loc(#loc104) + %tmp5_63 = arith.extf %tmp5_62 : tensor<32x128xbf16, #blocked> to tensor<32x128xf32, #blocked> loc(#loc104) + %tmp7 = arith.muli %y1_38, %cst_5 : tensor<32x1xi32, #blocked> loc(#loc105) + %tmp7_64 = arith.addi %y0_39, %tmp7 : tensor<32x1xi32, #blocked> loc(#loc106) + %tmp7_65 = tt.splat %in_ptr1 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> loc(#loc107) + %tmp7_66 = tt.addptr %tmp7_65, %tmp7_64 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> loc(#loc107) + %tmp7_67 = tt.broadcast %tmp7_66 : tensor<32x1x!tt.ptr, #blocked> -> tensor<32x128x!tt.ptr, #blocked> loc(#loc107) + %tmp7_68 = tt.load %tmp7_67, %tmp5_60, %cst_12 evictionPolicy = evict_last : tensor<32x128x!tt.ptr, #blocked> loc(#loc108) + %tmp9 = arith.divf %tmp7_68, %cst_14 : tensor<32x128xf32, #blocked> loc(#loc109) + %tmp11 = arith.addf %tmp9, %cst_13 : tensor<32x128xf32, #blocked> loc(#loc110) + %tmp12 = tt.extern_elementwise %tmp11 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<32x128xf32, #blocked>) -> tensor<32x128xf32, #blocked> loc(#loc111) + %tmp13 = arith.mulf %tmp5_63, %tmp12 : tensor<32x128xf32, #blocked> loc(#loc112) + %tmp13_69 = ttg.convert_layout %tmp13 : tensor<32x128xf32, #blocked> -> tensor<32x128xf32, #blocked1> loc(#loc112) + %tmp14 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x128x!tt.ptr, #blocked1> loc(#loc113) + %tmp14_70 = tt.addptr %tmp14, %xindex_35 : tensor<1x128x!tt.ptr, #blocked1>, tensor<1x128xi32, #blocked1> loc(#loc113) + %tmp14_71 = tt.broadcast %tmp14_70 : tensor<1x128x!tt.ptr, #blocked1> -> tensor<32x128x!tt.ptr, #blocked1> loc(#loc113) + %tmp14_72 = tt.load %tmp14_71, %tmp5_59, %cst_11 evictionPolicy = evict_last : tensor<32x128x!tt.ptr, #blocked1> loc(#loc114) + %tmp14_73 = arith.extf %tmp14_72 : tensor<32x128xbf16, #blocked1> to tensor<32x128xf32, #blocked1> loc(#loc115) + %tmp16 = arith.mulf %tmp13_69, %tmp14_73 : tensor<32x128xf32, #blocked1> loc(#loc116) + %tmp20 = arith.cmpi sge, %tmp4, %cst_4 : tensor<32x1xi64, #blocked1> loc(#loc117) + %tmp20_74 = arith.cmpi sge, %tmp4_40, %cst_3 : tensor<32x1xi64, #blocked> loc(#loc117) + %tmp23 = arith.addi %y1, %cst_0 : tensor<32x1xi32, #blocked1> loc(#loc118) + %tmp23_75 = arith.addi %y1_38, %cst : tensor<32x1xi32, #blocked> loc(#loc118) + %tmp23_76 = arith.muli %tmp23, %cst_1 : tensor<32x1xi32, #blocked1> loc(#loc119) + %tmp23_77 = tt.broadcast %tmp23_76 : tensor<32x1xi32, #blocked1> -> tensor<32x128xi32, #blocked1> loc(#loc120) + %tmp23_78 = arith.addi %tmp5_45, %tmp23_77 : tensor<32x128xi32, #blocked1> loc(#loc120) + %tmp23_79 = tt.splat %in_ptr3 : !tt.ptr -> tensor<32x128x!tt.ptr, #blocked1> loc(#loc121) + %tmp23_80 = tt.addptr %tmp23_79, %tmp23_78 : tensor<32x128x!tt.ptr, #blocked1>, tensor<32x128xi32, #blocked1> loc(#loc121) + %tmp23_81 = tt.broadcast %tmp20 : tensor<32x1xi1, #blocked1> -> tensor<32x128xi1, #blocked1> loc(#loc122) + %tmp23_82 = tt.broadcast %tmp20_74 : tensor<32x1xi1, #blocked> -> tensor<32x128xi1, #blocked> loc(#loc122) + %tmp23_83 = arith.andi %tmp23_81, %tmp5_53 : tensor<32x128xi1, #blocked1> loc(#loc122) + %tmp23_84 = arith.andi %tmp23_82, %tmp5_54 : tensor<32x128xi1, #blocked> loc(#loc122) + %tmp23_85 = arith.andi %tmp23_83, %tmp5_57 : tensor<32x128xi1, #blocked1> loc(#loc123) + %tmp23_86 = arith.andi %tmp23_84, %tmp5_58 : tensor<32x128xi1, #blocked> loc(#loc123) + %tmp23_87 = tt.load %tmp23_80, %tmp23_85, %cst_11 evictionPolicy = evict_last : tensor<32x128x!tt.ptr, #blocked1> loc(#loc124) + %tmp23_88 = ttg.convert_layout %tmp23_87 : tensor<32x128xbf16, #blocked1> -> tensor<32x128xbf16, #blocked> loc(#loc125) + %tmp23_89 = arith.extf %tmp23_88 : tensor<32x128xbf16, #blocked> to tensor<32x128xf32, #blocked> loc(#loc125) + %tmp25 = arith.muli %tmp23_75, %cst_5 : tensor<32x1xi32, #blocked> loc(#loc126) + %tmp25_90 = arith.addi %y0_39, %tmp25 : tensor<32x1xi32, #blocked> loc(#loc127) + %tmp25_91 = tt.splat %in_ptr4 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> loc(#loc128) + %tmp25_92 = tt.addptr %tmp25_91, %tmp25_90 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> loc(#loc128) + %tmp25_93 = tt.broadcast %tmp25_92 : tensor<32x1x!tt.ptr, #blocked> -> tensor<32x128x!tt.ptr, #blocked> loc(#loc128) + %tmp25_94 = tt.load %tmp25_93, %tmp23_86, %cst_12 evictionPolicy = evict_last : tensor<32x128x!tt.ptr, #blocked> loc(#loc129) + %tmp27 = arith.divf %tmp25_94, %cst_14 : tensor<32x128xf32, #blocked> loc(#loc130) + %tmp29 = arith.addf %tmp27, %cst_13 : tensor<32x128xf32, #blocked> loc(#loc131) + %tmp30 = tt.extern_elementwise %tmp29 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<32x128xf32, #blocked>) -> tensor<32x128xf32, #blocked> loc(#loc132) + %tmp31 = arith.mulf %tmp23_89, %tmp30 : tensor<32x128xf32, #blocked> loc(#loc133) + %tmp31_95 = ttg.convert_layout %tmp31 : tensor<32x128xf32, #blocked> -> tensor<32x128xf32, #blocked1> loc(#loc133) + %tmp32 = tt.splat %in_ptr5 : !tt.ptr -> tensor<1x128x!tt.ptr, #blocked1> loc(#loc134) + %tmp32_96 = tt.addptr %tmp32, %xindex_35 : tensor<1x128x!tt.ptr, #blocked1>, tensor<1x128xi32, #blocked1> loc(#loc134) + %tmp32_97 = tt.broadcast %tmp32_96 : tensor<1x128x!tt.ptr, #blocked1> -> tensor<32x128x!tt.ptr, #blocked1> loc(#loc134) + %tmp32_98 = tt.load %tmp32_97, %tmp23_85, %cst_11 evictionPolicy = evict_last : tensor<32x128x!tt.ptr, #blocked1> loc(#loc135) + %tmp32_99 = arith.extf %tmp32_98 : tensor<32x128xbf16, #blocked1> to tensor<32x128xf32, #blocked1> loc(#loc136) + %tmp34 = arith.mulf %tmp31_95, %tmp32_99 : tensor<32x128xf32, #blocked1> loc(#loc137) + %tmp37 = arith.select %tmp23_81, %tmp34, %cst_15 : tensor<32x128xi1, #blocked1>, tensor<32x128xf32, #blocked1> loc(#loc138) + %tmp38 = arith.select %tmp5_51, %tmp16, %tmp37 : tensor<32x128xi1, #blocked1>, tensor<32x128xf32, #blocked1> loc(#loc141) + %0 = arith.muli %yindex_26, %cst_2 : tensor<32x1xi32, #blocked1> loc(#loc64) + %1 = tt.broadcast %0 : tensor<32x1xi32, #blocked1> -> tensor<32x128xi32, #blocked1> loc(#loc65) + %2 = arith.addi %tmp5_43, %1 : tensor<32x128xi32, #blocked1> loc(#loc65) + %3 = tt.splat %out_ptr0 : !tt.ptr -> tensor<32x128x!tt.ptr, #blocked1> loc(#loc66) + %4 = tt.addptr %3, %2 : tensor<32x128x!tt.ptr, #blocked1>, tensor<32x128xi32, #blocked1> loc(#loc66) + %5 = arith.andi %tmp5_53, %tmp5_57 : tensor<32x128xi1, #blocked1> loc(#loc67) + %6 = arith.truncf %tmp38 : tensor<32x128xf32, #blocked1> to tensor<32x128xbf16, #blocked1> loc(#loc68) + tt.store %4, %6, %5 : tensor<32x128x!tt.ptr, #blocked1> loc(#loc68) + tt.return loc(#loc69) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4) +#loc79 = loc("yoffset"(#loc2)) +#loc80 = loc("yoffset"(#loc3)) +#loc81 = loc("yoffset"(#loc4)) +#loc82 = loc("yoffset"(#loc5)) +#loc83 = loc("yoffset"(#loc6)) +#loc84 = loc("yoffset"(#loc7)) +#loc85 = loc("yindex"(#loc8)) +#loc86 = loc("yindex"(#loc9)) +#loc87 = loc("ymask"(#loc10)) +#loc88 = loc("xoffset"(#loc11)) +#loc89 = loc("xoffset"(#loc12)) +#loc90 = loc("xindex"(#loc13)) +#loc91 = loc("xindex"(#loc14)) +#loc92 = loc("xmask"(#loc15)) +#loc93 = loc("y1"(#loc16)) +#loc94 = loc("y0"(#loc17)) +#loc95 = loc("tmp4"(#loc18)) +#loc96 = loc("tmp5"(#loc19)) +#loc97 = loc("tmp5"(#loc20)) +#loc98 = loc("tmp5"(#loc21)) +#loc99 = loc("tmp5"(#loc22)) +#loc100 = loc("tmp5"(#loc23)) +#loc101 = loc("tmp5"(#loc24)) +#loc102 = loc("tmp5"(#loc25)) +#loc103 = loc("tmp5"(#loc26)) +#loc104 = loc("tmp5"(#loc27)) +#loc105 = loc("tmp7"(#loc28)) +#loc106 = loc("tmp7"(#loc29)) +#loc107 = loc("tmp7"(#loc30)) +#loc108 = loc("tmp7"(#loc31)) +#loc109 = loc("tmp9"(#loc32)) +#loc110 = loc("tmp11"(#loc33)) +#loc111 = loc("tmp12"(#loc34)) +#loc112 = loc("tmp13"(#loc35)) +#loc113 = loc("tmp14"(#loc36)) +#loc114 = loc("tmp14"(#loc37)) +#loc115 = loc("tmp14"(#loc38)) +#loc116 = loc("tmp16"(#loc39)) +#loc117 = loc("tmp20"(#loc40)) +#loc118 = loc("tmp23"(#loc41)) +#loc119 = loc("tmp23"(#loc42)) +#loc120 = loc("tmp23"(#loc43)) +#loc121 = loc("tmp23"(#loc44)) +#loc122 = loc("tmp23"(#loc45)) +#loc123 = loc("tmp23"(#loc46)) +#loc124 = loc("tmp23"(#loc47)) +#loc125 = loc("tmp23"(#loc48)) +#loc126 = loc("tmp25"(#loc49)) +#loc127 = loc("tmp25"(#loc50)) +#loc128 = loc("tmp25"(#loc51)) +#loc129 = loc("tmp25"(#loc52)) +#loc130 = loc("tmp27"(#loc53)) +#loc131 = loc("tmp29"(#loc54)) +#loc132 = loc("tmp30"(#loc55)) +#loc133 = loc("tmp31"(#loc56)) +#loc134 = loc("tmp32"(#loc57)) +#loc135 = loc("tmp32"(#loc58)) +#loc136 = loc("tmp32"(#loc59)) +#loc137 = loc("tmp34"(#loc60)) +#loc138 = loc("tmp37"(#loc61)) +#loc139 = loc("tmp38"(#loc62)) +#loc140 = loc("tmp19"(#loc63)) +#loc141 = loc(fused[#loc139, #loc140]) diff --git a/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttir b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..2e1bac7342609eb268e778dbdb3f0f27ac5b4f0b --- /dev/null +++ b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttir @@ -0,0 +1,256 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0) +#loc72 = loc("in_ptr0"(#loc)) +#loc73 = loc("in_ptr1"(#loc)) +#loc74 = loc("in_ptr2"(#loc)) +#loc75 = loc("in_ptr3"(#loc)) +#loc76 = loc("in_ptr4"(#loc)) +#loc77 = loc("in_ptr5"(#loc)) +#loc78 = loc("out_ptr0"(#loc)) +#loc79 = loc("ynumel"(#loc)) +#loc80 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<32x128xbf16> loc(#loc1) + %cst_0 = arith.constant dense<-256> : tensor<32x1xi32> loc(#loc1) + %cst_1 = arith.constant dense<9.99999997E-7> : tensor<32x128xf32> loc(#loc1) + %cst_2 = arith.constant dense<1.280000e+02> : tensor<32x128xf32> loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<32x128xf32> loc(#loc1) + %cst_4 = arith.constant dense<12288> : tensor<32x1xi32> loc(#loc1) + %cst_5 = arith.constant dense<128> : tensor<32x1xi32> loc(#loc1) + %cst_6 = arith.constant dense<256> : tensor<32x1xi64> loc(#loc1) + %cst_7 = arith.constant dense<32> : tensor<32x1xi32> loc(#loc1) + %xmask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc81) + %ymask = arith.constant dense<73728> : tensor<32x1xi32> loc(#loc82) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %yoffset = tt.get_program_id y : i32 loc(#loc83) + %yoffset_8 = tt.get_program_id z : i32 loc(#loc84) + %yoffset_9 = tt.get_num_programs y : i32 loc(#loc85) + %yoffset_10 = arith.muli %yoffset_8, %yoffset_9 : i32 loc(#loc86) + %yoffset_11 = arith.addi %yoffset, %yoffset_10 : i32 loc(#loc87) + %yoffset_12 = arith.muli %yoffset_11, %c32_i32 : i32 loc(#loc88) + %yindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc89) + %yindex_13 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc90) + %yindex_14 = tt.splat %yoffset_12 : i32 -> tensor<32x1xi32> loc(#loc91) + %yindex_15 = arith.addi %yindex_14, %yindex_13 : tensor<32x1xi32> loc(#loc91) + %ymask_16 = arith.cmpi slt, %yindex_15, %ymask : tensor<32x1xi32> loc(#loc82) + %xoffset = tt.get_program_id x : i32 loc(#loc92) + %xoffset_17 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc93) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc94) + %xindex_18 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc95) + %xindex_19 = tt.splat %xoffset_17 : i32 -> tensor<1x128xi32> loc(#loc96) + %xindex_20 = arith.addi %xindex_19, %xindex_18 : tensor<1x128xi32> loc(#loc96) + %xmask_21 = arith.cmpi slt, %xindex_20, %xmask : tensor<1x128xi32> loc(#loc81) + %y1 = arith.divsi %yindex_15, %cst_7 : tensor<32x1xi32> loc(#loc97) + %y0 = arith.remsi %yindex_15, %cst_7 : tensor<32x1xi32> loc(#loc98) + %tmp4 = arith.extsi %y1 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc99) + %tmp4_22 = arith.cmpi slt, %tmp4, %cst_6 : tensor<32x1xi64> loc(#loc99) + %tmp5 = arith.muli %y0, %cst_5 : tensor<32x1xi32> loc(#loc100) + %tmp5_23 = tt.broadcast %xindex_20 : tensor<1x128xi32> -> tensor<32x128xi32> loc(#loc101) + %tmp5_24 = tt.broadcast %tmp5 : tensor<32x1xi32> -> tensor<32x128xi32> loc(#loc101) + %tmp5_25 = arith.addi %tmp5_23, %tmp5_24 : tensor<32x128xi32> loc(#loc101) + %tmp5_26 = arith.muli %y1, %cst_4 : tensor<32x1xi32> loc(#loc102) + %tmp5_27 = tt.broadcast %tmp5_26 : tensor<32x1xi32> -> tensor<32x128xi32> loc(#loc103) + %tmp5_28 = arith.addi %tmp5_25, %tmp5_27 : tensor<32x128xi32> loc(#loc103) + %tmp5_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x128x!tt.ptr> loc(#loc104) + %tmp5_30 = tt.addptr %tmp5_29, %tmp5_28 : tensor<32x128x!tt.ptr>, tensor<32x128xi32> loc(#loc104) + %tmp5_31 = tt.broadcast %tmp4_22 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc105) + %tmp5_32 = tt.broadcast %xmask_21 : tensor<1x128xi1> -> tensor<32x128xi1> loc(#loc105) + %tmp5_33 = arith.andi %tmp5_31, %tmp5_32 : tensor<32x128xi1> loc(#loc105) + %tmp5_34 = tt.broadcast %ymask_16 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc106) + %tmp5_35 = arith.andi %tmp5_33, %tmp5_34 : tensor<32x128xi1> loc(#loc106) + %tmp5_36 = tt.load %tmp5_30, %tmp5_35, %cst evictionPolicy = evict_last : tensor<32x128x!tt.ptr> loc(#loc107) + %tmp5_37 = arith.extf %tmp5_36 : tensor<32x128xbf16> to tensor<32x128xf32> loc(#loc108) + %tmp7 = arith.muli %y1, %cst_7 : tensor<32x1xi32> loc(#loc109) + %tmp7_38 = arith.addi %y0, %tmp7 : tensor<32x1xi32> loc(#loc110) + %tmp7_39 = tt.splat %in_ptr1 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc111) + %tmp7_40 = tt.addptr %tmp7_39, %tmp7_38 : tensor<32x1x!tt.ptr>, tensor<32x1xi32> loc(#loc111) + %tmp7_41 = tt.broadcast %tmp7_40 : tensor<32x1x!tt.ptr> -> tensor<32x128x!tt.ptr> loc(#loc111) + %tmp7_42 = tt.load %tmp7_41, %tmp5_35, %cst_3 evictionPolicy = evict_last : tensor<32x128x!tt.ptr> loc(#loc112) + %tmp9 = arith.divf %tmp7_42, %cst_2 : tensor<32x128xf32> loc(#loc113) + %tmp11 = arith.addf %tmp9, %cst_1 : tensor<32x128xf32> loc(#loc114) + %tmp12 = tt.extern_elementwise %tmp11 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<32x128xf32>) -> tensor<32x128xf32> loc(#loc115) + %tmp13 = arith.mulf %tmp5_37, %tmp12 : tensor<32x128xf32> loc(#loc116) + %tmp14 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc117) + %tmp14_43 = tt.addptr %tmp14, %xindex_20 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc117) + %tmp14_44 = tt.broadcast %tmp14_43 : tensor<1x128x!tt.ptr> -> tensor<32x128x!tt.ptr> loc(#loc117) + %tmp14_45 = tt.load %tmp14_44, %tmp5_35, %cst evictionPolicy = evict_last : tensor<32x128x!tt.ptr> loc(#loc118) + %tmp14_46 = arith.extf %tmp14_45 : tensor<32x128xbf16> to tensor<32x128xf32> loc(#loc119) + %tmp16 = arith.mulf %tmp13, %tmp14_46 : tensor<32x128xf32> loc(#loc120) + %tmp19 = arith.select %tmp5_31, %tmp16, %cst_3 : tensor<32x128xi1>, tensor<32x128xf32> loc(#loc121) + %tmp20 = arith.cmpi sge, %tmp4, %cst_6 : tensor<32x1xi64> loc(#loc122) + %tmp23 = arith.addi %y1, %cst_0 : tensor<32x1xi32> loc(#loc123) + %tmp23_47 = arith.muli %tmp23, %cst_4 : tensor<32x1xi32> loc(#loc124) + %tmp23_48 = tt.broadcast %tmp23_47 : tensor<32x1xi32> -> tensor<32x128xi32> loc(#loc125) + %tmp23_49 = arith.addi %tmp5_25, %tmp23_48 : tensor<32x128xi32> loc(#loc125) + %tmp23_50 = tt.splat %in_ptr3 : !tt.ptr -> tensor<32x128x!tt.ptr> loc(#loc126) + %tmp23_51 = tt.addptr %tmp23_50, %tmp23_49 : tensor<32x128x!tt.ptr>, tensor<32x128xi32> loc(#loc126) + %tmp23_52 = tt.broadcast %tmp20 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc127) + %tmp23_53 = arith.andi %tmp23_52, %tmp5_32 : tensor<32x128xi1> loc(#loc127) + %tmp23_54 = arith.andi %tmp23_53, %tmp5_34 : tensor<32x128xi1> loc(#loc128) + %tmp23_55 = tt.load %tmp23_51, %tmp23_54, %cst evictionPolicy = evict_last : tensor<32x128x!tt.ptr> loc(#loc129) + %tmp23_56 = arith.extf %tmp23_55 : tensor<32x128xbf16> to tensor<32x128xf32> loc(#loc130) + %tmp25 = arith.muli %tmp23, %cst_7 : tensor<32x1xi32> loc(#loc131) + %tmp25_57 = arith.addi %y0, %tmp25 : tensor<32x1xi32> loc(#loc132) + %tmp25_58 = tt.splat %in_ptr4 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc133) + %tmp25_59 = tt.addptr %tmp25_58, %tmp25_57 : tensor<32x1x!tt.ptr>, tensor<32x1xi32> loc(#loc133) + %tmp25_60 = tt.broadcast %tmp25_59 : tensor<32x1x!tt.ptr> -> tensor<32x128x!tt.ptr> loc(#loc133) + %tmp25_61 = tt.load %tmp25_60, %tmp23_54, %cst_3 evictionPolicy = evict_last : tensor<32x128x!tt.ptr> loc(#loc134) + %tmp27 = arith.divf %tmp25_61, %cst_2 : tensor<32x128xf32> loc(#loc135) + %tmp29 = arith.addf %tmp27, %cst_1 : tensor<32x128xf32> loc(#loc136) + %tmp30 = tt.extern_elementwise %tmp29 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<32x128xf32>) -> tensor<32x128xf32> loc(#loc137) + %tmp31 = arith.mulf %tmp23_56, %tmp30 : tensor<32x128xf32> loc(#loc138) + %tmp32 = tt.splat %in_ptr5 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc139) + %tmp32_62 = tt.addptr %tmp32, %xindex_20 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc139) + %tmp32_63 = tt.broadcast %tmp32_62 : tensor<1x128x!tt.ptr> -> tensor<32x128x!tt.ptr> loc(#loc139) + %tmp32_64 = tt.load %tmp32_63, %tmp23_54, %cst evictionPolicy = evict_last : tensor<32x128x!tt.ptr> loc(#loc140) + %tmp32_65 = arith.extf %tmp32_64 : tensor<32x128xbf16> to tensor<32x128xf32> loc(#loc141) + %tmp34 = arith.mulf %tmp31, %tmp32_65 : tensor<32x128xf32> loc(#loc142) + %tmp37 = arith.select %tmp23_52, %tmp34, %cst_3 : tensor<32x128xi1>, tensor<32x128xf32> loc(#loc143) + %tmp38 = arith.select %tmp5_31, %tmp19, %tmp37 : tensor<32x128xi1>, tensor<32x128xf32> loc(#loc144) + %0 = arith.muli %yindex_15, %cst_5 : tensor<32x1xi32> loc(#loc66) + %1 = tt.broadcast %0 : tensor<32x1xi32> -> tensor<32x128xi32> loc(#loc67) + %2 = arith.addi %tmp5_23, %1 : tensor<32x128xi32> loc(#loc67) + %3 = tt.splat %out_ptr0 : !tt.ptr -> tensor<32x128x!tt.ptr> loc(#loc68) + %4 = tt.addptr %3, %2 : tensor<32x128x!tt.ptr>, tensor<32x128xi32> loc(#loc68) + %5 = arith.andi %tmp5_32, %tmp5_34 : tensor<32x128xi1> loc(#loc69) + %6 = arith.truncf %tmp38 : tensor<32x128xf32> to tensor<32x128xbf16> loc(#loc70) + tt.store %4, %6, %5 : tensor<32x128x!tt.ptr> loc(#loc70) + tt.return loc(#loc71) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:36) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:36) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4) +#loc81 = loc("xmask"(#loc2)) +#loc82 = loc("ymask"(#loc3)) +#loc83 = loc("yoffset"(#loc4)) +#loc84 = loc("yoffset"(#loc5)) +#loc85 = loc("yoffset"(#loc6)) +#loc86 = loc("yoffset"(#loc7)) +#loc87 = loc("yoffset"(#loc8)) +#loc88 = loc("yoffset"(#loc9)) +#loc89 = loc("yindex"(#loc10)) +#loc90 = loc("yindex"(#loc11)) +#loc91 = loc("yindex"(#loc12)) +#loc92 = loc("xoffset"(#loc13)) +#loc93 = loc("xoffset"(#loc14)) +#loc94 = loc("xindex"(#loc15)) +#loc95 = loc("xindex"(#loc16)) +#loc96 = loc("xindex"(#loc17)) +#loc97 = loc("y1"(#loc18)) +#loc98 = loc("y0"(#loc19)) +#loc99 = loc("tmp4"(#loc20)) +#loc100 = loc("tmp5"(#loc21)) +#loc101 = loc("tmp5"(#loc22)) +#loc102 = loc("tmp5"(#loc23)) +#loc103 = loc("tmp5"(#loc24)) +#loc104 = loc("tmp5"(#loc25)) +#loc105 = loc("tmp5"(#loc26)) +#loc106 = loc("tmp5"(#loc27)) +#loc107 = loc("tmp5"(#loc28)) +#loc108 = loc("tmp5"(#loc29)) +#loc109 = loc("tmp7"(#loc30)) +#loc110 = loc("tmp7"(#loc31)) +#loc111 = loc("tmp7"(#loc32)) +#loc112 = loc("tmp7"(#loc33)) +#loc113 = loc("tmp9"(#loc34)) +#loc114 = loc("tmp11"(#loc35)) +#loc115 = loc("tmp12"(#loc36)) +#loc116 = loc("tmp13"(#loc37)) +#loc117 = loc("tmp14"(#loc38)) +#loc118 = loc("tmp14"(#loc39)) +#loc119 = loc("tmp14"(#loc40)) +#loc120 = loc("tmp16"(#loc41)) +#loc121 = loc("tmp19"(#loc42)) +#loc122 = loc("tmp20"(#loc43)) +#loc123 = loc("tmp23"(#loc44)) +#loc124 = loc("tmp23"(#loc45)) +#loc125 = loc("tmp23"(#loc46)) +#loc126 = loc("tmp23"(#loc47)) +#loc127 = loc("tmp23"(#loc48)) +#loc128 = loc("tmp23"(#loc49)) +#loc129 = loc("tmp23"(#loc50)) +#loc130 = loc("tmp23"(#loc51)) +#loc131 = loc("tmp25"(#loc52)) +#loc132 = loc("tmp25"(#loc53)) +#loc133 = loc("tmp25"(#loc54)) +#loc134 = loc("tmp25"(#loc55)) +#loc135 = loc("tmp27"(#loc56)) +#loc136 = loc("tmp29"(#loc57)) +#loc137 = loc("tmp30"(#loc58)) +#loc138 = loc("tmp31"(#loc59)) +#loc139 = loc("tmp32"(#loc60)) +#loc140 = loc("tmp32"(#loc61)) +#loc141 = loc("tmp32"(#loc62)) +#loc142 = loc("tmp34"(#loc63)) +#loc143 = loc("tmp37"(#loc64)) +#loc144 = loc("tmp38"(#loc65)) diff --git a/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/__grp__triton_poi_fused_add_mul_1.json b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/__grp__triton_poi_fused_add_mul_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3189810da0d1a4d08562ec50104228dfd33d02e4 --- /dev/null +++ b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/__grp__triton_poi_fused_add_mul_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_add_mul_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.source", "triton_poi_fused_add_mul_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.ttir", "triton_poi_fused_add_mul_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.ttgir", "triton_poi_fused_add_mul_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.llir", "triton_poi_fused_add_mul_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.ptx", "triton_poi_fused_add_mul_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.cubin", "triton_poi_fused_add_mul_1.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.json"}} \ No newline at end of file diff --git a/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.cubin b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..6dd0e14859961e4591d128a27b5be77b8d7eeb03 Binary files /dev/null and b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.cubin differ diff --git a/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.json b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e74dd9bda134fba4e35c667f1e5c4090d8003e18 --- /dev/null +++ b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.json @@ -0,0 +1 @@ +{"hash": "b0864c125a209c83b5f156b3417dc442d9a6fdccf909d8b90fef13add9239ee6", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_add_mul_1"} \ No newline at end of file diff --git a/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.llir b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..f91df25b442f7de9dc8e952508a3a0c8a197e308 --- /dev/null +++ b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.llir @@ -0,0 +1,76 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_add_mul_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = shl i32 %8, 9, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = shl nuw nsw i32 %10, 1, !dbg !9 + %12 = and i32 %11, 510, !dbg !9 + %13 = or disjoint i32 %12, %9, !dbg !10 + %14 = srem i32 %13, 4096, !dbg !11 + %15 = sext i32 %13 to i64, !dbg !12 + %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !12 + %17 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %16) #2, !dbg !13 + %18 = bitcast i32 %17 to <2 x bfloat>, !dbg !13 + %19 = sext i32 %14 to i64, !dbg !14 + %20 = getelementptr bfloat, ptr addrspace(1) %1, i64 %19, !dbg !14 + %21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !15 + %22 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l"(ptr addrspace(1) %20, i64 %21) #2, !dbg !15 + %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !15 + %24 = getelementptr bfloat, ptr addrspace(1) %2, i64 %15, !dbg !16 + %25 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %24) #2, !dbg !17 + %26 = bitcast i32 %25 to <2 x bfloat>, !dbg !17 + %27 = getelementptr bfloat, ptr addrspace(1) %3, i64 %15, !dbg !18 + %28 = fpext <2 x bfloat> %18 to <2 x float>, !dbg !19 + %29 = fpext <2 x bfloat> %23 to <2 x float>, !dbg !20 + %30 = fpext <2 x bfloat> %26 to <2 x float>, !dbg !21 + %31 = fmul <2 x float> %29, %30, !dbg !22 + %32 = fadd <2 x float> %31, %28, !dbg !23 + %33 = fptrunc <2 x float> %32 to <2 x bfloat>, !dbg !24 + %34 = bitcast <2 x bfloat> %33 to i32, !dbg !24 + tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %34, ptr addrspace(1) %27) #2, !dbg !24 + ret void, !dbg !25 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_add_mul_1", linkageName: "triton_poi_fused_add_mul_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 19, scope: !4) +!12 = !DILocation(line: 25, column: 30, scope: !4) +!13 = !DILocation(line: 25, column: 35, scope: !4) +!14 = !DILocation(line: 26, column: 30, scope: !4) +!15 = !DILocation(line: 26, column: 35, scope: !4) +!16 = !DILocation(line: 27, column: 30, scope: !4) +!17 = !DILocation(line: 27, column: 35, scope: !4) +!18 = !DILocation(line: 30, column: 25, scope: !4) +!19 = !DILocation(line: 25, column: 44, scope: !4) +!20 = !DILocation(line: 26, column: 74, scope: !4) +!21 = !DILocation(line: 27, column: 44, scope: !4) +!22 = !DILocation(line: 28, column: 18, scope: !4) +!23 = !DILocation(line: 29, column: 18, scope: !4) +!24 = !DILocation(line: 30, column: 36, scope: !4) +!25 = !DILocation(line: 30, column: 4, scope: !4) diff --git a/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.ptx b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..bcb290e97f0ae9090d44db7171a261314c410328 --- /dev/null +++ b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.ptx @@ -0,0 +1,347 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused_add_mul_1 // -- Begin function triton_poi_fused_add_mul_1 + // @triton_poi_fused_add_mul_1 +.visible .entry triton_poi_fused_add_mul_1( + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_1_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_1_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_1_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_1_param_3, + .param .u32 triton_poi_fused_add_mul_1_param_4, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_1_param_5, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_1_param_6 +) +.reqntid 256 +{ + .reg .b16 %rs<7>; + .reg .b32 %r<24>; + .reg .b64 %rd<11>; + .loc 1 18 0 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:18:0 + +// %bb.0: + ld.param.b64 %rd6, [triton_poi_fused_add_mul_1_param_0]; + ld.param.b64 %rd7, [triton_poi_fused_add_mul_1_param_1]; +$L__tmp0: + .loc 1 20 28 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:20:28 + mov.u32 %r5, %ctaid.x; + .loc 1 20 33 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:20:33 + shl.b32 %r6, %r5, 9; + ld.param.b64 %rd8, [triton_poi_fused_add_mul_1_param_2]; + ld.param.b64 %rd9, [triton_poi_fused_add_mul_1_param_3]; + .loc 1 21 36 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:21:36 + mov.u32 %r7, %tid.x; + shl.b32 %r8, %r7, 1; + and.b32 %r9, %r8, 510; + .loc 1 21 23 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:21:23 + or.b32 %r10, %r9, %r6; + .loc 1 24 19 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:24:19 + bfe.s32 %r11, %r5, 22, 1; + shr.u32 %r12, %r11, 20; + add.s32 %r13, %r10, %r12; + and.b32 %r14, %r13, -4096; + sub.s32 %r15, %r10, %r14; + .loc 1 25 30 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:25:30 + mul.wide.s32 %rd10, %r10, 2; + add.s64 %rd1, %rd6, %rd10; + .loc 1 25 35 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:25:35 + // begin inline asm + mov.u32 %r1, 0x0; + ld.global.b32 { %r1 }, [ %rd1 + 0 ]; + // end inline asm + .loc 1 26 30 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:26:30 + mad.wide.s32 %rd2, %r15, 2, %rd7; + .loc 1 26 35 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:26:35 + // begin inline asm + mov.u64 %rd3, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd3, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r2, 0x0; + ld.global.L1::evict_last.L2::cache_hint.b32 { %r2 }, [ %rd2 + 0 ], %rd3; + // end inline asm + .loc 1 27 30 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:27:30 + add.s64 %rd4, %rd8, %rd10; + .loc 1 27 35 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:27:35 + // begin inline asm + mov.u32 %r3, 0x0; + ld.global.b32 { %r3 }, [ %rd4 + 0 ]; + // end inline asm + .loc 1 30 25 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:30:25 + add.s64 %rd5, %rd9, %rd10; + .loc 1 25 44 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:25:44 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r16, %rs2; + cvt.f32.bf16 %r17, %rs1; + .loc 1 26 74 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:26:74 + mov.b32 {%rs3, %rs4}, %r2; + cvt.f32.bf16 %r18, %rs4; + cvt.f32.bf16 %r19, %rs3; + .loc 1 27 44 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:27:44 + mov.b32 {%rs5, %rs6}, %r3; + cvt.f32.bf16 %r20, %rs6; + cvt.f32.bf16 %r21, %rs5; + .loc 1 29 18 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:29:18 + fma.rn.f32 %r22, %r19, %r21, %r17; + fma.rn.f32 %r23, %r18, %r20, %r16; + .loc 1 30 36 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:30:36 + cvt.rn.bf16x2.f32 %r4, %r23, %r22; + // begin inline asm + st.global.b32 [ %rd5 + 0 ], { %r4 }; + // end inline asm + .loc 1 30 4 // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:30:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 55 +.b8 102 +.b8 102 +.b8 52 +.b8 105 +.b8 98 +.b8 54 +.b8 54 +.b8 53 +.b8 50 +.b8 111 +.b8 106 +.b8 108 +.b8 108 +.b8 117 +.b8 116 +.b8 109 +.b8 52 +.b8 99 +.b8 55 +.b8 109 +.b8 107 +.b8 122 +.b8 122 +.b8 112 +.b8 121 +.b8 98 +.b8 111 +.b8 110 +.b8 100 +.b8 51 +.b8 112 +.b8 97 +.b8 103 +.b8 117 +.b8 51 +.b8 103 +.b8 108 +.b8 115 +.b8 112 +.b8 119 +.b8 51 +.b8 115 +.b8 122 +.b8 116 +.b8 107 +.b8 102 +.b8 101 +.b8 50 +.b8 122 +.b8 97 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 55 +.b8 102 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.source b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.source new file mode 100644 index 0000000000000000000000000000000000000000..b1626afe5c2f197738fbbe8f4779fff6f72eac1d --- /dev/null +++ b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.source @@ -0,0 +1,82 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":18:0) +#loc22 = loc("in_ptr0"(#loc)) +#loc23 = loc("in_ptr1"(#loc)) +#loc24 = loc("in_ptr2"(#loc)) +#loc25 = loc("out_ptr0"(#loc)) +#loc26 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_mul_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 8388608 : i32 loc(#loc27) + %xoffset = tt.get_program_id x : i32 loc(#loc28) + %xoffset_1 = arith.constant 512 : i32 loc(#loc29) + %xoffset_2 = arith.constant 512 : i32 loc(#loc29) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc29) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc30) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc31) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc31) + %xmask = arith.constant true loc(#loc32) + %xmask_6 = arith.constant dense : tensor<512xi1> loc(#loc32) + %x0 = arith.constant 4096 : i32 loc(#loc33) + %x0_7 = arith.constant 4096 : i32 loc(#loc33) + %x0_8 = arith.constant dense<4096> : tensor<512xi32> loc(#loc33) + %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<512xi32> loc(#loc33) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc34) + %tmp0_10 = tt.addptr %tmp0, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc34) + %tmp0_11 = tt.load %tmp0_10 : tensor<512x!tt.ptr> loc(#loc35) + %tmp0_12 = arith.extf %tmp0_11 : tensor<512xbf16> to tensor<512xf32> loc(#loc36) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc37) + %tmp1_13 = tt.addptr %tmp1, %x0_9 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc37) + %tmp1_14 = tt.load %tmp1_13 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc38) + %tmp1_15 = arith.extf %tmp1_14 : tensor<512xbf16> to tensor<512xf32> loc(#loc39) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc40) + %tmp2_16 = tt.addptr %tmp2, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc40) + %tmp2_17 = tt.load %tmp2_16 : tensor<512x!tt.ptr> loc(#loc41) + %tmp2_18 = arith.extf %tmp2_17 : tensor<512xbf16> to tensor<512xf32> loc(#loc42) + %tmp3 = arith.mulf %tmp1_15, %tmp2_18 : tensor<512xf32> loc(#loc43) + %tmp4 = arith.addf %tmp0_12, %tmp3 : tensor<512xf32> loc(#loc44) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc19) + %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc19) + %2 = arith.truncf %tmp4 : tensor<512xf32> to tensor<512xbf16> loc(#loc20) + tt.store %1, %2 : tensor<512x!tt.ptr> loc(#loc20) + tt.return loc(#loc21) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":22:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":24:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:30) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:35) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:44) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:30) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:35) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:74) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:35) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:44) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":28:18) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":29:18) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:25) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:36) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:4) +#loc27 = loc("xnumel"(#loc1)) +#loc28 = loc("xoffset"(#loc2)) +#loc29 = loc("xoffset"(#loc3)) +#loc30 = loc("xindex"(#loc4)) +#loc31 = loc("xindex"(#loc5)) +#loc32 = loc("xmask"(#loc6)) +#loc33 = loc("x0"(#loc7)) +#loc34 = loc("tmp0"(#loc8)) +#loc35 = loc("tmp0"(#loc9)) +#loc36 = loc("tmp0"(#loc10)) +#loc37 = loc("tmp1"(#loc11)) +#loc38 = loc("tmp1"(#loc12)) +#loc39 = loc("tmp1"(#loc13)) +#loc40 = loc("tmp2"(#loc14)) +#loc41 = loc("tmp2"(#loc15)) +#loc42 = loc("tmp2"(#loc16)) +#loc43 = loc("tmp3"(#loc17)) +#loc44 = loc("tmp4"(#loc18)) diff --git a/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.ttgir b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..436155cf7787907392c12b3a4a4322da3c4c0914 --- /dev/null +++ b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.ttgir @@ -0,0 +1,74 @@ +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":18:0) +#loc21 = loc("in_ptr0"(#loc)) +#loc22 = loc("in_ptr1"(#loc)) +#loc23 = loc("in_ptr2"(#loc)) +#loc24 = loc("out_ptr0"(#loc)) +#loc25 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_add_mul_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4096> : tensor<512xi32, #blocked> loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc26) + %xoffset_0 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc27) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc28) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<512xi32, #blocked> loc(#loc29) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<512xi32, #blocked> loc(#loc29) + %x0 = arith.remsi %xindex_2, %cst : tensor<512xi32, #blocked> loc(#loc30) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc31) + %tmp0_3 = tt.addptr %tmp0, %xindex_2 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc31) + %tmp0_4 = tt.load %tmp0_3 : tensor<512x!tt.ptr, #blocked> loc(#loc32) + %tmp0_5 = arith.extf %tmp0_4 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc33) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc34) + %tmp1_6 = tt.addptr %tmp1, %x0 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc34) + %tmp1_7 = tt.load %tmp1_6 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc35) + %tmp1_8 = arith.extf %tmp1_7 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc36) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc37) + %tmp2_9 = tt.addptr %tmp2, %xindex_2 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc37) + %tmp2_10 = tt.load %tmp2_9 : tensor<512x!tt.ptr, #blocked> loc(#loc38) + %tmp2_11 = arith.extf %tmp2_10 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc39) + %tmp3 = arith.mulf %tmp1_8, %tmp2_11 : tensor<512xf32, #blocked> loc(#loc40) + %tmp4 = arith.addf %tmp0_5, %tmp3 : tensor<512xf32, #blocked> loc(#loc41) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc18) + %1 = tt.addptr %0, %xindex_2 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc18) + %2 = arith.truncf %tmp4 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> loc(#loc19) + tt.store %1, %2 : tensor<512x!tt.ptr, #blocked> loc(#loc19) + tt.return loc(#loc20) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":24:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:30) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:35) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:44) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:30) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:74) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:30) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:35) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:44) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":28:18) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":29:18) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:25) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:36) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:4) +#loc26 = loc("xoffset"(#loc2)) +#loc27 = loc("xoffset"(#loc3)) +#loc28 = loc("xindex"(#loc4)) +#loc29 = loc("xindex"(#loc5)) +#loc30 = loc("x0"(#loc6)) +#loc31 = loc("tmp0"(#loc7)) +#loc32 = loc("tmp0"(#loc8)) +#loc33 = loc("tmp0"(#loc9)) +#loc34 = loc("tmp1"(#loc10)) +#loc35 = loc("tmp1"(#loc11)) +#loc36 = loc("tmp1"(#loc12)) +#loc37 = loc("tmp2"(#loc13)) +#loc38 = loc("tmp2"(#loc14)) +#loc39 = loc("tmp2"(#loc15)) +#loc40 = loc("tmp3"(#loc16)) +#loc41 = loc("tmp4"(#loc17)) diff --git a/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.ttir b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..3ebe3ece72fb3c5354ff98cef047a82986069276 --- /dev/null +++ b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.ttir @@ -0,0 +1,73 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":18:0) +#loc21 = loc("in_ptr0"(#loc)) +#loc22 = loc("in_ptr1"(#loc)) +#loc23 = loc("in_ptr2"(#loc)) +#loc24 = loc("out_ptr0"(#loc)) +#loc25 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_mul_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %x0 = arith.constant dense<4096> : tensor<512xi32> loc(#loc26) + %c512_i32 = arith.constant 512 : i32 loc(#loc2) + %xoffset = tt.get_program_id x : i32 loc(#loc27) + %xoffset_0 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc28) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc29) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<512xi32> loc(#loc30) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<512xi32> loc(#loc30) + %x0_3 = arith.remsi %xindex_2, %x0 : tensor<512xi32> loc(#loc26) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc31) + %tmp0_4 = tt.addptr %tmp0, %xindex_2 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc31) + %tmp0_5 = tt.load %tmp0_4 : tensor<512x!tt.ptr> loc(#loc32) + %tmp0_6 = arith.extf %tmp0_5 : tensor<512xbf16> to tensor<512xf32> loc(#loc33) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc34) + %tmp1_7 = tt.addptr %tmp1, %x0_3 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc34) + %tmp1_8 = tt.load %tmp1_7 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc35) + %tmp1_9 = arith.extf %tmp1_8 : tensor<512xbf16> to tensor<512xf32> loc(#loc36) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc37) + %tmp2_10 = tt.addptr %tmp2, %xindex_2 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc37) + %tmp2_11 = tt.load %tmp2_10 : tensor<512x!tt.ptr> loc(#loc38) + %tmp2_12 = arith.extf %tmp2_11 : tensor<512xbf16> to tensor<512xf32> loc(#loc39) + %tmp3 = arith.mulf %tmp1_9, %tmp2_12 : tensor<512xf32> loc(#loc40) + %tmp4 = arith.addf %tmp0_6, %tmp3 : tensor<512xf32> loc(#loc41) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc18) + %1 = tt.addptr %0, %xindex_2 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc18) + %2 = arith.truncf %tmp4 : tensor<512xf32> to tensor<512xbf16> loc(#loc19) + tt.store %1, %2 : tensor<512x!tt.ptr> loc(#loc19) + tt.return loc(#loc20) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":24:19) +#loc2 = loc(unknown) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":20:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":20:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":21:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":21:23) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:30) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:35) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:44) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:30) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:74) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:30) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:35) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:44) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":28:18) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":29:18) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:25) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:36) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:4) +#loc26 = loc("x0"(#loc1)) +#loc27 = loc("xoffset"(#loc3)) +#loc28 = loc("xoffset"(#loc4)) +#loc29 = loc("xindex"(#loc5)) +#loc30 = loc("xindex"(#loc6)) +#loc31 = loc("tmp0"(#loc7)) +#loc32 = loc("tmp0"(#loc8)) +#loc33 = loc("tmp0"(#loc9)) +#loc34 = loc("tmp1"(#loc10)) +#loc35 = loc("tmp1"(#loc11)) +#loc36 = loc("tmp1"(#loc12)) +#loc37 = loc("tmp2"(#loc13)) +#loc38 = loc("tmp2"(#loc14)) +#loc39 = loc("tmp2"(#loc15)) +#loc40 = loc("tmp3"(#loc16)) +#loc41 = loc("tmp4"(#loc17)) diff --git a/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/__grp__triton_poi_fused_add_mul_0.json b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/__grp__triton_poi_fused_add_mul_0.json new file mode 100644 index 0000000000000000000000000000000000000000..adbac3ad18d8fc51d1ebbf8e6eaa7c44e3441c1b --- /dev/null +++ b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/__grp__triton_poi_fused_add_mul_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_add_mul_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.source", "triton_poi_fused_add_mul_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.ttir", "triton_poi_fused_add_mul_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.ttgir", "triton_poi_fused_add_mul_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.llir", "triton_poi_fused_add_mul_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.ptx", "triton_poi_fused_add_mul_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.cubin", "triton_poi_fused_add_mul_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.json"}} \ No newline at end of file diff --git a/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.cubin b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..68099c11f78c7b0646c1d5eb47549b05fb4183ae Binary files /dev/null and b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.cubin differ diff --git a/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.json b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.json new file mode 100644 index 0000000000000000000000000000000000000000..89724be249fdd454fe11a667e8f58971359a3253 --- /dev/null +++ b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.json @@ -0,0 +1 @@ +{"hash": "b25801ca8239072139760a6056ec916d8de364dfd19b5ff2e3a1a12928f02384", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_add_mul_0"} \ No newline at end of file diff --git a/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.llir b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..ff716b11651253c08f2152f2e99adadf6bac7f64 --- /dev/null +++ b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.llir @@ -0,0 +1,118 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_add_mul_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = shl i32 %8, 10, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = shl nuw nsw i32 %10, 3, !dbg !9 + %12 = and i32 %11, 1016, !dbg !9 + %13 = or disjoint i32 %12, %9, !dbg !10 + %14 = srem i32 %13, 4096, !dbg !11 + %15 = sext i32 %13 to i64, !dbg !12 + %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !12 + %17 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %16) #2, !dbg !13 + %18 = extractvalue { i32, i32, i32, i32 } %17, 0, !dbg !13 + %19 = bitcast i32 %18 to <2 x bfloat>, !dbg !13 + %20 = extractvalue { i32, i32, i32, i32 } %17, 1, !dbg !13 + %21 = bitcast i32 %20 to <2 x bfloat>, !dbg !13 + %22 = extractvalue { i32, i32, i32, i32 } %17, 2, !dbg !13 + %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !13 + %24 = extractvalue { i32, i32, i32, i32 } %17, 3, !dbg !13 + %25 = bitcast i32 %24 to <2 x bfloat>, !dbg !13 + %26 = sext i32 %14 to i64, !dbg !14 + %27 = getelementptr bfloat, ptr addrspace(1) %1, i64 %26, !dbg !14 + %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !15 + %29 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ], $5;", "=r,=r,=r,=r,l,l"(ptr addrspace(1) %27, i64 %28) #2, !dbg !15 + %30 = extractvalue { i32, i32, i32, i32 } %29, 0, !dbg !15 + %31 = bitcast i32 %30 to <2 x bfloat>, !dbg !15 + %32 = extractvalue { i32, i32, i32, i32 } %29, 1, !dbg !15 + %33 = bitcast i32 %32 to <2 x bfloat>, !dbg !15 + %34 = extractvalue { i32, i32, i32, i32 } %29, 2, !dbg !15 + %35 = bitcast i32 %34 to <2 x bfloat>, !dbg !15 + %36 = extractvalue { i32, i32, i32, i32 } %29, 3, !dbg !15 + %37 = bitcast i32 %36 to <2 x bfloat>, !dbg !15 + %38 = getelementptr bfloat, ptr addrspace(1) %2, i64 %15, !dbg !16 + %39 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %38) #2, !dbg !17 + %40 = extractvalue { i32, i32, i32, i32 } %39, 0, !dbg !17 + %41 = bitcast i32 %40 to <2 x bfloat>, !dbg !17 + %42 = extractvalue { i32, i32, i32, i32 } %39, 1, !dbg !17 + %43 = bitcast i32 %42 to <2 x bfloat>, !dbg !17 + %44 = extractvalue { i32, i32, i32, i32 } %39, 2, !dbg !17 + %45 = bitcast i32 %44 to <2 x bfloat>, !dbg !17 + %46 = extractvalue { i32, i32, i32, i32 } %39, 3, !dbg !17 + %47 = bitcast i32 %46 to <2 x bfloat>, !dbg !17 + %48 = getelementptr bfloat, ptr addrspace(1) %3, i64 %15, !dbg !18 + %49 = fpext <2 x bfloat> %19 to <2 x float>, !dbg !19 + %50 = fpext <2 x bfloat> %31 to <2 x float>, !dbg !20 + %51 = fpext <2 x bfloat> %41 to <2 x float>, !dbg !21 + %52 = fmul <2 x float> %50, %51, !dbg !22 + %53 = fadd <2 x float> %52, %49, !dbg !23 + %54 = fptrunc <2 x float> %53 to <2 x bfloat>, !dbg !24 + %55 = fpext <2 x bfloat> %21 to <2 x float>, !dbg !19 + %56 = fpext <2 x bfloat> %33 to <2 x float>, !dbg !20 + %57 = fpext <2 x bfloat> %43 to <2 x float>, !dbg !21 + %58 = fmul <2 x float> %56, %57, !dbg !22 + %59 = fadd <2 x float> %58, %55, !dbg !23 + %60 = fptrunc <2 x float> %59 to <2 x bfloat>, !dbg !24 + %61 = fpext <2 x bfloat> %23 to <2 x float>, !dbg !19 + %62 = fpext <2 x bfloat> %35 to <2 x float>, !dbg !20 + %63 = fpext <2 x bfloat> %45 to <2 x float>, !dbg !21 + %64 = fmul <2 x float> %62, %63, !dbg !22 + %65 = fadd <2 x float> %64, %61, !dbg !23 + %66 = fptrunc <2 x float> %65 to <2 x bfloat>, !dbg !24 + %67 = fpext <2 x bfloat> %25 to <2 x float>, !dbg !19 + %68 = fpext <2 x bfloat> %37 to <2 x float>, !dbg !20 + %69 = fpext <2 x bfloat> %47 to <2 x float>, !dbg !21 + %70 = fmul <2 x float> %68, %69, !dbg !22 + %71 = fadd <2 x float> %70, %67, !dbg !23 + %72 = fptrunc <2 x float> %71 to <2 x bfloat>, !dbg !24 + %73 = bitcast <2 x bfloat> %54 to i32, !dbg !24 + %74 = bitcast <2 x bfloat> %60 to i32, !dbg !24 + %75 = bitcast <2 x bfloat> %66 to i32, !dbg !24 + %76 = bitcast <2 x bfloat> %72 to i32, !dbg !24 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %73, i32 %74, i32 %75, i32 %76, ptr addrspace(1) %48) #2, !dbg !24 + ret void, !dbg !25 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_add_mul_0", linkageName: "triton_poi_fused_add_mul_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 19, scope: !4) +!12 = !DILocation(line: 25, column: 30, scope: !4) +!13 = !DILocation(line: 25, column: 35, scope: !4) +!14 = !DILocation(line: 26, column: 30, scope: !4) +!15 = !DILocation(line: 26, column: 35, scope: !4) +!16 = !DILocation(line: 27, column: 30, scope: !4) +!17 = !DILocation(line: 27, column: 35, scope: !4) +!18 = !DILocation(line: 30, column: 25, scope: !4) +!19 = !DILocation(line: 25, column: 44, scope: !4) +!20 = !DILocation(line: 26, column: 74, scope: !4) +!21 = !DILocation(line: 27, column: 44, scope: !4) +!22 = !DILocation(line: 28, column: 18, scope: !4) +!23 = !DILocation(line: 29, column: 18, scope: !4) +!24 = !DILocation(line: 30, column: 36, scope: !4) +!25 = !DILocation(line: 30, column: 4, scope: !4) diff --git a/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.ptx b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..44954ddf77b9b7d045f1e8712658a03bca88f56f --- /dev/null +++ b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.ptx @@ -0,0 +1,407 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused_add_mul_0 // -- Begin function triton_poi_fused_add_mul_0 + // @triton_poi_fused_add_mul_0 +.visible .entry triton_poi_fused_add_mul_0( + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_3, + .param .u32 triton_poi_fused_add_mul_0_param_4, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_5, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_6 +) +.reqntid 128 +{ + .reg .b16 %rs<25>; + .reg .b32 %r<60>; + .reg .b64 %rd<11>; + .loc 1 18 0 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:18:0 + +// %bb.0: + ld.param.b64 %rd6, [triton_poi_fused_add_mul_0_param_0]; + ld.param.b64 %rd7, [triton_poi_fused_add_mul_0_param_1]; +$L__tmp0: + .loc 1 20 28 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:20:28 + mov.u32 %r17, %ctaid.x; + .loc 1 20 33 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:20:33 + shl.b32 %r18, %r17, 10; + ld.param.b64 %rd8, [triton_poi_fused_add_mul_0_param_2]; + ld.param.b64 %rd9, [triton_poi_fused_add_mul_0_param_3]; + .loc 1 21 36 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:21:36 + mov.u32 %r19, %tid.x; + shl.b32 %r20, %r19, 3; + and.b32 %r21, %r20, 1016; + .loc 1 21 23 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:21:23 + or.b32 %r22, %r21, %r18; + .loc 1 24 19 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:24:19 + bfe.s32 %r23, %r17, 21, 1; + shr.u32 %r24, %r23, 20; + add.s32 %r25, %r22, %r24; + and.b32 %r26, %r25, -4096; + sub.s32 %r27, %r22, %r26; + .loc 1 25 30 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:25:30 + mul.wide.s32 %rd10, %r22, 2; + add.s64 %rd1, %rd6, %rd10; + .loc 1 25 35 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:25:35 + // begin inline asm + mov.u32 %r1, 0x0; + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + ld.global.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ]; + // end inline asm + .loc 1 26 30 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:26:30 + mad.wide.s32 %rd2, %r27, 2, %rd7; + .loc 1 26 35 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:26:35 + // begin inline asm + mov.u64 %rd3, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd3, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r5, 0x0; + mov.u32 %r6, 0x0; + mov.u32 %r7, 0x0; + mov.u32 %r8, 0x0; + ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r5, %r6, %r7, %r8 }, [ %rd2 + 0 ], %rd3; + // end inline asm + .loc 1 27 30 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:27:30 + add.s64 %rd4, %rd8, %rd10; + .loc 1 27 35 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:27:35 + // begin inline asm + mov.u32 %r9, 0x0; + mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + mov.u32 %r12, 0x0; + ld.global.v4.b32 { %r9, %r10, %r11, %r12 }, [ %rd4 + 0 ]; + // end inline asm + .loc 1 30 25 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:30:25 + add.s64 %rd5, %rd9, %rd10; + .loc 1 25 44 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:25:44 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r28, %rs2; + cvt.f32.bf16 %r29, %rs1; + .loc 1 26 74 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:26:74 + mov.b32 {%rs3, %rs4}, %r5; + cvt.f32.bf16 %r30, %rs4; + cvt.f32.bf16 %r31, %rs3; + .loc 1 27 44 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:27:44 + mov.b32 {%rs5, %rs6}, %r9; + cvt.f32.bf16 %r32, %rs6; + cvt.f32.bf16 %r33, %rs5; + .loc 1 29 18 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:29:18 + fma.rn.f32 %r34, %r31, %r33, %r29; + fma.rn.f32 %r35, %r30, %r32, %r28; + .loc 1 30 36 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:30:36 + cvt.rn.bf16x2.f32 %r13, %r35, %r34; + .loc 1 25 44 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:25:44 + mov.b32 {%rs7, %rs8}, %r2; + cvt.f32.bf16 %r36, %rs8; + cvt.f32.bf16 %r37, %rs7; + .loc 1 26 74 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:26:74 + mov.b32 {%rs9, %rs10}, %r6; + cvt.f32.bf16 %r38, %rs10; + cvt.f32.bf16 %r39, %rs9; + .loc 1 27 44 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:27:44 + mov.b32 {%rs11, %rs12}, %r10; + cvt.f32.bf16 %r40, %rs12; + cvt.f32.bf16 %r41, %rs11; + .loc 1 29 18 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:29:18 + fma.rn.f32 %r42, %r39, %r41, %r37; + fma.rn.f32 %r43, %r38, %r40, %r36; + .loc 1 30 36 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:30:36 + cvt.rn.bf16x2.f32 %r14, %r43, %r42; + .loc 1 25 44 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:25:44 + mov.b32 {%rs13, %rs14}, %r3; + cvt.f32.bf16 %r44, %rs14; + cvt.f32.bf16 %r45, %rs13; + .loc 1 26 74 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:26:74 + mov.b32 {%rs15, %rs16}, %r7; + cvt.f32.bf16 %r46, %rs16; + cvt.f32.bf16 %r47, %rs15; + .loc 1 27 44 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:27:44 + mov.b32 {%rs17, %rs18}, %r11; + cvt.f32.bf16 %r48, %rs18; + cvt.f32.bf16 %r49, %rs17; + .loc 1 29 18 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:29:18 + fma.rn.f32 %r50, %r47, %r49, %r45; + fma.rn.f32 %r51, %r46, %r48, %r44; + .loc 1 30 36 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:30:36 + cvt.rn.bf16x2.f32 %r15, %r51, %r50; + .loc 1 25 44 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:25:44 + mov.b32 {%rs19, %rs20}, %r4; + cvt.f32.bf16 %r52, %rs20; + cvt.f32.bf16 %r53, %rs19; + .loc 1 26 74 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:26:74 + mov.b32 {%rs21, %rs22}, %r8; + cvt.f32.bf16 %r54, %rs22; + cvt.f32.bf16 %r55, %rs21; + .loc 1 27 44 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:27:44 + mov.b32 {%rs23, %rs24}, %r12; + cvt.f32.bf16 %r56, %rs24; + cvt.f32.bf16 %r57, %rs23; + .loc 1 29 18 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:29:18 + fma.rn.f32 %r58, %r55, %r57, %r53; + fma.rn.f32 %r59, %r54, %r56, %r52; + .loc 1 30 36 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:30:36 + cvt.rn.bf16x2.f32 %r16, %r59, %r58; + // begin inline asm + st.global.v4.b32 [ %rd5 + 0 ], { %r13, %r14, %r15, %r16 }; + // end inline asm + .loc 1 30 4 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:30:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 120 +.b8 106 +.b8 52 +.b8 112 +.b8 53 +.b8 51 +.b8 104 +.b8 111 +.b8 116 +.b8 118 +.b8 119 +.b8 51 +.b8 51 +.b8 54 +.b8 119 +.b8 52 +.b8 106 +.b8 54 +.b8 106 +.b8 54 +.b8 110 +.b8 108 +.b8 121 +.b8 100 +.b8 119 +.b8 120 +.b8 122 +.b8 114 +.b8 115 +.b8 52 +.b8 104 +.b8 104 +.b8 107 +.b8 106 +.b8 52 +.b8 50 +.b8 104 +.b8 111 +.b8 102 +.b8 108 +.b8 111 +.b8 116 +.b8 50 +.b8 110 +.b8 115 +.b8 122 +.b8 113 +.b8 122 +.b8 113 +.b8 51 +.b8 117 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 120 +.b8 106 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.source b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.source new file mode 100644 index 0000000000000000000000000000000000000000..9540e6aec96ba1c2af2514155a8ddd8b706766c5 --- /dev/null +++ b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.source @@ -0,0 +1,82 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":18:0) +#loc22 = loc("in_ptr0"(#loc)) +#loc23 = loc("in_ptr1"(#loc)) +#loc24 = loc("in_ptr2"(#loc)) +#loc25 = loc("out_ptr0"(#loc)) +#loc26 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_mul_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 9437184 : i32 loc(#loc27) + %xoffset = tt.get_program_id x : i32 loc(#loc28) + %xoffset_1 = arith.constant 1024 : i32 loc(#loc29) + %xoffset_2 = arith.constant 1024 : i32 loc(#loc29) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc29) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc30) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc31) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc31) + %xmask = arith.constant true loc(#loc32) + %xmask_6 = arith.constant dense : tensor<1024xi1> loc(#loc32) + %x0 = arith.constant 4096 : i32 loc(#loc33) + %x0_7 = arith.constant 4096 : i32 loc(#loc33) + %x0_8 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc33) + %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<1024xi32> loc(#loc33) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc34) + %tmp0_10 = tt.addptr %tmp0, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc34) + %tmp0_11 = tt.load %tmp0_10 : tensor<1024x!tt.ptr> loc(#loc35) + %tmp0_12 = arith.extf %tmp0_11 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc36) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc37) + %tmp1_13 = tt.addptr %tmp1, %x0_9 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc37) + %tmp1_14 = tt.load %tmp1_13 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc38) + %tmp1_15 = arith.extf %tmp1_14 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc39) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc40) + %tmp2_16 = tt.addptr %tmp2, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc40) + %tmp2_17 = tt.load %tmp2_16 : tensor<1024x!tt.ptr> loc(#loc41) + %tmp2_18 = arith.extf %tmp2_17 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc42) + %tmp3 = arith.mulf %tmp1_15, %tmp2_18 : tensor<1024xf32> loc(#loc43) + %tmp4 = arith.addf %tmp0_12, %tmp3 : tensor<1024xf32> loc(#loc44) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc19) + %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc19) + %2 = arith.truncf %tmp4 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc20) + tt.store %1, %2 : tensor<1024x!tt.ptr> loc(#loc20) + tt.return loc(#loc21) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":22:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":24:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:30) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:35) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:44) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:30) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:35) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:74) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:35) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:44) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":28:18) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":29:18) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:25) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:36) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:4) +#loc27 = loc("xnumel"(#loc1)) +#loc28 = loc("xoffset"(#loc2)) +#loc29 = loc("xoffset"(#loc3)) +#loc30 = loc("xindex"(#loc4)) +#loc31 = loc("xindex"(#loc5)) +#loc32 = loc("xmask"(#loc6)) +#loc33 = loc("x0"(#loc7)) +#loc34 = loc("tmp0"(#loc8)) +#loc35 = loc("tmp0"(#loc9)) +#loc36 = loc("tmp0"(#loc10)) +#loc37 = loc("tmp1"(#loc11)) +#loc38 = loc("tmp1"(#loc12)) +#loc39 = loc("tmp1"(#loc13)) +#loc40 = loc("tmp2"(#loc14)) +#loc41 = loc("tmp2"(#loc15)) +#loc42 = loc("tmp2"(#loc16)) +#loc43 = loc("tmp3"(#loc17)) +#loc44 = loc("tmp4"(#loc18)) diff --git a/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.ttgir b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..63e2f8fa8bf6aa63baaa5418d52ef014ce10516d --- /dev/null +++ b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.ttgir @@ -0,0 +1,74 @@ +#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":18:0) +#loc21 = loc("in_ptr0"(#loc)) +#loc22 = loc("in_ptr1"(#loc)) +#loc23 = loc("in_ptr2"(#loc)) +#loc24 = loc("out_ptr0"(#loc)) +#loc25 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_add_mul_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4096> : tensor<1024xi32, #blocked> loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc26) + %xoffset_0 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc27) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc28) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<1024xi32, #blocked> loc(#loc29) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<1024xi32, #blocked> loc(#loc29) + %x0 = arith.remsi %xindex_2, %cst : tensor<1024xi32, #blocked> loc(#loc30) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc31) + %tmp0_3 = tt.addptr %tmp0, %xindex_2 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc31) + %tmp0_4 = tt.load %tmp0_3 : tensor<1024x!tt.ptr, #blocked> loc(#loc32) + %tmp0_5 = arith.extf %tmp0_4 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc33) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc34) + %tmp1_6 = tt.addptr %tmp1, %x0 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc34) + %tmp1_7 = tt.load %tmp1_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc35) + %tmp1_8 = arith.extf %tmp1_7 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc36) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc37) + %tmp2_9 = tt.addptr %tmp2, %xindex_2 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc37) + %tmp2_10 = tt.load %tmp2_9 : tensor<1024x!tt.ptr, #blocked> loc(#loc38) + %tmp2_11 = arith.extf %tmp2_10 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc39) + %tmp3 = arith.mulf %tmp1_8, %tmp2_11 : tensor<1024xf32, #blocked> loc(#loc40) + %tmp4 = arith.addf %tmp0_5, %tmp3 : tensor<1024xf32, #blocked> loc(#loc41) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc18) + %1 = tt.addptr %0, %xindex_2 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc18) + %2 = arith.truncf %tmp4 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> loc(#loc19) + tt.store %1, %2 : tensor<1024x!tt.ptr, #blocked> loc(#loc19) + tt.return loc(#loc20) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":24:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:30) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:35) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:44) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:30) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:74) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:30) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:35) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:44) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":28:18) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":29:18) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:25) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:36) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:4) +#loc26 = loc("xoffset"(#loc2)) +#loc27 = loc("xoffset"(#loc3)) +#loc28 = loc("xindex"(#loc4)) +#loc29 = loc("xindex"(#loc5)) +#loc30 = loc("x0"(#loc6)) +#loc31 = loc("tmp0"(#loc7)) +#loc32 = loc("tmp0"(#loc8)) +#loc33 = loc("tmp0"(#loc9)) +#loc34 = loc("tmp1"(#loc10)) +#loc35 = loc("tmp1"(#loc11)) +#loc36 = loc("tmp1"(#loc12)) +#loc37 = loc("tmp2"(#loc13)) +#loc38 = loc("tmp2"(#loc14)) +#loc39 = loc("tmp2"(#loc15)) +#loc40 = loc("tmp3"(#loc16)) +#loc41 = loc("tmp4"(#loc17)) diff --git a/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.ttir b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..fa71f8cc16761ebc1193fc79c01a58067123235e --- /dev/null +++ b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.ttir @@ -0,0 +1,73 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":18:0) +#loc21 = loc("in_ptr0"(#loc)) +#loc22 = loc("in_ptr1"(#loc)) +#loc23 = loc("in_ptr2"(#loc)) +#loc24 = loc("out_ptr0"(#loc)) +#loc25 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_mul_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %x0 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc26) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc2) + %xoffset = tt.get_program_id x : i32 loc(#loc27) + %xoffset_0 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc28) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc29) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<1024xi32> loc(#loc30) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<1024xi32> loc(#loc30) + %x0_3 = arith.remsi %xindex_2, %x0 : tensor<1024xi32> loc(#loc26) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc31) + %tmp0_4 = tt.addptr %tmp0, %xindex_2 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc31) + %tmp0_5 = tt.load %tmp0_4 : tensor<1024x!tt.ptr> loc(#loc32) + %tmp0_6 = arith.extf %tmp0_5 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc33) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc34) + %tmp1_7 = tt.addptr %tmp1, %x0_3 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc34) + %tmp1_8 = tt.load %tmp1_7 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc35) + %tmp1_9 = arith.extf %tmp1_8 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc36) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc37) + %tmp2_10 = tt.addptr %tmp2, %xindex_2 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc37) + %tmp2_11 = tt.load %tmp2_10 : tensor<1024x!tt.ptr> loc(#loc38) + %tmp2_12 = arith.extf %tmp2_11 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc39) + %tmp3 = arith.mulf %tmp1_9, %tmp2_12 : tensor<1024xf32> loc(#loc40) + %tmp4 = arith.addf %tmp0_6, %tmp3 : tensor<1024xf32> loc(#loc41) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc18) + %1 = tt.addptr %0, %xindex_2 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc18) + %2 = arith.truncf %tmp4 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc19) + tt.store %1, %2 : tensor<1024x!tt.ptr> loc(#loc19) + tt.return loc(#loc20) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":24:19) +#loc2 = loc(unknown) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":20:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":20:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":21:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":21:23) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:30) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:35) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:44) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:30) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:74) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:30) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:35) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:44) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":28:18) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":29:18) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:25) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:36) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:4) +#loc26 = loc("x0"(#loc1)) +#loc27 = loc("xoffset"(#loc3)) +#loc28 = loc("xoffset"(#loc4)) +#loc29 = loc("xindex"(#loc5)) +#loc30 = loc("xindex"(#loc6)) +#loc31 = loc("tmp0"(#loc7)) +#loc32 = loc("tmp0"(#loc8)) +#loc33 = loc("tmp0"(#loc9)) +#loc34 = loc("tmp1"(#loc10)) +#loc35 = loc("tmp1"(#loc11)) +#loc36 = loc("tmp1"(#loc12)) +#loc37 = loc("tmp2"(#loc13)) +#loc38 = loc("tmp2"(#loc14)) +#loc39 = loc("tmp2"(#loc15)) +#loc40 = loc("tmp3"(#loc16)) +#loc41 = loc("tmp4"(#loc17)) diff --git a/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/__grp__triton_poi_fused_clone_permute_1.json b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/__grp__triton_poi_fused_clone_permute_1.json new file mode 100644 index 0000000000000000000000000000000000000000..304084d10ec2828903eadd2c674840d3469700a4 --- /dev/null +++ b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/__grp__triton_poi_fused_clone_permute_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_clone_permute_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.source", "triton_poi_fused_clone_permute_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.ttir", "triton_poi_fused_clone_permute_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.ttgir", "triton_poi_fused_clone_permute_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.llir", "triton_poi_fused_clone_permute_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.ptx", "triton_poi_fused_clone_permute_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.cubin", "triton_poi_fused_clone_permute_1.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.json"}} \ No newline at end of file diff --git a/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.cubin b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..e4fb21bd4ba1dd64279e04556079967eca7c3a06 Binary files /dev/null and b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.cubin differ diff --git a/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.json b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ff624dab70ae2155de636afc17772c34f7f2ddfa --- /dev/null +++ b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.json @@ -0,0 +1 @@ +{"hash": "b58096ccc76c88b9782fa5ae5908130e1b1cb1dbccc89f07b1572d968f5f2557", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_clone_permute_1"} \ No newline at end of file diff --git a/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.llir b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..de514b90391d11c66700ae9478eb24e360effc68 --- /dev/null +++ b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.llir @@ -0,0 +1,71 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_clone_permute_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %7 = shl i32 %6, 10, !dbg !8 + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %9 = shl nuw nsw i32 %8, 3, !dbg !9 + %10 = and i32 %9, 1016, !dbg !9 + %11 = or disjoint i32 %10, %7, !dbg !10 + %12 = sdiv i32 %11, 128, !dbg !11 + %13 = mul i32 %12, 128, !dbg !12 + %.decomposed = sub i32 %11, %13, !dbg !12 + %14 = srem i32 %12, 32, !dbg !13 + %15 = sdiv i32 %11, 4096, !dbg !14 + %16 = shl nsw i32 %15, 7, !dbg !15 + %17 = add nsw i32 %16, %.decomposed, !dbg !16 + %18 = mul nsw i32 %14, 294912, !dbg !17 + %19 = add nsw i32 %17, %18, !dbg !18 + %20 = sext i32 %19 to i64, !dbg !19 + %21 = getelementptr bfloat, ptr addrspace(1) %0, i64 %20, !dbg !19 + %22 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %21) #2, !dbg !20 + %23 = extractvalue { i32, i32, i32, i32 } %22, 0, !dbg !20 + %24 = extractvalue { i32, i32, i32, i32 } %22, 1, !dbg !20 + %25 = extractvalue { i32, i32, i32, i32 } %22, 2, !dbg !20 + %26 = extractvalue { i32, i32, i32, i32 } %22, 3, !dbg !20 + %27 = sext i32 %11 to i64, !dbg !21 + %28 = getelementptr bfloat, ptr addrspace(1) %1, i64 %27, !dbg !21 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %23, i32 %24, i32 %25, i32 %26, ptr addrspace(1) %28) #2, !dbg !22 + ret void, !dbg !23 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_clone_permute_1", linkageName: "triton_poi_fused_clone_permute_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 21, scope: !4) +!12 = !DILocation(line: 23, column: 19, scope: !4) +!13 = !DILocation(line: 24, column: 28, scope: !4) +!14 = !DILocation(line: 25, column: 19, scope: !4) +!15 = !DILocation(line: 27, column: 39, scope: !4) +!16 = !DILocation(line: 27, column: 35, scope: !4) +!17 = !DILocation(line: 27, column: 51, scope: !4) +!18 = !DILocation(line: 27, column: 44, scope: !4) +!19 = !DILocation(line: 27, column: 30, scope: !4) +!20 = !DILocation(line: 27, column: 56, scope: !4) +!21 = !DILocation(line: 28, column: 25, scope: !4) +!22 = !DILocation(line: 28, column: 36, scope: !4) +!23 = !DILocation(line: 28, column: 4, scope: !4) diff --git a/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.ptx b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..4460a694435f14748c57ff63fffe06869ac59cee --- /dev/null +++ b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.ptx @@ -0,0 +1,327 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused_clone_permute_1 // -- Begin function triton_poi_fused_clone_permute_1 + // @triton_poi_fused_clone_permute_1 +.visible .entry triton_poi_fused_clone_permute_1( + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_1_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_1_param_1, + .param .u32 triton_poi_fused_clone_permute_1_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_1_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_1_param_4 +) +.reqntid 128 +{ + .reg .b32 %r<27>; + .reg .b64 %rd<5>; + .loc 1 18 0 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:18:0 + +// %bb.0: + ld.param.b64 %rd3, [triton_poi_fused_clone_permute_1_param_0]; + ld.param.b64 %rd4, [triton_poi_fused_clone_permute_1_param_1]; +$L__tmp0: + .loc 1 20 28 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:20:28 + mov.u32 %r5, %ctaid.x; + .loc 1 20 33 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:20:33 + shl.b32 %r6, %r5, 10; + .loc 1 21 36 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:21:36 + mov.u32 %r7, %tid.x; + shl.b32 %r8, %r7, 3; + and.b32 %r9, %r8, 1016; + .loc 1 21 23 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:21:23 + or.b32 %r10, %r9, %r6; + .loc 1 24 21 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:24:21 + bfe.s32 %r11, %r5, 21, 1; + shr.u32 %r12, %r11, 25; + add.s32 %r13, %r10, %r12; + shr.s32 %r14, %r13, 7; + .loc 1 23 19 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:23:19 + and.b32 %r15, %r13, -128; + sub.s32 %r16, %r10, %r15; + .loc 1 24 28 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:24:28 + shr.u32 %r17, %r14, 27; + add.s32 %r18, %r14, %r17; + and.b32 %r19, %r18, 131040; + sub.s32 %r20, %r14, %r19; + .loc 1 25 19 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:25:19 + shr.u32 %r21, %r11, 20; + add.s32 %r22, %r10, %r21; + shr.s32 %r23, %r22, 12; + .loc 1 27 39 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:27:39 + shl.b32 %r24, %r23, 7; + .loc 1 27 35 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:27:35 + add.s32 %r25, %r24, %r16; + .loc 1 27 44 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:27:44 + mad.lo.s32 %r26, %r20, 294912, %r25; + .loc 1 27 30 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:27:30 + mad.wide.s32 %rd1, %r26, 2, %rd3; + .loc 1 27 56 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:27:56 + // begin inline asm + mov.u32 %r1, 0x0; + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + ld.global.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ]; + // end inline asm + .loc 1 28 25 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:28:25 + mad.wide.s32 %rd2, %r10, 2, %rd4; + .loc 1 28 36 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:28:36 + // begin inline asm + st.global.v4.b32 [ %rd2 + 0 ], { %r1, %r2, %r3, %r4 }; + // end inline asm + .loc 1 28 4 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:28:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 114 +.b8 52 +.b8 99 +.b8 51 +.b8 114 +.b8 108 +.b8 98 +.b8 99 +.b8 54 +.b8 51 +.b8 106 +.b8 50 +.b8 113 +.b8 121 +.b8 111 +.b8 101 +.b8 51 +.b8 108 +.b8 54 +.b8 50 +.b8 109 +.b8 118 +.b8 98 +.b8 99 +.b8 114 +.b8 109 +.b8 116 +.b8 52 +.b8 120 +.b8 53 +.b8 103 +.b8 112 +.b8 100 +.b8 110 +.b8 50 +.b8 55 +.b8 100 +.b8 118 +.b8 112 +.b8 101 +.b8 110 +.b8 108 +.b8 99 +.b8 103 +.b8 116 +.b8 109 +.b8 116 +.b8 55 +.b8 52 +.b8 107 +.b8 53 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 114 +.b8 52 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.source b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.source new file mode 100644 index 0000000000000000000000000000000000000000..8b82e37670979443529030dc38b4f14210659a70 --- /dev/null +++ b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.source @@ -0,0 +1,90 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":18:0) +#loc21 = loc("in_ptr0"(#loc)) +#loc22 = loc("out_ptr0"(#loc)) +#loc23 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_clone_permute_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 9437184 : i32 loc(#loc24) + %xoffset = tt.get_program_id x : i32 loc(#loc25) + %xoffset_1 = arith.constant 1024 : i32 loc(#loc26) + %xoffset_2 = arith.constant 1024 : i32 loc(#loc26) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc26) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc27) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc28) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc28) + %xmask = arith.constant true loc(#loc29) + %xmask_6 = arith.constant dense : tensor<1024xi1> loc(#loc29) + %x0 = arith.constant 128 : i32 loc(#loc30) + %x0_7 = arith.constant 128 : i32 loc(#loc30) + %x0_8 = arith.constant dense<128> : tensor<1024xi32> loc(#loc30) + %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<1024xi32> loc(#loc30) + %x1 = arith.constant 128 : i32 loc(#loc31) + %x1_10 = arith.constant 128 : i32 loc(#loc31) + %x1_11 = arith.constant dense<128> : tensor<1024xi32> loc(#loc31) + %x1_12 = arith.divsi %xindex_5, %x1_11 : tensor<1024xi32> loc(#loc31) + %x1_13 = arith.constant 32 : i32 loc(#loc32) + %x1_14 = arith.constant 32 : i32 loc(#loc32) + %x1_15 = arith.constant dense<32> : tensor<1024xi32> loc(#loc32) + %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<1024xi32> loc(#loc32) + %x2 = arith.constant 4096 : i32 loc(#loc33) + %x2_17 = arith.constant 4096 : i32 loc(#loc33) + %x2_18 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc33) + %x2_19 = arith.divsi %xindex_5, %x2_18 : tensor<1024xi32> loc(#loc33) + %tmp0 = arith.constant 128 : i32 loc(#loc34) + %tmp0_20 = arith.constant 128 : i32 loc(#loc34) + %tmp0_21 = arith.constant dense<128> : tensor<1024xi32> loc(#loc34) + %tmp0_22 = arith.muli %tmp0_21, %x2_19 : tensor<1024xi32> loc(#loc34) + %tmp0_23 = arith.addi %x0_9, %tmp0_22 : tensor<1024xi32> loc(#loc35) + %tmp0_24 = arith.constant 294912 : i32 loc(#loc36) + %tmp0_25 = arith.constant 294912 : i32 loc(#loc36) + %tmp0_26 = arith.constant dense<294912> : tensor<1024xi32> loc(#loc36) + %tmp0_27 = arith.muli %tmp0_26, %x1_16 : tensor<1024xi32> loc(#loc36) + %tmp0_28 = arith.addi %tmp0_23, %tmp0_27 : tensor<1024xi32> loc(#loc37) + %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc38) + %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc38) + %tmp0_31 = tt.load %tmp0_30 : tensor<1024x!tt.ptr> loc(#loc39) + %tmp0_32 = arith.extf %tmp0_31 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc40) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc18) + %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc18) + %2 = arith.truncf %tmp0_32 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc19) + tt.store %1, %2 : tensor<1024x!tt.ptr> loc(#loc19) + tt.return loc(#loc20) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":22:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":23:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":24:21) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":24:28) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":25:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:39) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:35) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:51) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:44) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:30) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:56) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:65) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:25) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:36) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:4) +#loc24 = loc("xnumel"(#loc1)) +#loc25 = loc("xoffset"(#loc2)) +#loc26 = loc("xoffset"(#loc3)) +#loc27 = loc("xindex"(#loc4)) +#loc28 = loc("xindex"(#loc5)) +#loc29 = loc("xmask"(#loc6)) +#loc30 = loc("x0"(#loc7)) +#loc31 = loc("x1"(#loc8)) +#loc32 = loc("x1"(#loc9)) +#loc33 = loc("x2"(#loc10)) +#loc34 = loc("tmp0"(#loc11)) +#loc35 = loc("tmp0"(#loc12)) +#loc36 = loc("tmp0"(#loc13)) +#loc37 = loc("tmp0"(#loc14)) +#loc38 = loc("tmp0"(#loc15)) +#loc39 = loc("tmp0"(#loc16)) +#loc40 = loc("tmp0"(#loc17)) diff --git a/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.ttgir b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..77c612c546d284b64ccc582e7d74da0c22a8602f --- /dev/null +++ b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.ttgir @@ -0,0 +1,66 @@ +#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":18:0) +#loc19 = loc("in_ptr0"(#loc)) +#loc20 = loc("out_ptr0"(#loc)) +#loc21 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_clone_permute_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1024xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<32> : tensor<1024xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<1024xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<294912> : tensor<1024xi32, #blocked> loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc22) + %xoffset_3 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc23) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc24) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32, #blocked> loc(#loc25) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32, #blocked> loc(#loc25) + %x0 = arith.remsi %xindex_5, %cst : tensor<1024xi32, #blocked> loc(#loc26) + %x1 = arith.divsi %xindex_5, %cst : tensor<1024xi32, #blocked> loc(#loc27) + %x1_6 = arith.remsi %x1, %cst_0 : tensor<1024xi32, #blocked> loc(#loc28) + %x2 = arith.divsi %xindex_5, %cst_1 : tensor<1024xi32, #blocked> loc(#loc29) + %tmp0 = arith.muli %x2, %cst : tensor<1024xi32, #blocked> loc(#loc30) + %tmp0_7 = arith.addi %x0, %tmp0 : tensor<1024xi32, #blocked> loc(#loc31) + %tmp0_8 = arith.muli %x1_6, %cst_2 : tensor<1024xi32, #blocked> loc(#loc32) + %tmp0_9 = arith.addi %tmp0_7, %tmp0_8 : tensor<1024xi32, #blocked> loc(#loc33) + %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc34) + %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc34) + %tmp0_12 = tt.load %tmp0_11 : tensor<1024x!tt.ptr, #blocked> loc(#loc35) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc16) + %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc16) + tt.store %1, %tmp0_12 : tensor<1024x!tt.ptr, #blocked> loc(#loc17) + tt.return loc(#loc18) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":23:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":24:21) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":24:28) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":25:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:39) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:51) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:44) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:56) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:25) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:36) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:4) +#loc22 = loc("xoffset"(#loc2)) +#loc23 = loc("xoffset"(#loc3)) +#loc24 = loc("xindex"(#loc4)) +#loc25 = loc("xindex"(#loc5)) +#loc26 = loc("x0"(#loc6)) +#loc27 = loc("x1"(#loc7)) +#loc28 = loc("x1"(#loc8)) +#loc29 = loc("x2"(#loc9)) +#loc30 = loc("tmp0"(#loc10)) +#loc31 = loc("tmp0"(#loc11)) +#loc32 = loc("tmp0"(#loc12)) +#loc33 = loc("tmp0"(#loc13)) +#loc34 = loc("tmp0"(#loc14)) +#loc35 = loc("tmp0"(#loc15)) diff --git a/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.ttir b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..6ae9ccdfcf824b0acaa9121b4dc52364580189f6 --- /dev/null +++ b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.ttir @@ -0,0 +1,65 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":18:0) +#loc19 = loc("in_ptr0"(#loc)) +#loc20 = loc("out_ptr0"(#loc)) +#loc21 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_clone_permute_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %tmp0 = arith.constant dense<294912> : tensor<1024xi32> loc(#loc22) + %x2 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc23) + %x1 = arith.constant dense<32> : tensor<1024xi32> loc(#loc24) + %cst = arith.constant dense<128> : tensor<1024xi32> loc(#loc4) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc4) + %xoffset = tt.get_program_id x : i32 loc(#loc25) + %xoffset_0 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc26) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc27) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<1024xi32> loc(#loc28) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<1024xi32> loc(#loc28) + %x0 = arith.remsi %xindex_2, %cst : tensor<1024xi32> loc(#loc29) + %x1_3 = arith.divsi %xindex_2, %cst : tensor<1024xi32> loc(#loc30) + %x1_4 = arith.remsi %x1_3, %x1 : tensor<1024xi32> loc(#loc24) + %x2_5 = arith.divsi %xindex_2, %x2 : tensor<1024xi32> loc(#loc23) + %tmp0_6 = arith.muli %x2_5, %cst : tensor<1024xi32> loc(#loc31) + %tmp0_7 = arith.addi %x0, %tmp0_6 : tensor<1024xi32> loc(#loc32) + %tmp0_8 = arith.muli %x1_4, %tmp0 : tensor<1024xi32> loc(#loc22) + %tmp0_9 = arith.addi %tmp0_7, %tmp0_8 : tensor<1024xi32> loc(#loc33) + %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc34) + %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc34) + %tmp0_12 = tt.load %tmp0_11 : tensor<1024x!tt.ptr> loc(#loc35) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc16) + %1 = tt.addptr %0, %xindex_2 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc16) + tt.store %1, %tmp0_12 : tensor<1024x!tt.ptr> loc(#loc17) + tt.return loc(#loc18) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:51) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":25:19) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":24:28) +#loc4 = loc(unknown) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":20:28) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":20:33) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":21:36) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":21:23) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":23:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":24:21) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:39) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:35) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:44) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:56) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:25) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:36) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:4) +#loc22 = loc("tmp0"(#loc1)) +#loc23 = loc("x2"(#loc2)) +#loc24 = loc("x1"(#loc3)) +#loc25 = loc("xoffset"(#loc5)) +#loc26 = loc("xoffset"(#loc6)) +#loc27 = loc("xindex"(#loc7)) +#loc28 = loc("xindex"(#loc8)) +#loc29 = loc("x0"(#loc9)) +#loc30 = loc("x1"(#loc10)) +#loc31 = loc("tmp0"(#loc11)) +#loc32 = loc("tmp0"(#loc12)) +#loc33 = loc("tmp0"(#loc13)) +#loc34 = loc("tmp0"(#loc14)) +#loc35 = loc("tmp0"(#loc15)) diff --git a/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/__grp__triton_poi_fused_clone_permute_1.json b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/__grp__triton_poi_fused_clone_permute_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1bc54924737d501726f01bb9fd8fdd3951d1ca27 --- /dev/null +++ b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/__grp__triton_poi_fused_clone_permute_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_clone_permute_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.source", "triton_poi_fused_clone_permute_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.ttir", "triton_poi_fused_clone_permute_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.ttgir", "triton_poi_fused_clone_permute_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.llir", "triton_poi_fused_clone_permute_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.ptx", "triton_poi_fused_clone_permute_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.cubin", "triton_poi_fused_clone_permute_1.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.json"}} \ No newline at end of file diff --git a/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.cubin b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..e0ea9623516a2806328d1fe07b2cb84ce5ae810f Binary files /dev/null and b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.cubin differ diff --git a/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.json b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bd4e606c4ff8ace4220b6b8eae55216e47f9533b --- /dev/null +++ b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.json @@ -0,0 +1 @@ +{"hash": "bfb8576bfbee2023d20438237604e2d39df47eddc6b149a5df6c548c361dfe81", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_clone_permute_1"} \ No newline at end of file diff --git a/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.llir b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..f72e59ae2b240fd1d56f3a6041af45c7aa7c26a2 --- /dev/null +++ b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.llir @@ -0,0 +1,67 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_clone_permute_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %7 = shl i32 %6, 9, !dbg !8 + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %9 = shl nuw nsw i32 %8, 1, !dbg !9 + %10 = and i32 %9, 510, !dbg !9 + %11 = or disjoint i32 %10, %7, !dbg !10 + %12 = sdiv i32 %11, 128, !dbg !11 + %13 = mul i32 %12, 128, !dbg !12 + %.decomposed = sub i32 %11, %13, !dbg !12 + %14 = srem i32 %12, 32, !dbg !13 + %15 = sdiv i32 %11, 4096, !dbg !14 + %16 = shl nsw i32 %15, 7, !dbg !15 + %17 = add nsw i32 %16, %.decomposed, !dbg !16 + %18 = mul nsw i32 %14, 294912, !dbg !17 + %19 = add nsw i32 %17, %18, !dbg !18 + %20 = sext i32 %19 to i64, !dbg !19 + %21 = getelementptr bfloat, ptr addrspace(1) %0, i64 %20, !dbg !19 + %22 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %21) #2, !dbg !20 + %23 = sext i32 %11 to i64, !dbg !21 + %24 = getelementptr bfloat, ptr addrspace(1) %1, i64 %23, !dbg !21 + tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %22, ptr addrspace(1) %24) #2, !dbg !22 + ret void, !dbg !23 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_clone_permute_1", linkageName: "triton_poi_fused_clone_permute_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 21, scope: !4) +!12 = !DILocation(line: 23, column: 19, scope: !4) +!13 = !DILocation(line: 24, column: 28, scope: !4) +!14 = !DILocation(line: 25, column: 19, scope: !4) +!15 = !DILocation(line: 27, column: 39, scope: !4) +!16 = !DILocation(line: 27, column: 35, scope: !4) +!17 = !DILocation(line: 27, column: 51, scope: !4) +!18 = !DILocation(line: 27, column: 44, scope: !4) +!19 = !DILocation(line: 27, column: 30, scope: !4) +!20 = !DILocation(line: 27, column: 56, scope: !4) +!21 = !DILocation(line: 28, column: 25, scope: !4) +!22 = !DILocation(line: 28, column: 36, scope: !4) +!23 = !DILocation(line: 28, column: 4, scope: !4) diff --git a/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.ptx b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..fbe0faf8e74f8ef93c602bdbf09045b2dd8a40ec --- /dev/null +++ b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.ptx @@ -0,0 +1,324 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused_clone_permute_1 // -- Begin function triton_poi_fused_clone_permute_1 + // @triton_poi_fused_clone_permute_1 +.visible .entry triton_poi_fused_clone_permute_1( + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_1_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_1_param_1, + .param .u32 triton_poi_fused_clone_permute_1_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_1_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_1_param_4 +) +.reqntid 256 +{ + .reg .b32 %r<24>; + .reg .b64 %rd<5>; + .loc 1 18 0 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:18:0 + +// %bb.0: + ld.param.b64 %rd3, [triton_poi_fused_clone_permute_1_param_0]; + ld.param.b64 %rd4, [triton_poi_fused_clone_permute_1_param_1]; +$L__tmp0: + .loc 1 20 28 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:20:28 + mov.u32 %r2, %ctaid.x; + .loc 1 20 33 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:20:33 + shl.b32 %r3, %r2, 9; + .loc 1 21 36 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:21:36 + mov.u32 %r4, %tid.x; + shl.b32 %r5, %r4, 1; + and.b32 %r6, %r5, 510; + .loc 1 21 23 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:21:23 + or.b32 %r7, %r6, %r3; + .loc 1 24 21 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:24:21 + bfe.s32 %r8, %r2, 22, 1; + shr.u32 %r9, %r8, 25; + add.s32 %r10, %r7, %r9; + shr.s32 %r11, %r10, 7; + .loc 1 23 19 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:23:19 + and.b32 %r12, %r10, -128; + sub.s32 %r13, %r7, %r12; + .loc 1 24 28 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:24:28 + shr.u32 %r14, %r11, 27; + add.s32 %r15, %r11, %r14; + and.b32 %r16, %r15, 131040; + sub.s32 %r17, %r11, %r16; + .loc 1 25 19 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:25:19 + shr.u32 %r18, %r8, 20; + add.s32 %r19, %r7, %r18; + shr.s32 %r20, %r19, 12; + .loc 1 27 39 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:27:39 + shl.b32 %r21, %r20, 7; + .loc 1 27 35 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:27:35 + add.s32 %r22, %r21, %r13; + .loc 1 27 44 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:27:44 + mad.lo.s32 %r23, %r17, 294912, %r22; + .loc 1 27 30 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:27:30 + mad.wide.s32 %rd1, %r23, 2, %rd3; + .loc 1 27 56 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:27:56 + // begin inline asm + mov.u32 %r1, 0x0; + ld.global.b32 { %r1 }, [ %rd1 + 0 ]; + // end inline asm + .loc 1 28 25 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:28:25 + mad.wide.s32 %rd2, %r7, 2, %rd4; + .loc 1 28 36 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:28:36 + // begin inline asm + st.global.b32 [ %rd2 + 0 ], { %r1 }; + // end inline asm + .loc 1 28 4 // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:28:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 114 +.b8 52 +.b8 99 +.b8 51 +.b8 114 +.b8 108 +.b8 98 +.b8 99 +.b8 54 +.b8 51 +.b8 106 +.b8 50 +.b8 113 +.b8 121 +.b8 111 +.b8 101 +.b8 51 +.b8 108 +.b8 54 +.b8 50 +.b8 109 +.b8 118 +.b8 98 +.b8 99 +.b8 114 +.b8 109 +.b8 116 +.b8 52 +.b8 120 +.b8 53 +.b8 103 +.b8 112 +.b8 100 +.b8 110 +.b8 50 +.b8 55 +.b8 100 +.b8 118 +.b8 112 +.b8 101 +.b8 110 +.b8 108 +.b8 99 +.b8 103 +.b8 116 +.b8 109 +.b8 116 +.b8 55 +.b8 52 +.b8 107 +.b8 53 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 114 +.b8 52 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.source b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.source new file mode 100644 index 0000000000000000000000000000000000000000..8fd19f3ef7b5aaad73a872613ef77c41c55f73cd --- /dev/null +++ b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.source @@ -0,0 +1,90 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":18:0) +#loc21 = loc("in_ptr0"(#loc)) +#loc22 = loc("out_ptr0"(#loc)) +#loc23 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_clone_permute_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 9437184 : i32 loc(#loc24) + %xoffset = tt.get_program_id x : i32 loc(#loc25) + %xoffset_1 = arith.constant 512 : i32 loc(#loc26) + %xoffset_2 = arith.constant 512 : i32 loc(#loc26) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc26) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc27) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc28) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc28) + %xmask = arith.constant true loc(#loc29) + %xmask_6 = arith.constant dense : tensor<512xi1> loc(#loc29) + %x0 = arith.constant 128 : i32 loc(#loc30) + %x0_7 = arith.constant 128 : i32 loc(#loc30) + %x0_8 = arith.constant dense<128> : tensor<512xi32> loc(#loc30) + %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<512xi32> loc(#loc30) + %x1 = arith.constant 128 : i32 loc(#loc31) + %x1_10 = arith.constant 128 : i32 loc(#loc31) + %x1_11 = arith.constant dense<128> : tensor<512xi32> loc(#loc31) + %x1_12 = arith.divsi %xindex_5, %x1_11 : tensor<512xi32> loc(#loc31) + %x1_13 = arith.constant 32 : i32 loc(#loc32) + %x1_14 = arith.constant 32 : i32 loc(#loc32) + %x1_15 = arith.constant dense<32> : tensor<512xi32> loc(#loc32) + %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<512xi32> loc(#loc32) + %x2 = arith.constant 4096 : i32 loc(#loc33) + %x2_17 = arith.constant 4096 : i32 loc(#loc33) + %x2_18 = arith.constant dense<4096> : tensor<512xi32> loc(#loc33) + %x2_19 = arith.divsi %xindex_5, %x2_18 : tensor<512xi32> loc(#loc33) + %tmp0 = arith.constant 128 : i32 loc(#loc34) + %tmp0_20 = arith.constant 128 : i32 loc(#loc34) + %tmp0_21 = arith.constant dense<128> : tensor<512xi32> loc(#loc34) + %tmp0_22 = arith.muli %tmp0_21, %x2_19 : tensor<512xi32> loc(#loc34) + %tmp0_23 = arith.addi %x0_9, %tmp0_22 : tensor<512xi32> loc(#loc35) + %tmp0_24 = arith.constant 294912 : i32 loc(#loc36) + %tmp0_25 = arith.constant 294912 : i32 loc(#loc36) + %tmp0_26 = arith.constant dense<294912> : tensor<512xi32> loc(#loc36) + %tmp0_27 = arith.muli %tmp0_26, %x1_16 : tensor<512xi32> loc(#loc36) + %tmp0_28 = arith.addi %tmp0_23, %tmp0_27 : tensor<512xi32> loc(#loc37) + %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc38) + %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc38) + %tmp0_31 = tt.load %tmp0_30 : tensor<512x!tt.ptr> loc(#loc39) + %tmp0_32 = arith.extf %tmp0_31 : tensor<512xbf16> to tensor<512xf32> loc(#loc40) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc18) + %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc18) + %2 = arith.truncf %tmp0_32 : tensor<512xf32> to tensor<512xbf16> loc(#loc19) + tt.store %1, %2 : tensor<512x!tt.ptr> loc(#loc19) + tt.return loc(#loc20) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":22:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":23:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":24:21) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":24:28) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":25:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:39) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:35) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:51) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:44) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:30) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:56) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:65) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:25) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:36) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:4) +#loc24 = loc("xnumel"(#loc1)) +#loc25 = loc("xoffset"(#loc2)) +#loc26 = loc("xoffset"(#loc3)) +#loc27 = loc("xindex"(#loc4)) +#loc28 = loc("xindex"(#loc5)) +#loc29 = loc("xmask"(#loc6)) +#loc30 = loc("x0"(#loc7)) +#loc31 = loc("x1"(#loc8)) +#loc32 = loc("x1"(#loc9)) +#loc33 = loc("x2"(#loc10)) +#loc34 = loc("tmp0"(#loc11)) +#loc35 = loc("tmp0"(#loc12)) +#loc36 = loc("tmp0"(#loc13)) +#loc37 = loc("tmp0"(#loc14)) +#loc38 = loc("tmp0"(#loc15)) +#loc39 = loc("tmp0"(#loc16)) +#loc40 = loc("tmp0"(#loc17)) diff --git a/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.ttgir b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..c5e0fe984cd01748c1fff1ba3394b9b47e4c0d7d --- /dev/null +++ b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.ttgir @@ -0,0 +1,66 @@ +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":18:0) +#loc19 = loc("in_ptr0"(#loc)) +#loc20 = loc("out_ptr0"(#loc)) +#loc21 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_clone_permute_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<512xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<32> : tensor<512xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<512xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<294912> : tensor<512xi32, #blocked> loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc22) + %xoffset_3 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc23) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc24) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32, #blocked> loc(#loc25) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32, #blocked> loc(#loc25) + %x0 = arith.remsi %xindex_5, %cst : tensor<512xi32, #blocked> loc(#loc26) + %x1 = arith.divsi %xindex_5, %cst : tensor<512xi32, #blocked> loc(#loc27) + %x1_6 = arith.remsi %x1, %cst_0 : tensor<512xi32, #blocked> loc(#loc28) + %x2 = arith.divsi %xindex_5, %cst_1 : tensor<512xi32, #blocked> loc(#loc29) + %tmp0 = arith.muli %x2, %cst : tensor<512xi32, #blocked> loc(#loc30) + %tmp0_7 = arith.addi %x0, %tmp0 : tensor<512xi32, #blocked> loc(#loc31) + %tmp0_8 = arith.muli %x1_6, %cst_2 : tensor<512xi32, #blocked> loc(#loc32) + %tmp0_9 = arith.addi %tmp0_7, %tmp0_8 : tensor<512xi32, #blocked> loc(#loc33) + %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc34) + %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc34) + %tmp0_12 = tt.load %tmp0_11 : tensor<512x!tt.ptr, #blocked> loc(#loc35) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc16) + %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc16) + tt.store %1, %tmp0_12 : tensor<512x!tt.ptr, #blocked> loc(#loc17) + tt.return loc(#loc18) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":23:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":24:21) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":24:28) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":25:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:39) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:51) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:44) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:56) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:25) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:36) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:4) +#loc22 = loc("xoffset"(#loc2)) +#loc23 = loc("xoffset"(#loc3)) +#loc24 = loc("xindex"(#loc4)) +#loc25 = loc("xindex"(#loc5)) +#loc26 = loc("x0"(#loc6)) +#loc27 = loc("x1"(#loc7)) +#loc28 = loc("x1"(#loc8)) +#loc29 = loc("x2"(#loc9)) +#loc30 = loc("tmp0"(#loc10)) +#loc31 = loc("tmp0"(#loc11)) +#loc32 = loc("tmp0"(#loc12)) +#loc33 = loc("tmp0"(#loc13)) +#loc34 = loc("tmp0"(#loc14)) +#loc35 = loc("tmp0"(#loc15)) diff --git a/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.ttir b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..00b50056d5c6bc28caa71748fc7614cb1a9852ec --- /dev/null +++ b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.ttir @@ -0,0 +1,65 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":18:0) +#loc19 = loc("in_ptr0"(#loc)) +#loc20 = loc("out_ptr0"(#loc)) +#loc21 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_clone_permute_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %tmp0 = arith.constant dense<294912> : tensor<512xi32> loc(#loc22) + %x2 = arith.constant dense<4096> : tensor<512xi32> loc(#loc23) + %x1 = arith.constant dense<32> : tensor<512xi32> loc(#loc24) + %cst = arith.constant dense<128> : tensor<512xi32> loc(#loc4) + %c512_i32 = arith.constant 512 : i32 loc(#loc4) + %xoffset = tt.get_program_id x : i32 loc(#loc25) + %xoffset_0 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc26) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc27) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<512xi32> loc(#loc28) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<512xi32> loc(#loc28) + %x0 = arith.remsi %xindex_2, %cst : tensor<512xi32> loc(#loc29) + %x1_3 = arith.divsi %xindex_2, %cst : tensor<512xi32> loc(#loc30) + %x1_4 = arith.remsi %x1_3, %x1 : tensor<512xi32> loc(#loc24) + %x2_5 = arith.divsi %xindex_2, %x2 : tensor<512xi32> loc(#loc23) + %tmp0_6 = arith.muli %x2_5, %cst : tensor<512xi32> loc(#loc31) + %tmp0_7 = arith.addi %x0, %tmp0_6 : tensor<512xi32> loc(#loc32) + %tmp0_8 = arith.muli %x1_4, %tmp0 : tensor<512xi32> loc(#loc22) + %tmp0_9 = arith.addi %tmp0_7, %tmp0_8 : tensor<512xi32> loc(#loc33) + %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc34) + %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc34) + %tmp0_12 = tt.load %tmp0_11 : tensor<512x!tt.ptr> loc(#loc35) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc16) + %1 = tt.addptr %0, %xindex_2 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc16) + tt.store %1, %tmp0_12 : tensor<512x!tt.ptr> loc(#loc17) + tt.return loc(#loc18) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:51) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":25:19) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":24:28) +#loc4 = loc(unknown) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":20:28) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":20:33) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":21:36) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":21:23) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":23:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":24:21) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:39) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:35) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:44) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:56) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:25) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:36) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:4) +#loc22 = loc("tmp0"(#loc1)) +#loc23 = loc("x2"(#loc2)) +#loc24 = loc("x1"(#loc3)) +#loc25 = loc("xoffset"(#loc5)) +#loc26 = loc("xoffset"(#loc6)) +#loc27 = loc("xindex"(#loc7)) +#loc28 = loc("xindex"(#loc8)) +#loc29 = loc("x0"(#loc9)) +#loc30 = loc("x1"(#loc10)) +#loc31 = loc("tmp0"(#loc11)) +#loc32 = loc("tmp0"(#loc12)) +#loc33 = loc("tmp0"(#loc13)) +#loc34 = loc("tmp0"(#loc14)) +#loc35 = loc("tmp0"(#loc15)) diff --git a/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/__grp__triton_poi_fused_cat_view_4.json b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/__grp__triton_poi_fused_cat_view_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c23fdadb3fe3d400c2bcb118bb2c3c7178d54586 --- /dev/null +++ b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/__grp__triton_poi_fused_cat_view_4.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_cat_view_4.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.source", "triton_poi_fused_cat_view_4.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.ttir", "triton_poi_fused_cat_view_4.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.ttgir", "triton_poi_fused_cat_view_4.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.llir", "triton_poi_fused_cat_view_4.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.ptx", "triton_poi_fused_cat_view_4.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.cubin", "triton_poi_fused_cat_view_4.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.json"}} \ No newline at end of file diff --git a/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.cubin b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.cubin new file mode 100644 index 0000000000000000000000000000000000000000..2908ced25ce7f2d66da17fab9904d83c3a104833 Binary files /dev/null and b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.cubin differ diff --git a/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.json b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ed23af6fb2dfd7a453b4a9b8a48b3333d4511692 --- /dev/null +++ b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.json @@ -0,0 +1 @@ +{"hash": "bb69f753eede91770f9498b43d000dbd5de341f220a53769748065b725226b3b", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_cat_view_4"} \ No newline at end of file diff --git a/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.llir b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.llir new file mode 100644 index 0000000000000000000000000000000000000000..7b6e85f5406c06c3962d20369415547d3b235ad5 --- /dev/null +++ b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.llir @@ -0,0 +1,119 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_cat_view_4(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = shl i32 %7, 10, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = shl nuw nsw i32 %9, 3, !dbg !9 + %11 = and i32 %10, 1016, !dbg !9 + %12 = or disjoint i32 %11, %8, !dbg !10 + %13 = sdiv i32 %12, 4096, !dbg !11 + %14 = icmp slt i32 %12, 1048576, !dbg !12 + %15 = shl i32 %13, 13, !dbg !13 + %16 = add i32 %15, %12, !dbg !13 + %17 = sext i32 %16 to i64, !dbg !14 + %18 = getelementptr bfloat, ptr addrspace(1) %0, i64 %17, !dbg !14 + %19 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %18, i1 %14) #2, !dbg !15 + %20 = extractvalue { i32, i32, i32, i32 } %19, 0, !dbg !15 + %21 = bitcast i32 %20 to <2 x bfloat>, !dbg !15 + %22 = extractvalue { i32, i32, i32, i32 } %19, 1, !dbg !15 + %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !15 + %24 = extractvalue { i32, i32, i32, i32 } %19, 2, !dbg !15 + %25 = bitcast i32 %24 to <2 x bfloat>, !dbg !15 + %26 = extractvalue { i32, i32, i32, i32 } %19, 3, !dbg !15 + %27 = bitcast i32 %26 to <2 x bfloat>, !dbg !15 + %28 = extractelement <2 x bfloat> %21, i64 0, !dbg !15 + %29 = extractelement <2 x bfloat> %21, i64 1, !dbg !15 + %30 = extractelement <2 x bfloat> %23, i64 0, !dbg !15 + %31 = extractelement <2 x bfloat> %23, i64 1, !dbg !15 + %32 = extractelement <2 x bfloat> %25, i64 0, !dbg !15 + %33 = extractelement <2 x bfloat> %25, i64 1, !dbg !15 + %34 = extractelement <2 x bfloat> %27, i64 0, !dbg !15 + %35 = extractelement <2 x bfloat> %27, i64 1, !dbg !15 + %36 = icmp sgt i32 %12, 1048575, !dbg !16 + %37 = add i32 %16, -3145728, !dbg !17 + %38 = sext i32 %37 to i64, !dbg !18 + %39 = getelementptr bfloat, ptr addrspace(1) %1, i64 %38, !dbg !18 + %40 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %39, i1 %36) #2, !dbg !19 + %41 = extractvalue { i32, i32, i32, i32 } %40, 0, !dbg !19 + %42 = bitcast i32 %41 to <2 x bfloat>, !dbg !19 + %43 = extractvalue { i32, i32, i32, i32 } %40, 1, !dbg !19 + %44 = bitcast i32 %43 to <2 x bfloat>, !dbg !19 + %45 = extractvalue { i32, i32, i32, i32 } %40, 2, !dbg !19 + %46 = bitcast i32 %45 to <2 x bfloat>, !dbg !19 + %47 = extractvalue { i32, i32, i32, i32 } %40, 3, !dbg !19 + %48 = bitcast i32 %47 to <2 x bfloat>, !dbg !19 + %49 = extractelement <2 x bfloat> %42, i64 0, !dbg !19 + %50 = extractelement <2 x bfloat> %42, i64 1, !dbg !19 + %51 = extractelement <2 x bfloat> %44, i64 0, !dbg !19 + %52 = extractelement <2 x bfloat> %44, i64 1, !dbg !19 + %53 = extractelement <2 x bfloat> %46, i64 0, !dbg !19 + %54 = extractelement <2 x bfloat> %46, i64 1, !dbg !19 + %55 = extractelement <2 x bfloat> %48, i64 0, !dbg !19 + %56 = extractelement <2 x bfloat> %48, i64 1, !dbg !19 + %.v = select i1 %14, bfloat %28, bfloat %49, !dbg !20 + %.v1 = select i1 %14, bfloat %29, bfloat %50, !dbg !20 + %.v2 = select i1 %14, bfloat %30, bfloat %51, !dbg !20 + %.v3 = select i1 %14, bfloat %31, bfloat %52, !dbg !20 + %.v4 = select i1 %14, bfloat %32, bfloat %53, !dbg !20 + %.v5 = select i1 %14, bfloat %33, bfloat %54, !dbg !20 + %.v6 = select i1 %14, bfloat %34, bfloat %55, !dbg !20 + %.v7 = select i1 %14, bfloat %35, bfloat %56, !dbg !20 + %57 = sext i32 %12 to i64, !dbg !21 + %58 = getelementptr bfloat, ptr addrspace(1) %2, i64 %57, !dbg !21 + %59 = insertelement <2 x bfloat> poison, bfloat %.v, i64 0, !dbg !22 + %60 = insertelement <2 x bfloat> %59, bfloat %.v1, i64 1, !dbg !22 + %61 = bitcast <2 x bfloat> %60 to i32, !dbg !22 + %62 = insertelement <2 x bfloat> poison, bfloat %.v2, i64 0, !dbg !22 + %63 = insertelement <2 x bfloat> %62, bfloat %.v3, i64 1, !dbg !22 + %64 = bitcast <2 x bfloat> %63 to i32, !dbg !22 + %65 = insertelement <2 x bfloat> poison, bfloat %.v4, i64 0, !dbg !22 + %66 = insertelement <2 x bfloat> %65, bfloat %.v5, i64 1, !dbg !22 + %67 = bitcast <2 x bfloat> %66 to i32, !dbg !22 + %68 = insertelement <2 x bfloat> poison, bfloat %.v6, i64 0, !dbg !22 + %69 = insertelement <2 x bfloat> %68, bfloat %.v7, i64 1, !dbg !22 + %70 = bitcast <2 x bfloat> %69 to i32, !dbg !22 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %61, i32 %64, i32 %67, i32 %70, ptr addrspace(1) %58) #2, !dbg !22 + ret void, !dbg !23 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_cat_view_4", linkageName: "triton_poi_fused_cat_view_4", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 23, column: 19, scope: !4) +!12 = !DILocation(line: 30, column: 18, scope: !4) +!13 = !DILocation(line: 31, column: 35, scope: !4) +!14 = !DILocation(line: 31, column: 30, scope: !4) +!15 = !DILocation(line: 31, column: 48, scope: !4) +!16 = !DILocation(line: 32, column: 19, scope: !4) +!17 = !DILocation(line: 35, column: 35, scope: !4) +!18 = !DILocation(line: 35, column: 30, scope: !4) +!19 = !DILocation(line: 35, column: 57, scope: !4) +!20 = !DILocation(line: 36, column: 33, scope: !4) +!21 = !DILocation(line: 37, column: 25, scope: !4) +!22 = !DILocation(line: 37, column: 37, scope: !4) +!23 = !DILocation(line: 37, column: 4, scope: !4) diff --git a/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.ptx b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.ptx new file mode 100644 index 0000000000000000000000000000000000000000..c00f6a727cc6b5d9f092c2aafd7d2b85748cd004 --- /dev/null +++ b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.ptx @@ -0,0 +1,354 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused_cat_view_4 // -- Begin function triton_poi_fused_cat_view_4 + // @triton_poi_fused_cat_view_4 +.visible .entry triton_poi_fused_cat_view_4( + .param .u64 .ptr .global .align 1 triton_poi_fused_cat_view_4_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_cat_view_4_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_cat_view_4_param_2, + .param .u32 triton_poi_fused_cat_view_4_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused_cat_view_4_param_4, + .param .u64 .ptr .global .align 1 triton_poi_fused_cat_view_4_param_5 +) +.reqntid 128 +{ + .reg .pred %p<3>; + .reg .b16 %rs<25>; + .reg .b32 %r<27>; + .reg .b64 %rd<7>; + .loc 1 18 0 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:18:0 +$L__func_begin0: + .loc 1 18 0 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:18:0 + +// %bb.0: + ld.param.b64 %rd4, [triton_poi_fused_cat_view_4_param_0]; + ld.param.b64 %rd5, [triton_poi_fused_cat_view_4_param_1]; +$L__tmp0: + .loc 1 20 28 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:20:28 + mov.u32 %r14, %ctaid.x; + .loc 1 20 33 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:20:33 + shl.b32 %r15, %r14, 10; + ld.param.b64 %rd6, [triton_poi_fused_cat_view_4_param_2]; + .loc 1 21 36 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:21:36 + mov.u32 %r16, %tid.x; + shl.b32 %r17, %r16, 3; + and.b32 %r18, %r17, 1016; + .loc 1 21 23 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:21:23 + or.b32 %r19, %r18, %r15; + .loc 1 23 19 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:23:19 + bfe.s32 %r20, %r14, 21, 1; + shr.u32 %r21, %r20, 20; + add.s32 %r22, %r19, %r21; + .loc 1 30 18 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:30:18 + setp.lt.s32 %p1, %r19, 1048576; + .loc 1 31 35 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:31:35 + shl.b32 %r23, %r22, 1; + and.b32 %r24, %r23, -8192; + add.s32 %r25, %r24, %r19; + .loc 1 31 30 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:31:30 + mad.wide.s32 %rd1, %r25, 2, %rd4; + mov.b32 %r5, 0; + .loc 1 31 48 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:31:48 + // begin inline asm + mov.u32 %r1, %r5; + mov.u32 %r2, %r5; + mov.u32 %r3, %r5; + mov.u32 %r4, %r5; + @%p1 ld.global.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ]; + // end inline asm + mov.b32 {%rs1, %rs2}, %r1; + mov.b32 {%rs3, %rs4}, %r2; + mov.b32 {%rs5, %rs6}, %r3; + mov.b32 {%rs7, %rs8}, %r4; + .loc 1 32 19 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:32:19 + setp.gt.s32 %p2, %r19, 1048575; + .loc 1 35 35 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:35:35 + add.s32 %r26, %r25, -3145728; + .loc 1 35 30 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:35:30 + mad.wide.s32 %rd2, %r26, 2, %rd5; + .loc 1 35 57 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:35:57 + // begin inline asm + mov.u32 %r6, %r5; + mov.u32 %r7, %r5; + mov.u32 %r8, %r5; + mov.u32 %r9, %r5; + @%p2 ld.global.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd2 + 0 ]; + // end inline asm + mov.b32 {%rs9, %rs10}, %r6; + mov.b32 {%rs11, %rs12}, %r7; + mov.b32 {%rs13, %rs14}, %r8; + mov.b32 {%rs15, %rs16}, %r9; + .loc 1 36 33 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:36:33 + selp.b16 %rs17, %rs1, %rs9, %p1; + selp.b16 %rs18, %rs2, %rs10, %p1; + selp.b16 %rs19, %rs3, %rs11, %p1; + selp.b16 %rs20, %rs4, %rs12, %p1; + selp.b16 %rs21, %rs5, %rs13, %p1; + selp.b16 %rs22, %rs6, %rs14, %p1; + selp.b16 %rs23, %rs7, %rs15, %p1; + selp.b16 %rs24, %rs8, %rs16, %p1; + .loc 1 37 25 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:37:25 + mad.wide.s32 %rd3, %r19, 2, %rd6; + .loc 1 37 37 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:37:37 + mov.b32 %r10, {%rs17, %rs18}; + mov.b32 %r11, {%rs19, %rs20}; + mov.b32 %r12, {%rs21, %rs22}; + mov.b32 %r13, {%rs23, %rs24}; + // begin inline asm + st.global.v4.b32 [ %rd3 + 0 ], { %r10, %r11, %r12, %r13 }; + // end inline asm + .loc 1 37 4 // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:37:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 108 +.b8 112 +.b8 102 +.b8 52 +.b8 108 +.b8 111 +.b8 111 +.b8 104 +.b8 102 +.b8 115 +.b8 103 +.b8 119 +.b8 113 +.b8 104 +.b8 50 +.b8 103 +.b8 105 +.b8 50 +.b8 120 +.b8 111 +.b8 118 +.b8 111 +.b8 100 +.b8 112 +.b8 109 +.b8 55 +.b8 104 +.b8 122 +.b8 118 +.b8 53 +.b8 117 +.b8 50 +.b8 114 +.b8 118 +.b8 110 +.b8 103 +.b8 98 +.b8 55 +.b8 99 +.b8 104 +.b8 106 +.b8 103 +.b8 121 +.b8 119 +.b8 120 +.b8 53 +.b8 53 +.b8 103 +.b8 116 +.b8 117 +.b8 100 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 108 +.b8 112 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.source b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.source new file mode 100644 index 0000000000000000000000000000000000000000..2fb5cfd9779313c8bed0c64788bf0ef349e6fcee --- /dev/null +++ b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.source @@ -0,0 +1,136 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":18:0) +#loc31 = loc("in_ptr0"(#loc)) +#loc32 = loc("in_ptr1"(#loc)) +#loc33 = loc("out_ptr0"(#loc)) +#loc34 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_cat_view_4(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 9437184 : i32 loc(#loc35) + %xoffset = tt.get_program_id x : i32 loc(#loc36) + %xoffset_1 = arith.constant 1024 : i32 loc(#loc37) + %xoffset_2 = arith.constant 1024 : i32 loc(#loc37) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc37) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc38) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc39) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc39) + %xmask = arith.constant true loc(#loc40) + %xmask_6 = arith.constant dense : tensor<1024xi1> loc(#loc40) + %x1 = arith.constant 4096 : i32 loc(#loc41) + %x1_7 = arith.constant 4096 : i32 loc(#loc41) + %x1_8 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc41) + %x1_9 = arith.divsi %xindex_5, %x1_8 : tensor<1024xi32> loc(#loc41) + %x0 = arith.constant 4096 : i32 loc(#loc42) + %x0_10 = arith.constant 4096 : i32 loc(#loc42) + %x0_11 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc42) + %x0_12 = arith.remsi %xindex_5, %x0_11 : tensor<1024xi32> loc(#loc42) + %tmp1 = arith.constant 0 : i64 loc(#loc43) + %tmp1_13 = arith.constant dense<0> : tensor<1xi64> loc(#loc43) + %tmp2 = arith.extsi %x1_9 : tensor<1024xi32> to tensor<1024xi64> loc(#loc44) + %tmp2_14 = arith.constant dense<0> : tensor<1024xi64> loc(#loc44) + %tmp2_15 = arith.cmpi sge, %tmp2, %tmp2_14 : tensor<1024xi64> loc(#loc44) + %tmp3 = arith.constant 256 : i64 loc(#loc45) + %tmp3_16 = arith.constant dense<256> : tensor<1xi64> loc(#loc45) + %tmp4 = arith.extsi %x1_9 : tensor<1024xi32> to tensor<1024xi64> loc(#loc46) + %tmp4_17 = arith.constant dense<256> : tensor<1024xi64> loc(#loc46) + %tmp4_18 = arith.cmpi slt, %tmp4, %tmp4_17 : tensor<1024xi64> loc(#loc46) + %tmp5 = arith.constant 12288 : i32 loc(#loc47) + %tmp5_19 = arith.constant 12288 : i32 loc(#loc47) + %tmp5_20 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc47) + %tmp5_21 = arith.muli %tmp5_20, %x1_9 : tensor<1024xi32> loc(#loc47) + %tmp5_22 = arith.addi %x0_12, %tmp5_21 : tensor<1024xi32> loc(#loc48) + %tmp5_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc49) + %tmp5_24 = tt.addptr %tmp5_23, %tmp5_22 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc49) + %tmp5_25 = arith.constant 0.000000e+00 : f32 loc(#loc50) + %tmp5_26 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc50) + %tmp5_27 = arith.truncf %tmp5_26 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc50) + %tmp5_28 = tt.load %tmp5_24, %tmp4_18, %tmp5_27 : tensor<1024x!tt.ptr> loc(#loc50) + %tmp5_29 = arith.extf %tmp5_28 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc51) + %tmp6 = arith.extsi %x1_9 : tensor<1024xi32> to tensor<1024xi64> loc(#loc52) + %tmp6_30 = arith.constant dense<256> : tensor<1024xi64> loc(#loc52) + %tmp6_31 = arith.cmpi sge, %tmp6, %tmp6_30 : tensor<1024xi64> loc(#loc52) + %tmp7 = arith.constant 2304 : i64 loc(#loc53) + %tmp7_32 = arith.constant dense<2304> : tensor<1xi64> loc(#loc53) + %tmp8 = arith.extsi %x1_9 : tensor<1024xi32> to tensor<1024xi64> loc(#loc54) + %tmp8_33 = arith.constant dense<2304> : tensor<1024xi64> loc(#loc54) + %tmp8_34 = arith.cmpi slt, %tmp8, %tmp8_33 : tensor<1024xi64> loc(#loc54) + %tmp9 = arith.constant -256 : i32 loc(#loc55) + %tmp9_35 = arith.constant -256 : i32 loc(#loc55) + %tmp9_36 = arith.constant dense<-256> : tensor<1024xi32> loc(#loc55) + %tmp9_37 = arith.addi %tmp9_36, %x1_9 : tensor<1024xi32> loc(#loc55) + %tmp9_38 = arith.constant 12288 : i32 loc(#loc56) + %tmp9_39 = arith.constant 12288 : i32 loc(#loc56) + %tmp9_40 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc56) + %tmp9_41 = arith.muli %tmp9_40, %tmp9_37 : tensor<1024xi32> loc(#loc56) + %tmp9_42 = arith.addi %x0_12, %tmp9_41 : tensor<1024xi32> loc(#loc57) + %tmp9_43 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc58) + %tmp9_44 = tt.addptr %tmp9_43, %tmp9_42 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc58) + %tmp9_45 = arith.constant 0.000000e+00 : f32 loc(#loc59) + %tmp9_46 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc59) + %tmp9_47 = arith.truncf %tmp9_46 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc59) + %tmp9_48 = tt.load %tmp9_44, %tmp6_31, %tmp9_47 : tensor<1024x!tt.ptr> loc(#loc59) + %tmp9_49 = arith.extf %tmp9_48 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc60) + %tmp10 = arith.select %tmp4_18, %tmp5_29, %tmp9_49 : tensor<1024xi1>, tensor<1024xf32> loc(#loc61) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc28) + %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc28) + %2 = arith.truncf %tmp10 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc29) + tt.store %1, %2 : tensor<1024x!tt.ptr> loc(#loc29) + tt.return loc(#loc30) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":22:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":23:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":24:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":27:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":28:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":29:29) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":30:18) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:42) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:35) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:30) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:48) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:68) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":32:19) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":33:30) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":34:18) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:51) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:42) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:35) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:30) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:57) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:77) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":36:33) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:25) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:37) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:4) +#loc35 = loc("xnumel"(#loc1)) +#loc36 = loc("xoffset"(#loc2)) +#loc37 = loc("xoffset"(#loc3)) +#loc38 = loc("xindex"(#loc4)) +#loc39 = loc("xindex"(#loc5)) +#loc40 = loc("xmask"(#loc6)) +#loc41 = loc("x1"(#loc7)) +#loc42 = loc("x0"(#loc8)) +#loc43 = loc("tmp1"(#loc9)) +#loc44 = loc("tmp2"(#loc10)) +#loc45 = loc("tmp3"(#loc11)) +#loc46 = loc("tmp4"(#loc12)) +#loc47 = loc("tmp5"(#loc13)) +#loc48 = loc("tmp5"(#loc14)) +#loc49 = loc("tmp5"(#loc15)) +#loc50 = loc("tmp5"(#loc16)) +#loc51 = loc("tmp5"(#loc17)) +#loc52 = loc("tmp6"(#loc18)) +#loc53 = loc("tmp7"(#loc19)) +#loc54 = loc("tmp8"(#loc20)) +#loc55 = loc("tmp9"(#loc21)) +#loc56 = loc("tmp9"(#loc22)) +#loc57 = loc("tmp9"(#loc23)) +#loc58 = loc("tmp9"(#loc24)) +#loc59 = loc("tmp9"(#loc25)) +#loc60 = loc("tmp9"(#loc26)) +#loc61 = loc("tmp10"(#loc27)) diff --git a/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.ttgir b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..100ff20cefc58e82bde6e7d20523caec17d3427b --- /dev/null +++ b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.ttgir @@ -0,0 +1,89 @@ +#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":18:0) +#loc25 = loc("in_ptr0"(#loc)) +#loc26 = loc("in_ptr1"(#loc)) +#loc27 = loc("out_ptr0"(#loc)) +#loc28 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_cat_view_4(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4096> : tensor<1024xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<256> : tensor<1024xi64, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<12288> : tensor<1024xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<-256> : tensor<1024xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<1024xbf16, #blocked> loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc29) + %xoffset_4 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc30) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc31) + %xindex_5 = tt.splat %xoffset_4 : i32 -> tensor<1024xi32, #blocked> loc(#loc32) + %xindex_6 = arith.addi %xindex_5, %xindex : tensor<1024xi32, #blocked> loc(#loc32) + %x1 = arith.divsi %xindex_6, %cst : tensor<1024xi32, #blocked> loc(#loc33) + %x0 = arith.remsi %xindex_6, %cst : tensor<1024xi32, #blocked> loc(#loc34) + %tmp4 = arith.extsi %x1 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked> loc(#loc35) + %tmp4_7 = arith.cmpi slt, %tmp4, %cst_0 : tensor<1024xi64, #blocked> loc(#loc35) + %tmp5 = arith.muli %x1, %cst_1 : tensor<1024xi32, #blocked> loc(#loc36) + %tmp5_8 = arith.addi %x0, %tmp5 : tensor<1024xi32, #blocked> loc(#loc37) + %tmp5_9 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc38) + %tmp5_10 = tt.addptr %tmp5_9, %tmp5_8 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc38) + %tmp5_11 = tt.load %tmp5_10, %tmp4_7, %cst_3 : tensor<1024x!tt.ptr, #blocked> loc(#loc39) + %tmp5_12 = arith.extf %tmp5_11 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc40) + %tmp6 = arith.cmpi sge, %tmp4, %cst_0 : tensor<1024xi64, #blocked> loc(#loc41) + %tmp9 = arith.addi %x1, %cst_2 : tensor<1024xi32, #blocked> loc(#loc42) + %tmp9_13 = arith.muli %tmp9, %cst_1 : tensor<1024xi32, #blocked> loc(#loc43) + %tmp9_14 = arith.addi %x0, %tmp9_13 : tensor<1024xi32, #blocked> loc(#loc44) + %tmp9_15 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc45) + %tmp9_16 = tt.addptr %tmp9_15, %tmp9_14 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc45) + %tmp9_17 = tt.load %tmp9_16, %tmp6, %cst_3 : tensor<1024x!tt.ptr, #blocked> loc(#loc46) + %tmp9_18 = arith.extf %tmp9_17 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc47) + %tmp10 = arith.select %tmp4_7, %tmp5_12, %tmp9_18 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked> loc(#loc48) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc22) + %1 = tt.addptr %0, %xindex_6 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc22) + %2 = arith.truncf %tmp10 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> loc(#loc23) + tt.store %1, %2 : tensor<1024x!tt.ptr, #blocked> loc(#loc23) + tt.return loc(#loc24) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":23:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":24:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":30:18) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:42) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:35) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:30) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:48) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:68) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":32:19) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:51) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:42) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:35) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:30) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:57) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:77) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":36:33) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:25) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:37) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:4) +#loc29 = loc("xoffset"(#loc2)) +#loc30 = loc("xoffset"(#loc3)) +#loc31 = loc("xindex"(#loc4)) +#loc32 = loc("xindex"(#loc5)) +#loc33 = loc("x1"(#loc6)) +#loc34 = loc("x0"(#loc7)) +#loc35 = loc("tmp4"(#loc8)) +#loc36 = loc("tmp5"(#loc9)) +#loc37 = loc("tmp5"(#loc10)) +#loc38 = loc("tmp5"(#loc11)) +#loc39 = loc("tmp5"(#loc12)) +#loc40 = loc("tmp5"(#loc13)) +#loc41 = loc("tmp6"(#loc14)) +#loc42 = loc("tmp9"(#loc15)) +#loc43 = loc("tmp9"(#loc16)) +#loc44 = loc("tmp9"(#loc17)) +#loc45 = loc("tmp9"(#loc18)) +#loc46 = loc("tmp9"(#loc19)) +#loc47 = loc("tmp9"(#loc20)) +#loc48 = loc("tmp10"(#loc21)) diff --git a/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.ttir b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.ttir new file mode 100644 index 0000000000000000000000000000000000000000..511d76bc0ef4e2fd5a4e58f2fe591d6653997ca7 --- /dev/null +++ b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.ttir @@ -0,0 +1,88 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":18:0) +#loc25 = loc("in_ptr0"(#loc)) +#loc26 = loc("in_ptr1"(#loc)) +#loc27 = loc("out_ptr0"(#loc)) +#loc28 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_cat_view_4(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<1024xbf16> loc(#loc1) + %tmp9 = arith.constant dense<-256> : tensor<1024xi32> loc(#loc29) + %cst_0 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc1) + %cst_1 = arith.constant dense<256> : tensor<1024xi64> loc(#loc1) + %cst_2 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc30) + %xoffset_3 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc31) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc32) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc33) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc33) + %x1 = arith.divsi %xindex_5, %cst_2 : tensor<1024xi32> loc(#loc34) + %x0 = arith.remsi %xindex_5, %cst_2 : tensor<1024xi32> loc(#loc35) + %tmp4 = arith.extsi %x1 : tensor<1024xi32> to tensor<1024xi64> loc(#loc36) + %tmp4_6 = arith.cmpi slt, %tmp4, %cst_1 : tensor<1024xi64> loc(#loc36) + %tmp5 = arith.muli %x1, %cst_0 : tensor<1024xi32> loc(#loc37) + %tmp5_7 = arith.addi %x0, %tmp5 : tensor<1024xi32> loc(#loc38) + %tmp5_8 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc39) + %tmp5_9 = tt.addptr %tmp5_8, %tmp5_7 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc39) + %tmp5_10 = tt.load %tmp5_9, %tmp4_6, %cst : tensor<1024x!tt.ptr> loc(#loc40) + %tmp5_11 = arith.extf %tmp5_10 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc41) + %tmp6 = arith.cmpi sge, %tmp4, %cst_1 : tensor<1024xi64> loc(#loc42) + %tmp9_12 = arith.addi %x1, %tmp9 : tensor<1024xi32> loc(#loc29) + %tmp9_13 = arith.muli %tmp9_12, %cst_0 : tensor<1024xi32> loc(#loc43) + %tmp9_14 = arith.addi %x0, %tmp9_13 : tensor<1024xi32> loc(#loc44) + %tmp9_15 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc45) + %tmp9_16 = tt.addptr %tmp9_15, %tmp9_14 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc45) + %tmp9_17 = tt.load %tmp9_16, %tmp6, %cst : tensor<1024x!tt.ptr> loc(#loc46) + %tmp9_18 = arith.extf %tmp9_17 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc47) + %tmp10 = arith.select %tmp4_6, %tmp5_11, %tmp9_18 : tensor<1024xi1>, tensor<1024xf32> loc(#loc48) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc22) + %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc22) + %2 = arith.truncf %tmp10 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc23) + tt.store %1, %2 : tensor<1024x!tt.ptr> loc(#loc23) + tt.return loc(#loc24) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:51) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":20:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":20:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":21:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":21:23) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":23:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":24:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":30:18) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:42) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:30) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:48) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:68) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":32:19) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:42) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:35) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:30) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:57) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:77) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":36:33) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:25) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:37) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:4) +#loc29 = loc("tmp9"(#loc2)) +#loc30 = loc("xoffset"(#loc3)) +#loc31 = loc("xoffset"(#loc4)) +#loc32 = loc("xindex"(#loc5)) +#loc33 = loc("xindex"(#loc6)) +#loc34 = loc("x1"(#loc7)) +#loc35 = loc("x0"(#loc8)) +#loc36 = loc("tmp4"(#loc9)) +#loc37 = loc("tmp5"(#loc10)) +#loc38 = loc("tmp5"(#loc11)) +#loc39 = loc("tmp5"(#loc12)) +#loc40 = loc("tmp5"(#loc13)) +#loc41 = loc("tmp5"(#loc14)) +#loc42 = loc("tmp6"(#loc15)) +#loc43 = loc("tmp9"(#loc16)) +#loc44 = loc("tmp9"(#loc17)) +#loc45 = loc("tmp9"(#loc18)) +#loc46 = loc("tmp9"(#loc19)) +#loc47 = loc("tmp9"(#loc20)) +#loc48 = loc("tmp10"(#loc21)) diff --git a/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/__grp__triton_poi_fused_add_mul_0.json b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/__grp__triton_poi_fused_add_mul_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c8826882dc0439d11122b9daf69d9e95446804a6 --- /dev/null +++ b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/__grp__triton_poi_fused_add_mul_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_add_mul_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.source", "triton_poi_fused_add_mul_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.ttir", "triton_poi_fused_add_mul_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.ttgir", "triton_poi_fused_add_mul_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.llir", "triton_poi_fused_add_mul_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.ptx", "triton_poi_fused_add_mul_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.cubin", "triton_poi_fused_add_mul_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.json"}} \ No newline at end of file diff --git a/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.cubin b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..a9ae9b5a27246b4efc6e824f67772a5b40c12fe6 Binary files /dev/null and b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.cubin differ diff --git a/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.json b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5f1ab544c1f04c148f96f7d84db10e643858f6e5 --- /dev/null +++ b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.json @@ -0,0 +1 @@ +{"hash": "bbe70e424e252b7b1c0af0b2ecd3da03e9e9fb20155d2cbdcdcf8f1405431e6f", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_add_mul_0"} \ No newline at end of file diff --git a/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.llir b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..4860bccd1529052ec12391a5aad030f864c00594 --- /dev/null +++ b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.llir @@ -0,0 +1,76 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_add_mul_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = shl i32 %8, 9, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = shl nuw nsw i32 %10, 1, !dbg !9 + %12 = and i32 %11, 510, !dbg !9 + %13 = or disjoint i32 %12, %9, !dbg !10 + %14 = srem i32 %13, 4096, !dbg !11 + %15 = sext i32 %13 to i64, !dbg !12 + %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !12 + %17 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %16) #2, !dbg !13 + %18 = bitcast i32 %17 to <2 x bfloat>, !dbg !13 + %19 = sext i32 %14 to i64, !dbg !14 + %20 = getelementptr bfloat, ptr addrspace(1) %1, i64 %19, !dbg !14 + %21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !15 + %22 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l"(ptr addrspace(1) %20, i64 %21) #2, !dbg !15 + %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !15 + %24 = getelementptr bfloat, ptr addrspace(1) %2, i64 %15, !dbg !16 + %25 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %24) #2, !dbg !17 + %26 = bitcast i32 %25 to <2 x bfloat>, !dbg !17 + %27 = getelementptr bfloat, ptr addrspace(1) %3, i64 %15, !dbg !18 + %28 = fpext <2 x bfloat> %18 to <2 x float>, !dbg !19 + %29 = fpext <2 x bfloat> %23 to <2 x float>, !dbg !20 + %30 = fpext <2 x bfloat> %26 to <2 x float>, !dbg !21 + %31 = fmul <2 x float> %29, %30, !dbg !22 + %32 = fadd <2 x float> %31, %28, !dbg !23 + %33 = fptrunc <2 x float> %32 to <2 x bfloat>, !dbg !24 + %34 = bitcast <2 x bfloat> %33 to i32, !dbg !24 + tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %34, ptr addrspace(1) %27) #2, !dbg !24 + ret void, !dbg !25 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_add_mul_0", linkageName: "triton_poi_fused_add_mul_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 19, scope: !4) +!12 = !DILocation(line: 25, column: 30, scope: !4) +!13 = !DILocation(line: 25, column: 35, scope: !4) +!14 = !DILocation(line: 26, column: 30, scope: !4) +!15 = !DILocation(line: 26, column: 35, scope: !4) +!16 = !DILocation(line: 27, column: 30, scope: !4) +!17 = !DILocation(line: 27, column: 35, scope: !4) +!18 = !DILocation(line: 30, column: 25, scope: !4) +!19 = !DILocation(line: 25, column: 44, scope: !4) +!20 = !DILocation(line: 26, column: 74, scope: !4) +!21 = !DILocation(line: 27, column: 44, scope: !4) +!22 = !DILocation(line: 28, column: 18, scope: !4) +!23 = !DILocation(line: 29, column: 18, scope: !4) +!24 = !DILocation(line: 30, column: 36, scope: !4) +!25 = !DILocation(line: 30, column: 4, scope: !4) diff --git a/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.ptx b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..377378ae7ef2d5c43af9668539bd36bcd47de934 --- /dev/null +++ b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.ptx @@ -0,0 +1,347 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused_add_mul_0 // -- Begin function triton_poi_fused_add_mul_0 + // @triton_poi_fused_add_mul_0 +.visible .entry triton_poi_fused_add_mul_0( + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_3, + .param .u32 triton_poi_fused_add_mul_0_param_4, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_5, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_6 +) +.reqntid 256 +{ + .reg .b16 %rs<7>; + .reg .b32 %r<24>; + .reg .b64 %rd<11>; + .loc 1 18 0 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:18:0 + +// %bb.0: + ld.param.b64 %rd6, [triton_poi_fused_add_mul_0_param_0]; + ld.param.b64 %rd7, [triton_poi_fused_add_mul_0_param_1]; +$L__tmp0: + .loc 1 20 28 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:20:28 + mov.u32 %r5, %ctaid.x; + .loc 1 20 33 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:20:33 + shl.b32 %r6, %r5, 9; + ld.param.b64 %rd8, [triton_poi_fused_add_mul_0_param_2]; + ld.param.b64 %rd9, [triton_poi_fused_add_mul_0_param_3]; + .loc 1 21 36 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:21:36 + mov.u32 %r7, %tid.x; + shl.b32 %r8, %r7, 1; + and.b32 %r9, %r8, 510; + .loc 1 21 23 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:21:23 + or.b32 %r10, %r9, %r6; + .loc 1 24 19 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:24:19 + bfe.s32 %r11, %r5, 22, 1; + shr.u32 %r12, %r11, 20; + add.s32 %r13, %r10, %r12; + and.b32 %r14, %r13, -4096; + sub.s32 %r15, %r10, %r14; + .loc 1 25 30 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:25:30 + mul.wide.s32 %rd10, %r10, 2; + add.s64 %rd1, %rd6, %rd10; + .loc 1 25 35 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:25:35 + // begin inline asm + mov.u32 %r1, 0x0; + ld.global.b32 { %r1 }, [ %rd1 + 0 ]; + // end inline asm + .loc 1 26 30 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:26:30 + mad.wide.s32 %rd2, %r15, 2, %rd7; + .loc 1 26 35 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:26:35 + // begin inline asm + mov.u64 %rd3, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd3, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r2, 0x0; + ld.global.L1::evict_last.L2::cache_hint.b32 { %r2 }, [ %rd2 + 0 ], %rd3; + // end inline asm + .loc 1 27 30 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:27:30 + add.s64 %rd4, %rd8, %rd10; + .loc 1 27 35 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:27:35 + // begin inline asm + mov.u32 %r3, 0x0; + ld.global.b32 { %r3 }, [ %rd4 + 0 ]; + // end inline asm + .loc 1 30 25 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:30:25 + add.s64 %rd5, %rd9, %rd10; + .loc 1 25 44 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:25:44 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r16, %rs2; + cvt.f32.bf16 %r17, %rs1; + .loc 1 26 74 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:26:74 + mov.b32 {%rs3, %rs4}, %r2; + cvt.f32.bf16 %r18, %rs4; + cvt.f32.bf16 %r19, %rs3; + .loc 1 27 44 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:27:44 + mov.b32 {%rs5, %rs6}, %r3; + cvt.f32.bf16 %r20, %rs6; + cvt.f32.bf16 %r21, %rs5; + .loc 1 29 18 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:29:18 + fma.rn.f32 %r22, %r19, %r21, %r17; + fma.rn.f32 %r23, %r18, %r20, %r16; + .loc 1 30 36 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:30:36 + cvt.rn.bf16x2.f32 %r4, %r23, %r22; + // begin inline asm + st.global.b32 [ %rd5 + 0 ], { %r4 }; + // end inline asm + .loc 1 30 4 // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:30:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 224 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 120 +.b8 106 +.b8 52 +.b8 112 +.b8 53 +.b8 51 +.b8 104 +.b8 111 +.b8 116 +.b8 118 +.b8 119 +.b8 51 +.b8 51 +.b8 54 +.b8 119 +.b8 52 +.b8 106 +.b8 54 +.b8 106 +.b8 54 +.b8 110 +.b8 108 +.b8 121 +.b8 100 +.b8 119 +.b8 120 +.b8 122 +.b8 114 +.b8 115 +.b8 52 +.b8 104 +.b8 104 +.b8 107 +.b8 106 +.b8 52 +.b8 50 +.b8 104 +.b8 111 +.b8 102 +.b8 108 +.b8 111 +.b8 116 +.b8 50 +.b8 110 +.b8 115 +.b8 122 +.b8 113 +.b8 122 +.b8 113 +.b8 51 +.b8 117 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 120 +.b8 106 +.b8 0 + } + .section .debug_macinfo { } diff --git a/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.source b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.source new file mode 100644 index 0000000000000000000000000000000000000000..e852b1ad9f9b3d0f898a7172708dc2db1a0de9ac --- /dev/null +++ b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.source @@ -0,0 +1,82 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":18:0) +#loc22 = loc("in_ptr0"(#loc)) +#loc23 = loc("in_ptr1"(#loc)) +#loc24 = loc("in_ptr2"(#loc)) +#loc25 = loc("out_ptr0"(#loc)) +#loc26 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_mul_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 9437184 : i32 loc(#loc27) + %xoffset = tt.get_program_id x : i32 loc(#loc28) + %xoffset_1 = arith.constant 512 : i32 loc(#loc29) + %xoffset_2 = arith.constant 512 : i32 loc(#loc29) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc29) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc30) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc31) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc31) + %xmask = arith.constant true loc(#loc32) + %xmask_6 = arith.constant dense : tensor<512xi1> loc(#loc32) + %x0 = arith.constant 4096 : i32 loc(#loc33) + %x0_7 = arith.constant 4096 : i32 loc(#loc33) + %x0_8 = arith.constant dense<4096> : tensor<512xi32> loc(#loc33) + %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<512xi32> loc(#loc33) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc34) + %tmp0_10 = tt.addptr %tmp0, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc34) + %tmp0_11 = tt.load %tmp0_10 : tensor<512x!tt.ptr> loc(#loc35) + %tmp0_12 = arith.extf %tmp0_11 : tensor<512xbf16> to tensor<512xf32> loc(#loc36) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc37) + %tmp1_13 = tt.addptr %tmp1, %x0_9 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc37) + %tmp1_14 = tt.load %tmp1_13 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc38) + %tmp1_15 = arith.extf %tmp1_14 : tensor<512xbf16> to tensor<512xf32> loc(#loc39) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc40) + %tmp2_16 = tt.addptr %tmp2, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc40) + %tmp2_17 = tt.load %tmp2_16 : tensor<512x!tt.ptr> loc(#loc41) + %tmp2_18 = arith.extf %tmp2_17 : tensor<512xbf16> to tensor<512xf32> loc(#loc42) + %tmp3 = arith.mulf %tmp1_15, %tmp2_18 : tensor<512xf32> loc(#loc43) + %tmp4 = arith.addf %tmp0_12, %tmp3 : tensor<512xf32> loc(#loc44) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc19) + %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc19) + %2 = arith.truncf %tmp4 : tensor<512xf32> to tensor<512xbf16> loc(#loc20) + tt.store %1, %2 : tensor<512x!tt.ptr> loc(#loc20) + tt.return loc(#loc21) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":22:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":24:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:30) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:35) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:44) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:30) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:35) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:74) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:30) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:35) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:44) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":28:18) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":29:18) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:25) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:36) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:4) +#loc27 = loc("xnumel"(#loc1)) +#loc28 = loc("xoffset"(#loc2)) +#loc29 = loc("xoffset"(#loc3)) +#loc30 = loc("xindex"(#loc4)) +#loc31 = loc("xindex"(#loc5)) +#loc32 = loc("xmask"(#loc6)) +#loc33 = loc("x0"(#loc7)) +#loc34 = loc("tmp0"(#loc8)) +#loc35 = loc("tmp0"(#loc9)) +#loc36 = loc("tmp0"(#loc10)) +#loc37 = loc("tmp1"(#loc11)) +#loc38 = loc("tmp1"(#loc12)) +#loc39 = loc("tmp1"(#loc13)) +#loc40 = loc("tmp2"(#loc14)) +#loc41 = loc("tmp2"(#loc15)) +#loc42 = loc("tmp2"(#loc16)) +#loc43 = loc("tmp3"(#loc17)) +#loc44 = loc("tmp4"(#loc18)) diff --git a/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.ttgir b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..3aefc0741e1a21f89c015f7b92831a9bb9619c24 --- /dev/null +++ b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.ttgir @@ -0,0 +1,74 @@ +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":18:0) +#loc21 = loc("in_ptr0"(#loc)) +#loc22 = loc("in_ptr1"(#loc)) +#loc23 = loc("in_ptr2"(#loc)) +#loc24 = loc("out_ptr0"(#loc)) +#loc25 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_add_mul_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4096> : tensor<512xi32, #blocked> loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc26) + %xoffset_0 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc27) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc28) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<512xi32, #blocked> loc(#loc29) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<512xi32, #blocked> loc(#loc29) + %x0 = arith.remsi %xindex_2, %cst : tensor<512xi32, #blocked> loc(#loc30) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc31) + %tmp0_3 = tt.addptr %tmp0, %xindex_2 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc31) + %tmp0_4 = tt.load %tmp0_3 : tensor<512x!tt.ptr, #blocked> loc(#loc32) + %tmp0_5 = arith.extf %tmp0_4 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc33) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc34) + %tmp1_6 = tt.addptr %tmp1, %x0 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc34) + %tmp1_7 = tt.load %tmp1_6 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc35) + %tmp1_8 = arith.extf %tmp1_7 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc36) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc37) + %tmp2_9 = tt.addptr %tmp2, %xindex_2 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc37) + %tmp2_10 = tt.load %tmp2_9 : tensor<512x!tt.ptr, #blocked> loc(#loc38) + %tmp2_11 = arith.extf %tmp2_10 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc39) + %tmp3 = arith.mulf %tmp1_8, %tmp2_11 : tensor<512xf32, #blocked> loc(#loc40) + %tmp4 = arith.addf %tmp0_5, %tmp3 : tensor<512xf32, #blocked> loc(#loc41) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc18) + %1 = tt.addptr %0, %xindex_2 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc18) + %2 = arith.truncf %tmp4 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> loc(#loc19) + tt.store %1, %2 : tensor<512x!tt.ptr, #blocked> loc(#loc19) + tt.return loc(#loc20) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":24:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:30) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:35) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:44) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:30) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:74) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:30) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:35) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:44) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":28:18) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":29:18) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:25) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:36) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:4) +#loc26 = loc("xoffset"(#loc2)) +#loc27 = loc("xoffset"(#loc3)) +#loc28 = loc("xindex"(#loc4)) +#loc29 = loc("xindex"(#loc5)) +#loc30 = loc("x0"(#loc6)) +#loc31 = loc("tmp0"(#loc7)) +#loc32 = loc("tmp0"(#loc8)) +#loc33 = loc("tmp0"(#loc9)) +#loc34 = loc("tmp1"(#loc10)) +#loc35 = loc("tmp1"(#loc11)) +#loc36 = loc("tmp1"(#loc12)) +#loc37 = loc("tmp2"(#loc13)) +#loc38 = loc("tmp2"(#loc14)) +#loc39 = loc("tmp2"(#loc15)) +#loc40 = loc("tmp3"(#loc16)) +#loc41 = loc("tmp4"(#loc17)) diff --git a/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.ttir b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..ea1a5ed49ca40edb75f2e0618a05ff023f5b2322 --- /dev/null +++ b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.ttir @@ -0,0 +1,73 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":18:0) +#loc21 = loc("in_ptr0"(#loc)) +#loc22 = loc("in_ptr1"(#loc)) +#loc23 = loc("in_ptr2"(#loc)) +#loc24 = loc("out_ptr0"(#loc)) +#loc25 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_mul_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %x0 = arith.constant dense<4096> : tensor<512xi32> loc(#loc26) + %c512_i32 = arith.constant 512 : i32 loc(#loc2) + %xoffset = tt.get_program_id x : i32 loc(#loc27) + %xoffset_0 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc28) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc29) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<512xi32> loc(#loc30) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<512xi32> loc(#loc30) + %x0_3 = arith.remsi %xindex_2, %x0 : tensor<512xi32> loc(#loc26) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc31) + %tmp0_4 = tt.addptr %tmp0, %xindex_2 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc31) + %tmp0_5 = tt.load %tmp0_4 : tensor<512x!tt.ptr> loc(#loc32) + %tmp0_6 = arith.extf %tmp0_5 : tensor<512xbf16> to tensor<512xf32> loc(#loc33) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc34) + %tmp1_7 = tt.addptr %tmp1, %x0_3 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc34) + %tmp1_8 = tt.load %tmp1_7 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc35) + %tmp1_9 = arith.extf %tmp1_8 : tensor<512xbf16> to tensor<512xf32> loc(#loc36) + %tmp2 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc37) + %tmp2_10 = tt.addptr %tmp2, %xindex_2 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc37) + %tmp2_11 = tt.load %tmp2_10 : tensor<512x!tt.ptr> loc(#loc38) + %tmp2_12 = arith.extf %tmp2_11 : tensor<512xbf16> to tensor<512xf32> loc(#loc39) + %tmp3 = arith.mulf %tmp1_9, %tmp2_12 : tensor<512xf32> loc(#loc40) + %tmp4 = arith.addf %tmp0_6, %tmp3 : tensor<512xf32> loc(#loc41) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc18) + %1 = tt.addptr %0, %xindex_2 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc18) + %2 = arith.truncf %tmp4 : tensor<512xf32> to tensor<512xbf16> loc(#loc19) + tt.store %1, %2 : tensor<512x!tt.ptr> loc(#loc19) + tt.return loc(#loc20) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":24:19) +#loc2 = loc(unknown) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":20:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":20:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":21:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":21:23) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:30) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:35) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:44) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:30) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:74) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:30) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:35) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:44) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":28:18) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":29:18) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:25) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:36) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:4) +#loc26 = loc("x0"(#loc1)) +#loc27 = loc("xoffset"(#loc3)) +#loc28 = loc("xoffset"(#loc4)) +#loc29 = loc("xindex"(#loc5)) +#loc30 = loc("xindex"(#loc6)) +#loc31 = loc("tmp0"(#loc7)) +#loc32 = loc("tmp0"(#loc8)) +#loc33 = loc("tmp0"(#loc9)) +#loc34 = loc("tmp1"(#loc10)) +#loc35 = loc("tmp1"(#loc11)) +#loc36 = loc("tmp1"(#loc12)) +#loc37 = loc("tmp2"(#loc13)) +#loc38 = loc("tmp2"(#loc14)) +#loc39 = loc("tmp2"(#loc15)) +#loc40 = loc("tmp3"(#loc16)) +#loc41 = loc("tmp4"(#loc17)) diff --git a/triton/XU5DT2AO5BD5AEHEYGLPP5LRDFHHCUEJT4LGDVLB4STXUGVGHFPA/cuda_utils.cpython-312-x86_64-linux-gnu.so b/triton/XU5DT2AO5BD5AEHEYGLPP5LRDFHHCUEJT4LGDVLB4STXUGVGHFPA/cuda_utils.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..344c2b22830c5f156cc59b0e20ab9b830ae50702 Binary files /dev/null and b/triton/XU5DT2AO5BD5AEHEYGLPP5LRDFHHCUEJT4LGDVLB4STXUGVGHFPA/cuda_utils.cpython-312-x86_64-linux-gnu.so differ diff --git a/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/__grp__triton_poi_fused_mul_silu_split_0.json b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/__grp__triton_poi_fused_mul_silu_split_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0337de1b10392975c7443b05a10d95409f20e9f7 --- /dev/null +++ b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/__grp__triton_poi_fused_mul_silu_split_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_mul_silu_split_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.source", "triton_poi_fused_mul_silu_split_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.ttir", "triton_poi_fused_mul_silu_split_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.ttgir", "triton_poi_fused_mul_silu_split_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.llir", "triton_poi_fused_mul_silu_split_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.ptx", "triton_poi_fused_mul_silu_split_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.cubin", "triton_poi_fused_mul_silu_split_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.json"}} \ No newline at end of file diff --git a/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.cubin b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..c0c97225d45c76ada78d2df90ec7162cd0aba037 Binary files /dev/null and b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.cubin differ diff --git a/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.json b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b9701a1a7f1da59c13c48181716cc878e0c24145 --- /dev/null +++ b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.json @@ -0,0 +1 @@ +{"hash": "c6f3131bd32cb47a6710d13823be1ffa9aa7e102a269832edc2d37a1ea3b751d", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_mul_silu_split_0"} \ No newline at end of file diff --git a/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.llir b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..340f5c2a2f3c09dbf34df6f738929c582a15fd58 --- /dev/null +++ b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.llir @@ -0,0 +1,102 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_mul_silu_split_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %7 = shl i32 %6, 9, !dbg !8 + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %9 = shl nuw nsw i32 %8, 1, !dbg !9 + %10 = and i32 %9, 510, !dbg !9 + %11 = or disjoint i32 %10, %7, !dbg !10 + %12 = srem i32 %11, 12288, !dbg !11 + %13 = sub nsw i32 %11, %12, !dbg !11 + %14 = add i32 %13, %11, !dbg !11 + %15 = sext i32 %14 to i64, !dbg !12 + %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !12 + %17 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %16) #3, !dbg !13 + %18 = bitcast i32 %17 to <2 x bfloat>, !dbg !13 + %19 = add i32 %14, 12288, !dbg !14 + %20 = sext i32 %19 to i64, !dbg !15 + %21 = getelementptr bfloat, ptr addrspace(1) %0, i64 %20, !dbg !15 + %22 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %21) #3, !dbg !16 + %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !16 + %24 = sext i32 %11 to i64, !dbg !17 + %25 = getelementptr bfloat, ptr addrspace(1) %1, i64 %24, !dbg !17 + %26 = fpext <2 x bfloat> %18 to <2 x float>, !dbg !18 + %27 = fpext <2 x bfloat> %23 to <2 x float>, !dbg !19 + %28 = extractelement <2 x float> %26, i64 0, !dbg !20 + %29 = fsub float 0.000000e+00, %28, !dbg !20 + %30 = extractelement <2 x float> %26, i64 1, !dbg !20 + %31 = fsub float 0.000000e+00, %30, !dbg !20 + %32 = fmul float %29, 0x3FF7154760000000, !dbg !25 + %33 = tail call float @llvm.nvvm.ex2.approx.f(float %32), !dbg !25 + %34 = fmul float %31, 0x3FF7154760000000, !dbg !25 + %35 = tail call float @llvm.nvvm.ex2.approx.f(float %34), !dbg !25 + %36 = fadd float %33, 1.000000e+00, !dbg !26 + %37 = fadd float %35, 1.000000e+00, !dbg !26 + %38 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %36), !dbg !27 + %39 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %37), !dbg !27 + %40 = insertelement <2 x float> poison, float %38, i64 0, !dbg !28 + %41 = insertelement <2 x float> %40, float %39, i64 1, !dbg !28 + %42 = fmul <2 x float> %41, %26, !dbg !28 + %43 = fmul <2 x float> %42, %27, !dbg !29 + %44 = fptrunc <2 x float> %43 to <2 x bfloat>, !dbg !30 + %45 = bitcast <2 x bfloat> %44 to i32, !dbg !30 + tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %45, ptr addrspace(1) %25) #3, !dbg !30 + ret void, !dbg !31 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.f(float) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #3 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_mul_silu_split_0", linkageName: "triton_poi_fused_mul_silu_split_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 35, scope: !4) +!12 = !DILocation(line: 26, column: 30, scope: !4) +!13 = !DILocation(line: 26, column: 46, scope: !4) +!14 = !DILocation(line: 27, column: 43, scope: !4) +!15 = !DILocation(line: 27, column: 30, scope: !4) +!16 = !DILocation(line: 27, column: 54, scope: !4) +!17 = !DILocation(line: 33, column: 25, scope: !4) +!18 = !DILocation(line: 26, column: 55, scope: !4) +!19 = !DILocation(line: 27, column: 63, scope: !4) +!20 = !DILocation(line: 50, column: 30, scope: !21, inlinedAt: !23) +!21 = distinct !DILexicalBlockFile(scope: !4, file: !22, discriminator: 0) +!22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!23 = !DILocation(line: 29, column: 22, scope: !24) +!24 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0) +!25 = !DILocation(line: 50, column: 29, scope: !21, inlinedAt: !23) +!26 = !DILocation(line: 50, column: 20, scope: !21, inlinedAt: !23) +!27 = !DILocation(line: 50, column: 16, scope: !21, inlinedAt: !23) +!28 = !DILocation(line: 30, column: 18, scope: !4) +!29 = !DILocation(line: 32, column: 18, scope: !4) +!30 = !DILocation(line: 33, column: 36, scope: !4) +!31 = !DILocation(line: 33, column: 4, scope: !4) diff --git a/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.ptx b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..ce38fb817fb5836290e537af53a923fd6aab8b0f --- /dev/null +++ b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.ptx @@ -0,0 +1,437 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_poi_fused_mul_silu_split_0 // -- Begin function triton_poi_fused_mul_silu_split_0 + // @triton_poi_fused_mul_silu_split_0 +.visible .entry triton_poi_fused_mul_silu_split_0( + .param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_1, + .param .u32 triton_poi_fused_mul_silu_split_0_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_4 +) +.reqntid 256 +{ + .reg .b16 %rs<5>; + .reg .b32 %r<36>; + .reg .b64 %rd<6>; + .loc 1 18 0 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:18:0 + +// %bb.0: + ld.param.b64 %rd4, [triton_poi_fused_mul_silu_split_0_param_0]; + ld.param.b64 %rd5, [triton_poi_fused_mul_silu_split_0_param_1]; +$L__tmp0: + .loc 1 20 28 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:20:28 + mov.u32 %r4, %ctaid.x; + .loc 1 20 33 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:20:33 + shl.b32 %r5, %r4, 9; + .loc 1 21 36 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:21:36 + mov.u32 %r6, %tid.x; + shl.b32 %r7, %r6, 1; + and.b32 %r8, %r7, 510; + .loc 1 21 23 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:21:23 + or.b32 %r9, %r8, %r5; + .loc 1 26 35 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:26:35 + mul.hi.s32 %r10, %r9, 715827883; + shr.u32 %r11, %r10, 31; + shr.u32 %r12, %r10, 11; + add.s32 %r13, %r12, %r11; + mad.lo.s32 %r14, %r13, 12288, %r9; + .loc 1 26 30 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:26:30 + mad.wide.s32 %rd1, %r14, 2, %rd4; + .loc 1 26 46 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:26:46 + // begin inline asm + mov.u32 %r1, 0x0; + ld.global.b32 { %r1 }, [ %rd1 + 0 ]; + // end inline asm + .loc 1 27 43 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:27:43 + add.s32 %r15, %r14, 12288; + .loc 1 27 30 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:27:30 + mad.wide.s32 %rd2, %r15, 2, %rd4; + .loc 1 27 54 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:27:54 + // begin inline asm + mov.u32 %r2, 0x0; + ld.global.b32 { %r2 }, [ %rd2 + 0 ]; + // end inline asm + .loc 1 33 25 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:33:25 + mad.wide.s32 %rd3, %r9, 2, %rd5; + .loc 1 26 55 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:26:55 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r16, %rs2; + cvt.f32.bf16 %r17, %rs1; + .loc 1 27 63 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:27:63 + mov.b32 {%rs3, %rs4}, %r2; + cvt.f32.bf16 %r18, %rs4; + cvt.f32.bf16 %r19, %rs3; + mov.b32 %r20, 0f00000000; +$L__tmp1: + .loc 2 50 30 // standard.py:50:30 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ] + sub.f32 %r21, %r20, %r17; + sub.f32 %r22, %r20, %r16; + .loc 2 50 29 // standard.py:50:29 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ] + mul.f32 %r23, %r21, 0f3FB8AA3B; + ex2.approx.f32 %r24, %r23; + mul.f32 %r25, %r22, 0f3FB8AA3B; + ex2.approx.f32 %r26, %r25; + .loc 2 50 20 // standard.py:50:20 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ] + add.f32 %r27, %r24, 0f3F800000; + add.f32 %r28, %r26, 0f3F800000; + mov.b32 %r29, 0f3F800000; + .loc 2 50 16 // standard.py:50:16 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ] + div.full.f32 %r30, %r29, %r27; + div.full.f32 %r31, %r29, %r28; +$L__tmp2: + .loc 1 30 18 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:30:18 + mul.f32 %r32, %r31, %r16; + mul.f32 %r33, %r30, %r17; + .loc 1 32 18 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:32:18 + mul.f32 %r34, %r33, %r19; + mul.f32 %r35, %r32, %r18; + .loc 1 33 36 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:33:36 + cvt.rn.bf16x2.f32 %r3, %r35, %r34; + // begin inline asm + st.global.b32 [ %rd3 + 0 ], { %r3 }; + // end inline asm + .loc 1 33 4 // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:33:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 307 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x12c DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 54 +.b8 119 +.b8 54 +.b8 115 +.b8 103 +.b8 52 +.b8 118 +.b8 51 +.b8 98 +.b8 99 +.b8 105 +.b8 103 +.b8 104 +.b8 119 +.b8 111 +.b8 107 +.b8 122 +.b8 113 +.b8 54 +.b8 105 +.b8 52 +.b8 51 +.b8 116 +.b8 108 +.b8 53 +.b8 120 +.b8 107 +.b8 53 +.b8 118 +.b8 122 +.b8 55 +.b8 122 +.b8 101 +.b8 118 +.b8 117 +.b8 107 +.b8 55 +.b8 106 +.b8 104 +.b8 118 +.b8 108 +.b8 113 +.b8 121 +.b8 114 +.b8 121 +.b8 121 +.b8 117 +.b8 104 +.b8 117 +.b8 101 +.b8 111 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 54 +.b8 119 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x24 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 111 +.b8 105 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 115 +.b8 105 +.b8 108 +.b8 117 +.b8 95 +.b8 115 +.b8 112 +.b8 108 +.b8 105 +.b8 116 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x108:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x11d:0x18 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 29 // DW_AT_call_line +.b8 22 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.source b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.source new file mode 100644 index 0000000000000000000000000000000000000000..6103d581e7ff1cb7c9a2381f8ec66e8a755f808a --- /dev/null +++ b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.source @@ -0,0 +1,129 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":18:0) +#loc26 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":49:0) +#loc33 = loc("in_ptr0"(#loc)) +#loc34 = loc("out_ptr0"(#loc)) +#loc35 = loc("xnumel"(#loc)) +#loc58 = loc("x"(#loc26)) +module { + tt.func public @triton_poi_fused_mul_silu_split_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 25165824 : i32 loc(#loc36) + %xoffset = tt.get_program_id x : i32 loc(#loc37) + %xoffset_1 = arith.constant 512 : i32 loc(#loc38) + %xoffset_2 = arith.constant 512 : i32 loc(#loc38) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc38) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc39) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc40) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc40) + %xmask = arith.constant true loc(#loc41) + %xmask_6 = arith.constant dense : tensor<512xi1> loc(#loc41) + %x0 = arith.constant 12288 : i32 loc(#loc42) + %x0_7 = arith.constant 12288 : i32 loc(#loc42) + %x0_8 = arith.constant dense<12288> : tensor<512xi32> loc(#loc42) + %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<512xi32> loc(#loc42) + %x1 = arith.constant 12288 : i32 loc(#loc43) + %x1_10 = arith.constant 12288 : i32 loc(#loc43) + %x1_11 = arith.constant dense<12288> : tensor<512xi32> loc(#loc43) + %x1_12 = arith.divsi %xindex_5, %x1_11 : tensor<512xi32> loc(#loc43) + %tmp0 = arith.constant 24576 : i32 loc(#loc44) + %tmp0_13 = arith.constant 24576 : i32 loc(#loc44) + %tmp0_14 = arith.constant dense<24576> : tensor<512xi32> loc(#loc44) + %tmp0_15 = arith.muli %tmp0_14, %x1_12 : tensor<512xi32> loc(#loc44) + %tmp0_16 = arith.addi %x0_9, %tmp0_15 : tensor<512xi32> loc(#loc45) + %tmp0_17 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc46) + %tmp0_18 = tt.addptr %tmp0_17, %tmp0_16 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc46) + %tmp0_19 = tt.load %tmp0_18 : tensor<512x!tt.ptr> loc(#loc47) + %tmp0_20 = arith.extf %tmp0_19 : tensor<512xbf16> to tensor<512xf32> loc(#loc48) + %tmp5 = arith.constant 12288 : i32 loc(#loc49) + %tmp5_21 = arith.constant 12288 : i32 loc(#loc49) + %tmp5_22 = arith.constant dense<12288> : tensor<512xi32> loc(#loc49) + %tmp5_23 = arith.addi %tmp5_22, %x0_9 : tensor<512xi32> loc(#loc49) + %tmp5_24 = arith.constant 24576 : i32 loc(#loc50) + %tmp5_25 = arith.constant 24576 : i32 loc(#loc50) + %tmp5_26 = arith.constant dense<24576> : tensor<512xi32> loc(#loc50) + %tmp5_27 = arith.muli %tmp5_26, %x1_12 : tensor<512xi32> loc(#loc50) + %tmp5_28 = arith.addi %tmp5_23, %tmp5_27 : tensor<512xi32> loc(#loc51) + %tmp5_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc52) + %tmp5_30 = tt.addptr %tmp5_29, %tmp5_28 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc52) + %tmp5_31 = tt.load %tmp5_30 : tensor<512x!tt.ptr> loc(#loc53) + %tmp5_32 = arith.extf %tmp5_31 : tensor<512xbf16> to tensor<512xf32> loc(#loc54) + %tmp2 = tt.call @triton.language.standard.sigmoid__fp32S512S__(%tmp0_20) : (tensor<512xf32>) -> tensor<512xf32> loc(#loc55) + %tmp3 = arith.mulf %tmp0_20, %tmp2 : tensor<512xf32> loc(#loc56) + %tmp6 = arith.mulf %tmp3, %tmp5_32 : tensor<512xf32> loc(#loc57) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc23) + %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc23) + %2 = arith.truncf %tmp6 : tensor<512xf32> to tensor<512xbf16> loc(#loc24) + tt.store %1, %2 : tensor<512x!tt.ptr> loc(#loc24) + tt.return loc(#loc25) + } loc(#loc) + tt.func private @triton.language.standard.sigmoid__fp32S512S__(%x: tensor<512xf32> loc("x"(#loc26))) -> tensor<512xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc27) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc27) + %0 = arith.subf %cst_0, %x : tensor<512xf32> loc(#loc27) + %1 = math.exp %0 : tensor<512xf32> loc(#loc28) + %c1_i32 = arith.constant 1 : i32 loc(#loc29) + %cst_1 = arith.constant 1.000000e+00 : f32 loc(#loc29) + %cst_2 = arith.constant dense<1.000000e+00> : tensor<512xf32> loc(#loc29) + %2 = arith.addf %cst_2, %1 : tensor<512xf32> loc(#loc29) + %c1_i32_3 = arith.constant 1 : i32 loc(#loc30) + %cst_4 = arith.constant 1.000000e+00 : f32 loc(#loc30) + %cst_5 = arith.constant dense<1.000000e+00> : tensor<512xf32> loc(#loc30) + %3 = arith.divf %cst_5, %2 : tensor<512xf32> loc(#loc30) + tt.return %3 : tensor<512xf32> loc(#loc31) + ^bb1: // no predecessors + %4 = ub.poison : tensor<512xf32> loc(#loc32) + tt.return %4 : tensor<512xf32> loc(#loc32) + } loc(#loc26) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":22:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":23:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":24:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:41) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:35) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:30) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:46) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:55) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:38) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:49) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:43) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:30) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:54) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:63) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":29:22) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":30:18) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":32:18) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:25) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:36) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:4) +#loc27 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30) +#loc28 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29) +#loc29 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20) +#loc30 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16) +#loc31 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:11) +#loc32 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:4) +#loc36 = loc("xnumel"(#loc1)) +#loc37 = loc("xoffset"(#loc2)) +#loc38 = loc("xoffset"(#loc3)) +#loc39 = loc("xindex"(#loc4)) +#loc40 = loc("xindex"(#loc5)) +#loc41 = loc("xmask"(#loc6)) +#loc42 = loc("x0"(#loc7)) +#loc43 = loc("x1"(#loc8)) +#loc44 = loc("tmp0"(#loc9)) +#loc45 = loc("tmp0"(#loc10)) +#loc46 = loc("tmp0"(#loc11)) +#loc47 = loc("tmp0"(#loc12)) +#loc48 = loc("tmp0"(#loc13)) +#loc49 = loc("tmp5"(#loc14)) +#loc50 = loc("tmp5"(#loc15)) +#loc51 = loc("tmp5"(#loc16)) +#loc52 = loc("tmp5"(#loc17)) +#loc53 = loc("tmp5"(#loc18)) +#loc54 = loc("tmp5"(#loc19)) +#loc55 = loc("tmp2"(#loc20)) +#loc56 = loc("tmp3"(#loc21)) +#loc57 = loc("tmp6"(#loc22)) diff --git a/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.ttgir b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..8e0600ab4273e4a88e1c06e6a65318b4a7965298 --- /dev/null +++ b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.ttgir @@ -0,0 +1,93 @@ +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":18:0) +#loc28 = loc("in_ptr0"(#loc)) +#loc29 = loc("out_ptr0"(#loc)) +#loc30 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_mul_silu_split_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<24576> : tensor<512xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<12288> : tensor<512xi32, #blocked> loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<1.000000e+00> : tensor<512xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc31) + %xoffset_3 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc32) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc33) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32, #blocked> loc(#loc34) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32, #blocked> loc(#loc34) + %x0 = arith.remsi %xindex_5, %cst_0 : tensor<512xi32, #blocked> loc(#loc35) + %x1 = arith.divsi %xindex_5, %cst_0 : tensor<512xi32, #blocked> loc(#loc36) + %tmp0 = arith.muli %x1, %cst : tensor<512xi32, #blocked> loc(#loc37) + %tmp0_6 = arith.addi %x0, %tmp0 : tensor<512xi32, #blocked> loc(#loc38) + %tmp0_7 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc39) + %tmp0_8 = tt.addptr %tmp0_7, %tmp0_6 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc39) + %tmp0_9 = tt.load %tmp0_8 : tensor<512x!tt.ptr, #blocked> loc(#loc40) + %tmp0_10 = arith.extf %tmp0_9 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc41) + %tmp5 = arith.addi %x0, %cst_0 : tensor<512xi32, #blocked> loc(#loc42) + %tmp5_11 = arith.addi %tmp5, %tmp0 : tensor<512xi32, #blocked> loc(#loc43) + %tmp5_12 = tt.addptr %tmp0_7, %tmp5_11 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc44) + %tmp5_13 = tt.load %tmp5_12 : tensor<512x!tt.ptr, #blocked> loc(#loc45) + %tmp5_14 = arith.extf %tmp5_13 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc46) + %tmp2 = arith.subf %cst_1, %tmp0_10 : tensor<512xf32, #blocked> loc(#loc50) + %tmp2_15 = math.exp %tmp2 : tensor<512xf32, #blocked> loc(#loc51) + %tmp2_16 = arith.addf %tmp2_15, %cst_2 : tensor<512xf32, #blocked> loc(#loc52) + %tmp2_17 = arith.divf %cst_2, %tmp2_16 : tensor<512xf32, #blocked> loc(#loc53) + %tmp3 = arith.mulf %tmp0_10, %tmp2_17 : tensor<512xf32, #blocked> loc(#loc48) + %tmp6 = arith.mulf %tmp3, %tmp5_14 : tensor<512xf32, #blocked> loc(#loc49) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc25) + %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc25) + %2 = arith.truncf %tmp6 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> loc(#loc26) + tt.store %1, %2 : tensor<512x!tt.ptr, #blocked> loc(#loc26) + tt.return loc(#loc27) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":20:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":20:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":21:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":21:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":23:19) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":24:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:41) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:35) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:30) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:46) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:55) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:38) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:30) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:54) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:63) +#loc18 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":29:22) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29) +#loc21 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":30:18) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":32:18) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:25) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:36) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:4) +#loc31 = loc("xoffset"(#loc2)) +#loc32 = loc("xoffset"(#loc3)) +#loc33 = loc("xindex"(#loc4)) +#loc34 = loc("xindex"(#loc5)) +#loc35 = loc("x0"(#loc6)) +#loc36 = loc("x1"(#loc7)) +#loc37 = loc("tmp0"(#loc8)) +#loc38 = loc("tmp0"(#loc9)) +#loc39 = loc("tmp0"(#loc10)) +#loc40 = loc("tmp0"(#loc11)) +#loc41 = loc("tmp0"(#loc12)) +#loc42 = loc("tmp5"(#loc13)) +#loc43 = loc("tmp5"(#loc14)) +#loc44 = loc("tmp5"(#loc15)) +#loc45 = loc("tmp5"(#loc16)) +#loc46 = loc("tmp5"(#loc17)) +#loc47 = loc("tmp2"(#loc19)) +#loc48 = loc("tmp3"(#loc23)) +#loc49 = loc("tmp6"(#loc24)) +#loc50 = loc(callsite(#loc18 at #loc47)) +#loc51 = loc(callsite(#loc20 at #loc47)) +#loc52 = loc(callsite(#loc21 at #loc47)) +#loc53 = loc(callsite(#loc22 at #loc47)) diff --git a/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.ttir b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..afe4785041347dbb9318e32fa3fc56ae1cc88555 --- /dev/null +++ b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.ttir @@ -0,0 +1,93 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":18:0) +#loc28 = loc("in_ptr0"(#loc)) +#loc29 = loc("out_ptr0"(#loc)) +#loc30 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_mul_silu_split_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %tmp2 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc50) + %tmp2_0 = arith.constant dense<1.000000e+00> : tensor<512xf32> loc(#loc51) + %cst = arith.constant dense<24576> : tensor<512xi32> loc(#loc3) + %cst_1 = arith.constant dense<12288> : tensor<512xi32> loc(#loc3) + %c512_i32 = arith.constant 512 : i32 loc(#loc3) + %xoffset = tt.get_program_id x : i32 loc(#loc32) + %xoffset_2 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc33) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc34) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<512xi32> loc(#loc35) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<512xi32> loc(#loc35) + %x0 = arith.remsi %xindex_4, %cst_1 : tensor<512xi32> loc(#loc36) + %x1 = arith.divsi %xindex_4, %cst_1 : tensor<512xi32> loc(#loc37) + %tmp0 = arith.muli %x1, %cst : tensor<512xi32> loc(#loc38) + %tmp0_5 = arith.addi %x0, %tmp0 : tensor<512xi32> loc(#loc39) + %tmp0_6 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc40) + %tmp0_7 = tt.addptr %tmp0_6, %tmp0_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc40) + %tmp0_8 = tt.load %tmp0_7 : tensor<512x!tt.ptr> loc(#loc41) + %tmp0_9 = arith.extf %tmp0_8 : tensor<512xbf16> to tensor<512xf32> loc(#loc42) + %tmp5 = arith.addi %x0, %cst_1 : tensor<512xi32> loc(#loc43) + %tmp5_10 = arith.addi %tmp5, %tmp0 : tensor<512xi32> loc(#loc44) + %tmp5_11 = tt.addptr %tmp0_6, %tmp5_10 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc45) + %tmp5_12 = tt.load %tmp5_11 : tensor<512x!tt.ptr> loc(#loc46) + %tmp5_13 = arith.extf %tmp5_12 : tensor<512xbf16> to tensor<512xf32> loc(#loc47) + %tmp2_14 = arith.subf %tmp2, %tmp0_9 : tensor<512xf32> loc(#loc50) + %tmp2_15 = math.exp %tmp2_14 : tensor<512xf32> loc(#loc52) + %tmp2_16 = arith.addf %tmp2_15, %tmp2_0 : tensor<512xf32> loc(#loc53) + %tmp2_17 = arith.divf %tmp2_0, %tmp2_16 : tensor<512xf32> loc(#loc54) + %tmp3 = arith.mulf %tmp0_9, %tmp2_17 : tensor<512xf32> loc(#loc48) + %tmp6 = arith.mulf %tmp3, %tmp5_13 : tensor<512xf32> loc(#loc49) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc25) + %1 = tt.addptr %0, %xindex_4 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc25) + %2 = arith.truncf %tmp6 : tensor<512xf32> to tensor<512xbf16> loc(#loc26) + tt.store %1, %2 : tensor<512x!tt.ptr> loc(#loc26) + tt.return loc(#loc27) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":29:22) +#loc3 = loc(unknown) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":20:28) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":20:33) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":21:36) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":21:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":23:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":24:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:41) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:35) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:30) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:46) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:55) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:38) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:43) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:30) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:54) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:63) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29) +#loc21 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":30:18) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":32:18) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:25) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:36) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:4) +#loc31 = loc("tmp2"(#loc2)) +#loc32 = loc("xoffset"(#loc4)) +#loc33 = loc("xoffset"(#loc5)) +#loc34 = loc("xindex"(#loc6)) +#loc35 = loc("xindex"(#loc7)) +#loc36 = loc("x0"(#loc8)) +#loc37 = loc("x1"(#loc9)) +#loc38 = loc("tmp0"(#loc10)) +#loc39 = loc("tmp0"(#loc11)) +#loc40 = loc("tmp0"(#loc12)) +#loc41 = loc("tmp0"(#loc13)) +#loc42 = loc("tmp0"(#loc14)) +#loc43 = loc("tmp5"(#loc15)) +#loc44 = loc("tmp5"(#loc16)) +#loc45 = loc("tmp5"(#loc17)) +#loc46 = loc("tmp5"(#loc18)) +#loc47 = loc("tmp5"(#loc19)) +#loc48 = loc("tmp3"(#loc23)) +#loc49 = loc("tmp6"(#loc24)) +#loc50 = loc(callsite(#loc1 at #loc31)) +#loc51 = loc(callsite(#loc3 at #loc31)) +#loc52 = loc(callsite(#loc20 at #loc31)) +#loc53 = loc(callsite(#loc21 at #loc31)) +#loc54 = loc(callsite(#loc22 at #loc31)) diff --git a/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/__grp__triton_red_fused__fused_rms_norm_view_1.json b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/__grp__triton_red_fused__fused_rms_norm_view_1.json new file mode 100644 index 0000000000000000000000000000000000000000..08b02f890088594cfc35b1134a496ca1b1895c58 --- /dev/null +++ b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/__grp__triton_red_fused__fused_rms_norm_view_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__fused_rms_norm_view_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.source", "triton_red_fused__fused_rms_norm_view_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.ttir", "triton_red_fused__fused_rms_norm_view_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.ttgir", "triton_red_fused__fused_rms_norm_view_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.llir", "triton_red_fused__fused_rms_norm_view_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.ptx", "triton_red_fused__fused_rms_norm_view_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.cubin", "triton_red_fused__fused_rms_norm_view_1.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.json"}} \ No newline at end of file diff --git a/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.cubin b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..a4b30fba142c80477a85634e7287900bd3e88627 Binary files /dev/null and b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.cubin differ diff --git a/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.json b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2040ecea201391605f9d5162a87576c64e851aa3 --- /dev/null +++ b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.json @@ -0,0 +1 @@ +{"hash": "c4cc8035fbb91452869d41f5db1b945e18a038a0db3b2fb2ef809320338cf27a", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 16, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm_view_1"} \ No newline at end of file diff --git a/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.llir b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..ef2490914e89973042e5ef7b80b78646f89473c2 --- /dev/null +++ b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.llir @@ -0,0 +1,136 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__fused_rms_norm_view_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = shl i32 %7, 2, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = and i32 %9, 96, !dbg !9 + %11 = lshr exact i32 %10, 5, !dbg !9 + %12 = and i32 %9, 3, !dbg !9 + %13 = or disjoint i32 %11, %8, !dbg !10 + %14 = or disjoint i32 %8, %12, !dbg !10 + %15 = shl nuw nsw i32 %9, 2, !dbg !11 + %16 = and i32 %15, 124, !dbg !11 + %17 = sdiv i32 %13, 32, !dbg !12 + %18 = mul i32 %17, 32, !dbg !13 + %.decomposed = sub i32 %13, %18, !dbg !13 + %19 = shl nsw i32 %.decomposed, 7, !dbg !14 + %20 = or disjoint i32 %19, %16, !dbg !15 + %21 = mul i32 %17, 12288, !dbg !16 + %22 = add i32 %20, %21, !dbg !17 + %23 = sext i32 %22 to i64, !dbg !18 + %24 = getelementptr bfloat, ptr addrspace(1) %0, i64 %23, !dbg !18 + %25 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %26 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %24, i64 %25, i1 true) #4, !dbg !19 + %27 = extractvalue { i32, i32 } %26, 0, !dbg !19 + %28 = bitcast i32 %27 to <2 x bfloat>, !dbg !19 + %29 = extractvalue { i32, i32 } %26, 1, !dbg !19 + %30 = bitcast i32 %29 to <2 x bfloat>, !dbg !19 + %31 = extractelement <2 x bfloat> %28, i64 0, !dbg !19 + %32 = extractelement <2 x bfloat> %28, i64 1, !dbg !19 + %33 = extractelement <2 x bfloat> %30, i64 0, !dbg !19 + %34 = extractelement <2 x bfloat> %30, i64 1, !dbg !19 + %35 = fpext bfloat %31 to float, !dbg !20 + %36 = fpext bfloat %32 to float, !dbg !20 + %37 = fpext bfloat %33 to float, !dbg !20 + %38 = fpext bfloat %34 to float, !dbg !20 + %39 = fmul float %35, %35, !dbg !21 + %40 = fmul float %36, %36, !dbg !21 + %41 = fmul float %37, %37, !dbg !21 + %42 = fmul float %38, %38, !dbg !21 + %43 = fadd float %39, %40, !dbg !22 + %44 = fadd float %41, %43, !dbg !22 + %45 = fadd float %42, %44, !dbg !22 + %46 = bitcast float %45 to i32, !dbg !25 + %47 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %46, i32 16, i32 31), !dbg !25 + %48 = bitcast i32 %47 to float, !dbg !25 + %49 = fadd float %45, %48, !dbg !22 + %50 = bitcast float %49 to i32, !dbg !25 + %51 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %50, i32 8, i32 31), !dbg !25 + %52 = bitcast i32 %51 to float, !dbg !25 + %53 = fadd float %49, %52, !dbg !22 + %54 = bitcast float %53 to i32, !dbg !25 + %55 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %54, i32 4, i32 31), !dbg !25 + %56 = bitcast i32 %55 to float, !dbg !25 + %57 = fadd float %53, %56, !dbg !22 + %58 = bitcast float %57 to i32, !dbg !25 + %59 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %58, i32 2, i32 31), !dbg !25 + %60 = bitcast i32 %59 to float, !dbg !25 + %61 = fadd float %57, %60, !dbg !22 + %62 = bitcast float %61 to i32, !dbg !25 + %63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %62, i32 1, i32 31), !dbg !25 + %64 = bitcast i32 %63 to float, !dbg !25 + %65 = fadd float %61, %64, !dbg !22 + %66 = lshr exact i32 %10, 3, !dbg !28 + %67 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %66, !dbg !28 + store float %65, ptr addrspace(3) %67, align 4, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %68 = shl nuw nsw i32 %12, 2, !dbg !28 + %69 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %68, !dbg !28 + %70 = load i32, ptr addrspace(3) %69, align 4, !dbg !28 + %71 = sext i32 %14 to i64, !dbg !29 + %72 = getelementptr float, ptr addrspace(1) %1, i64 %71, !dbg !29 + %73 = and i32 %9, 124, !dbg !30 + %74 = icmp eq i32 %73, 0, !dbg !30 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %70, ptr addrspace(1) %72, i1 %74) #4, !dbg !30 + ret void, !dbg !31 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm_view_1", linkageName: "triton_red_fused__fused_rms_norm_view_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 19, scope: !4) +!13 = !DILocation(line: 28, column: 19, scope: !4) +!14 = !DILocation(line: 38, column: 45, scope: !4) +!15 = !DILocation(line: 38, column: 41, scope: !4) +!16 = !DILocation(line: 38, column: 56, scope: !4) +!17 = !DILocation(line: 38, column: 50, scope: !4) +!18 = !DILocation(line: 38, column: 34, scope: !4) +!19 = !DILocation(line: 38, column: 61, scope: !4) +!20 = !DILocation(line: 38, column: 115, scope: !4) +!21 = !DILocation(line: 40, column: 22, scope: !4) +!22 = !DILocation(line: 263, column: 15, scope: !23, inlinedAt: !25) +!23 = distinct !DILexicalBlockFile(scope: !4, file: !24, discriminator: 0) +!24 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!25 = !DILocation(line: 293, column: 36, scope: !23, inlinedAt: !26) +!26 = !DILocation(line: 44, column: 25, scope: !27) +!27 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0) +!28 = !DILocation(line: 44, column: 28, scope: !4) +!29 = !DILocation(line: 45, column: 25, scope: !4) +!30 = !DILocation(line: 45, column: 36, scope: !4) +!31 = !DILocation(line: 45, column: 4, scope: !4) diff --git a/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.ptx b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..3cccf1d41c7734e1ec2434a7569069bd3403d5ab --- /dev/null +++ b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.ptx @@ -0,0 +1,506 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused__fused_rms_norm_view_1 // -- Begin function triton_red_fused__fused_rms_norm_view_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__fused_rms_norm_view_1 +.visible .entry triton_red_fused__fused_rms_norm_view_1( + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_1, + .param .u32 triton_red_fused__fused_rms_norm_view_1_param_2, + .param .u32 triton_red_fused__fused_rms_norm_view_1_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_5 +) +.reqntid 128 +{ + .reg .pred %p<3>; + .reg .b16 %rs<5>; + .reg .b32 %r<48>; + .reg .b64 %rd<6>; + .loc 1 18 0 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:18:0 + +// %bb.0: + ld.param.b64 %rd4, [triton_red_fused__fused_rms_norm_view_1_param_0]; + ld.param.b64 %rd5, [triton_red_fused__fused_rms_norm_view_1_param_1]; +$L__tmp0: + .loc 1 23 28 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:23:28 + mov.u32 %r5, %ctaid.x; + .loc 1 23 33 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:23:33 + shl.b32 %r6, %r5, 2; + .loc 1 24 44 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:44 + mov.u32 %r7, %tid.x; + and.b32 %r8, %r7, 96; + bfe.u32 %r9, %r7, 5, 2; + and.b32 %r10, %r7, 3; + .loc 1 24 23 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:23 + or.b32 %r11, %r9, %r6; + or.b32 %r12, %r6, %r10; + .loc 1 26 37 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:26:37 + shl.b32 %r13, %r7, 2; + and.b32 %r14, %r13, 124; + .loc 1 29 19 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:29:19 + bfe.s32 %r15, %r5, 29, 1; + shr.u32 %r16, %r15, 27; + add.s32 %r17, %r11, %r16; + shr.u32 %r18, %r17, 5; + .loc 1 28 19 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:28:19 + and.b32 %r19, %r17, 33554400; + sub.s32 %r20, %r11, %r19; + .loc 1 38 45 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:45 + shl.b32 %r21, %r20, 7; + .loc 1 38 41 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:41 + or.b32 %r22, %r21, %r14; + .loc 1 38 50 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:50 + mad.lo.s32 %r23, %r18, 12288, %r22; + .loc 1 38 34 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:34 + mad.wide.s32 %rd1, %r23, 2, %rd4; + .loc 1 38 61 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:61 + // begin inline asm + mov.u64 %rd2, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0; + // end inline asm + mov.b32 %r3, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r1, %r3; + mov.u32 %r2, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2; + // end inline asm + mov.b32 {%rs1, %rs2}, %r1; + mov.b32 {%rs3, %rs4}, %r2; + .loc 1 38 115 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:115 + cvt.f32.bf16 %r24, %rs1; + cvt.f32.bf16 %r25, %rs2; + cvt.f32.bf16 %r26, %rs3; + cvt.f32.bf16 %r27, %rs4; + .loc 1 40 22 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:40:22 + mul.f32 %r28, %r25, %r25; +$L__tmp1: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + fma.rn.f32 %r29, %r24, %r24, %r28; + fma.rn.f32 %r30, %r26, %r26, %r29; + fma.rn.f32 %r31, %r27, %r27, %r30; +$L__tmp2: + .loc 2 293 36 // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] + shfl.sync.bfly.b32 %r32, %r31, 16, 31, -1; +$L__tmp3: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + add.f32 %r33, %r31, %r32; +$L__tmp4: + .loc 2 293 36 // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] + shfl.sync.bfly.b32 %r34, %r33, 8, 31, -1; +$L__tmp5: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + add.f32 %r35, %r33, %r34; +$L__tmp6: + .loc 2 293 36 // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] + shfl.sync.bfly.b32 %r36, %r35, 4, 31, -1; +$L__tmp7: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + add.f32 %r37, %r35, %r36; +$L__tmp8: + .loc 2 293 36 // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] + shfl.sync.bfly.b32 %r38, %r37, 2, 31, -1; +$L__tmp9: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + add.f32 %r39, %r37, %r38; +$L__tmp10: + .loc 2 293 36 // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] + shfl.sync.bfly.b32 %r40, %r39, 1, 31, -1; +$L__tmp11: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ] + add.f32 %r41, %r39, %r40; +$L__tmp12: + .loc 1 44 28 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:28 + shr.u32 %r42, %r8, 3; + mov.b32 %r43, global_smem; + add.s32 %r44, %r43, %r42; + st.shared.b32 [%r44], %r41; + bar.sync 0; + shl.b32 %r45, %r10, 2; + add.s32 %r46, %r43, %r45; + ld.shared.b32 %r4, [%r46]; + .loc 1 45 25 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:25 + mad.wide.s32 %rd3, %r12, 4, %rd5; + .loc 1 45 36 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:36 + and.b32 %r47, %r7, 124; + setp.eq.b32 %p2, %r47, 0; + // begin inline asm + @%p2 st.global.b32 [ %rd3 + 0 ], { %r4 }; + // end inline asm + .loc 1 45 4 // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:4 + ret; +$L__tmp13: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 339 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x14c DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 113 +.b8 105 +.b8 116 +.b8 120 +.b8 53 +.b8 104 +.b8 119 +.b8 117 +.b8 112 +.b8 107 +.b8 98 +.b8 106 +.b8 109 +.b8 99 +.b8 115 +.b8 111 +.b8 121 +.b8 107 +.b8 113 +.b8 101 +.b8 112 +.b8 122 +.b8 113 +.b8 99 +.b8 55 +.b8 122 +.b8 99 +.b8 120 +.b8 106 +.b8 99 +.b8 98 +.b8 53 +.b8 97 +.b8 99 +.b8 113 +.b8 107 +.b8 105 +.b8 55 +.b8 122 +.b8 99 +.b8 115 +.b8 106 +.b8 105 +.b8 102 +.b8 114 +.b8 110 +.b8 114 +.b8 122 +.b8 99 +.b8 114 +.b8 114 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 113 +.b8 105 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x2a DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 114 +.b8 109 +.b8 115 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x10e:0x48 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x123:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 44 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x13b:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.source b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.source new file mode 100644 index 0000000000000000000000000000000000000000..d008d1290cf63a8a576c3fae2b114c150f51189b --- /dev/null +++ b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.source @@ -0,0 +1,167 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0) +#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0) +#loc35 = loc(unknown) +#loc38 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0) +#loc42 = loc("in_ptr0"(#loc)) +#loc43 = loc("out_ptr0"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc74 = loc("input"(#loc33)) +#loc75 = loc("a"(#loc38)) +#loc76 = loc("b"(#loc38)) +module { + tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 65536 : i32 loc(#loc46) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc47) + %xoffset = tt.get_program_id x : i32 loc(#loc48) + %xoffset_2 = arith.constant 4 : i32 loc(#loc49) + %xoffset_3 = arith.constant 4 : i32 loc(#loc49) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc49) + %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc50) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc51) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<4x1xi32> loc(#loc52) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<4x1xi32> loc(#loc52) + %xmask = arith.constant true loc(#loc53) + %xmask_8 = arith.constant dense : tensor<4x128xi1> loc(#loc53) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc54) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc55) + %x0 = arith.constant 32 : i32 loc(#loc56) + %x0_10 = arith.constant 32 : i32 loc(#loc56) + %x0_11 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc56) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<4x1xi32> loc(#loc56) + %x1 = arith.constant 32 : i32 loc(#loc57) + %x1_13 = arith.constant 32 : i32 loc(#loc57) + %x1_14 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc57) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<4x1xi32> loc(#loc57) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc58) + %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc58) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c128_i32 = arith.constant 128 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_19 = %_tmp4_16) -> (tensor<4x128xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc60) + %r0_index_20 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc60) + %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc61) + %r0_mask_21 = arith.cmpi slt, %r0_index_20, %r0_mask : tensor<1x128xi32> loc(#loc61) + %tmp0 = arith.constant 128 : i32 loc(#loc62) + %tmp0_22 = arith.constant 128 : i32 loc(#loc62) + %tmp0_23 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc62) + %tmp0_24 = arith.muli %tmp0_23, %x0_12 : tensor<4x1xi32> loc(#loc62) + %tmp0_25 = tt.broadcast %r0_index_20 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc63) + %tmp0_26 = tt.broadcast %tmp0_24 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc63) + %tmp0_27 = arith.addi %tmp0_25, %tmp0_26 : tensor<4x128xi32> loc(#loc63) + %tmp0_28 = arith.constant 12288 : i32 loc(#loc64) + %tmp0_29 = arith.constant 12288 : i32 loc(#loc64) + %tmp0_30 = arith.constant dense<12288> : tensor<4x1xi32> loc(#loc64) + %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<4x1xi32> loc(#loc64) + %tmp0_32 = tt.broadcast %tmp0_31 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc65) + %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<4x128xi32> loc(#loc65) + %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc66) + %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc66) + %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc67) + %tmp0_37 = tt.broadcast %r0_mask_21 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc67) + %tmp0_38 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc67) + %tmp0_39 = arith.truncf %tmp0_38 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc67) + %tmp0_40 = tt.load %tmp0_35, %tmp0_37, %tmp0_39 evictionPolicy = evict_first : tensor<4x128x!tt.ptr> loc(#loc67) + %tmp0_41 = arith.extf %tmp0_40 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc68) + %tmp2 = arith.mulf %tmp0_41, %tmp0_41 : tensor<4x128xf32> loc(#loc69) + %tmp5 = arith.addf %_tmp4_19, %tmp2 : tensor<4x128xf32> loc(#loc70) + %_tmp4_42 = tt.broadcast %r0_mask_21 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc71) + %_tmp4_43 = arith.select %_tmp4_42, %tmp5, %_tmp4_19 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc71) + scf.yield %_tmp4_43 : tensor<4x128xf32> loc(#loc27) + } loc(#loc59) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S4_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc72) + %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc73) + %4 = tt.splat %out_ptr0 : !tt.ptr -> tensor<4x1x!tt.ptr> loc(#loc30) + %5 = tt.addptr %4, %xindex_7 : tensor<4x1x!tt.ptr>, tensor<4x1xi32> loc(#loc30) + tt.store %5, %tmp4_18 : tensor<4x1x!tt.ptr> loc(#loc31) + tt.return loc(#loc32) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S4_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<4x128xf32> loc("input"(#loc33))) -> tensor<4xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc34) + tt.reduce.return %2 : f32 loc(#loc34) + }) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc34) + tt.return %0 : tensor<4xf32> loc(#loc36) + ^bb1: // no predecessors + %1 = ub.poison : tensor<4xf32> loc(#loc37) + tt.return %1 : tensor<4xf32> loc(#loc37) + } loc(#loc33) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc38)), %b: f32 loc("b"(#loc38))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc39) + tt.return %0 : f32 loc(#loc40) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc41) + tt.return %1 : f32 loc(#loc41) + } loc(#loc38) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":25:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":30:43) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":32:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":33:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:8) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11) +#loc37 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4) +#loc39 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc40 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11) +#loc41 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4) +#loc46 = loc("xnumel"(#loc1)) +#loc47 = loc("r0_numel"(#loc2)) +#loc48 = loc("xoffset"(#loc3)) +#loc49 = loc("xoffset"(#loc4)) +#loc50 = loc("xindex"(#loc5)) +#loc51 = loc("xindex"(#loc6)) +#loc52 = loc("xindex"(#loc7)) +#loc53 = loc("xmask"(#loc8)) +#loc54 = loc("r0_base"(#loc9)) +#loc55 = loc("r0_base"(#loc10)) +#loc56 = loc("x0"(#loc11)) +#loc57 = loc("x1"(#loc12)) +#loc58 = loc("_tmp4"(#loc13)) +#loc59 = loc("_tmp4"(#loc14)) +#loc60 = loc("r0_index"(#loc15)) +#loc61 = loc("r0_mask"(#loc16)) +#loc62 = loc("tmp0"(#loc17)) +#loc63 = loc("tmp0"(#loc18)) +#loc64 = loc("tmp0"(#loc19)) +#loc65 = loc("tmp0"(#loc20)) +#loc66 = loc("tmp0"(#loc21)) +#loc67 = loc("tmp0"(#loc22)) +#loc68 = loc("tmp0"(#loc23)) +#loc69 = loc("tmp2"(#loc24)) +#loc70 = loc("tmp5"(#loc25)) +#loc71 = loc("_tmp4"(#loc26)) +#loc72 = loc("tmp4"(#loc28)) +#loc73 = loc("tmp4"(#loc29)) diff --git a/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.ttgir b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..c51f2eca3243f828f06f4f54c84c586d715a2d14 --- /dev/null +++ b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.ttgir @@ -0,0 +1,108 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 4], order = [0, 1]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0) +#loc1 = loc(unknown) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25) +#loc27 = loc("in_ptr0"(#loc)) +#loc28 = loc("out_ptr0"(#loc)) +#loc29 = loc("xnumel"(#loc)) +#loc30 = loc("r0_numel"(#loc)) +#loc49 = loc("tmp4"(#loc21)) +#loc52 = loc(callsite(#loc1 at #loc49)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<12288> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<32> : tensor<4x1xi32, #blocked> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<4x128xbf16, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<4x128xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc31) + %xoffset_5 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc32) + %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc33) + %xindex_6 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc33) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4x1xi32, #blocked> loc(#loc33) + %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc33) + %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<4x1xi32, #blocked> loc(#loc34) + %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<4x1xi32, #blocked1> loc(#loc34) + %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<4x1xi32, #blocked> loc(#loc34) + %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<4x1xi32, #blocked1> loc(#loc34) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc35) + %r0_base_13 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc35) + %x0 = arith.remsi %xindex_11, %cst_2 : tensor<4x1xi32, #blocked> loc(#loc36) + %x1 = arith.divsi %xindex_11, %cst_2 : tensor<4x1xi32, #blocked> loc(#loc37) + %r0_mask = arith.cmpi slt, %r0_base_13, %cst : tensor<1x128xi32, #blocked> loc(#loc38) + %tmp0 = arith.muli %x0, %cst_0 : tensor<4x1xi32, #blocked> loc(#loc39) + %tmp0_14 = tt.broadcast %r0_base_13 : tensor<1x128xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc40) + %tmp0_15 = tt.broadcast %tmp0 : tensor<4x1xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc40) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<4x128xi32, #blocked> loc(#loc40) + %tmp0_17 = arith.muli %x1, %cst_1 : tensor<4x1xi32, #blocked> loc(#loc41) + %tmp0_18 = tt.broadcast %tmp0_17 : tensor<4x1xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc42) + %tmp0_19 = arith.addi %tmp0_16, %tmp0_18 : tensor<4x128xi32, #blocked> loc(#loc42) + %tmp0_20 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr, #blocked> loc(#loc43) + %tmp0_21 = tt.addptr %tmp0_20, %tmp0_19 : tensor<4x128x!tt.ptr, #blocked>, tensor<4x128xi32, #blocked> loc(#loc43) + %tmp0_22 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked> -> tensor<4x128xi1, #blocked> loc(#loc44) + %tmp0_23 = tt.load %tmp0_21, %tmp0_22, %cst_3 evictionPolicy = evict_first : tensor<4x128x!tt.ptr, #blocked> loc(#loc44) + %tmp0_24 = arith.extf %tmp0_23 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc45) + %tmp2 = arith.mulf %tmp0_24, %tmp0_24 : tensor<4x128xf32, #blocked> loc(#loc46) + %tmp5 = arith.addf %tmp2, %cst_4 : tensor<4x128xf32, #blocked> loc(#loc47) + %_tmp4 = arith.select %tmp0_22, %tmp5, %cst_4 : tensor<4x128xi1, #blocked>, tensor<4x128xf32, #blocked> loc(#loc48) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_27: f32 loc(callsite(#loc1 at #loc49)), %tmp4_28: f32 loc(callsite(#loc1 at #loc49))): + %tmp4_29 = arith.addf %tmp4_27, %tmp4_28 : f32 loc(#loc53) + tt.reduce.return %tmp4_29 : f32 loc(#loc51) + }) : (tensor<4x128xf32, #blocked>) -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc51) + %tmp4_25 = ttg.convert_layout %tmp4 : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc50) + %tmp4_26 = tt.expand_dims %tmp4_25 {axis = 1 : i32} : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xf32, #blocked1> loc(#loc50) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<4x1x!tt.ptr, #blocked1> loc(#loc24) + %1 = tt.addptr %0, %xindex_12 : tensor<4x1x!tt.ptr, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc24) + tt.store %1, %tmp4_26 : tensor<4x1x!tt.ptr, #blocked1> loc(#loc25) + tt.return loc(#loc26) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40) +#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4) +#loc31 = loc("xoffset"(#loc2)) +#loc32 = loc("xoffset"(#loc3)) +#loc33 = loc("xindex"(#loc4)) +#loc34 = loc("xindex"(#loc5)) +#loc35 = loc("r0_base"(#loc6)) +#loc36 = loc("x0"(#loc7)) +#loc37 = loc("x1"(#loc8)) +#loc38 = loc("r0_mask"(#loc9)) +#loc39 = loc("tmp0"(#loc10)) +#loc40 = loc("tmp0"(#loc11)) +#loc41 = loc("tmp0"(#loc12)) +#loc42 = loc("tmp0"(#loc13)) +#loc43 = loc("tmp0"(#loc14)) +#loc44 = loc("tmp0"(#loc15)) +#loc45 = loc("tmp0"(#loc16)) +#loc46 = loc("tmp2"(#loc17)) +#loc47 = loc("tmp5"(#loc18)) +#loc48 = loc("_tmp4"(#loc19)) +#loc50 = loc("tmp4"(#loc23)) +#loc51 = loc(callsite(#loc20 at #loc49)) +#loc53 = loc(callsite(#loc22 at #loc51)) diff --git a/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.ttir b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..a4e88a203c59a9ec64867fc191d00a05bfdcc09b --- /dev/null +++ b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.ttir @@ -0,0 +1,105 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0) +#loc2 = loc(unknown) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25) +#loc29 = loc("in_ptr0"(#loc)) +#loc30 = loc("out_ptr0"(#loc)) +#loc31 = loc("xnumel"(#loc)) +#loc32 = loc("r0_numel"(#loc)) +#loc53 = loc("tmp4"(#loc23)) +#loc56 = loc(callsite(#loc2 at #loc53)) +module { + tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %tmp0 = arith.constant dense<0.000000e+00> : tensor<4x128xbf16> loc(#loc33) + %cst = arith.constant dense<12288> : tensor<4x1xi32> loc(#loc2) + %cst_0 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc2) + %cst_1 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc2) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc2) + %cst_3 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc2) + %c4_i32 = arith.constant 4 : i32 loc(#loc2) + %xoffset = tt.get_program_id x : i32 loc(#loc34) + %xoffset_4 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc35) + %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc36) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc37) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<4x1xi32> loc(#loc38) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<4x1xi32> loc(#loc38) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc39) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc40) + %x0 = arith.remsi %xindex_7, %cst_3 : tensor<4x1xi32> loc(#loc41) + %x1 = arith.divsi %xindex_7, %cst_3 : tensor<4x1xi32> loc(#loc42) + %r0_mask = arith.cmpi slt, %r0_base_8, %cst_1 : tensor<1x128xi32> loc(#loc43) + %tmp0_9 = arith.muli %x0, %cst_0 : tensor<4x1xi32> loc(#loc44) + %tmp0_10 = tt.broadcast %r0_base_8 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc45) + %tmp0_11 = tt.broadcast %tmp0_9 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc45) + %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<4x128xi32> loc(#loc45) + %tmp0_13 = arith.muli %x1, %cst : tensor<4x1xi32> loc(#loc46) + %tmp0_14 = tt.broadcast %tmp0_13 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc47) + %tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<4x128xi32> loc(#loc47) + %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc48) + %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc48) + %tmp0_18 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc33) + %tmp0_19 = tt.load %tmp0_17, %tmp0_18, %tmp0 evictionPolicy = evict_first : tensor<4x128x!tt.ptr> loc(#loc33) + %tmp0_20 = arith.extf %tmp0_19 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc49) + %tmp2 = arith.mulf %tmp0_20, %tmp0_20 : tensor<4x128xf32> loc(#loc50) + %tmp5 = arith.addf %tmp2, %cst_2 : tensor<4x128xf32> loc(#loc51) + %_tmp4 = arith.select %tmp0_18, %tmp5, %cst_2 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc52) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_22: f32 loc(callsite(#loc2 at #loc53)), %tmp4_23: f32 loc(callsite(#loc2 at #loc53))): + %tmp4_24 = arith.addf %tmp4_22, %tmp4_23 : f32 loc(#loc57) + tt.reduce.return %tmp4_24 : f32 loc(#loc55) + }) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc55) + %tmp4_21 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc54) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<4x1x!tt.ptr> loc(#loc26) + %1 = tt.addptr %0, %xindex_7 : tensor<4x1x!tt.ptr>, tensor<4x1xi32> loc(#loc26) + tt.store %1, %tmp4_21 : tensor<4x1x!tt.ptr> loc(#loc27) + tt.return loc(#loc28) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:27) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40) +#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc24 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4) +#loc33 = loc("tmp0"(#loc1)) +#loc34 = loc("xoffset"(#loc3)) +#loc35 = loc("xoffset"(#loc4)) +#loc36 = loc("xindex"(#loc5)) +#loc37 = loc("xindex"(#loc6)) +#loc38 = loc("xindex"(#loc7)) +#loc39 = loc("r0_base"(#loc8)) +#loc40 = loc("r0_base"(#loc9)) +#loc41 = loc("x0"(#loc10)) +#loc42 = loc("x1"(#loc11)) +#loc43 = loc("r0_mask"(#loc12)) +#loc44 = loc("tmp0"(#loc13)) +#loc45 = loc("tmp0"(#loc14)) +#loc46 = loc("tmp0"(#loc15)) +#loc47 = loc("tmp0"(#loc16)) +#loc48 = loc("tmp0"(#loc17)) +#loc49 = loc("tmp0"(#loc18)) +#loc50 = loc("tmp2"(#loc19)) +#loc51 = loc("tmp5"(#loc20)) +#loc52 = loc("_tmp4"(#loc21)) +#loc54 = loc("tmp4"(#loc25)) +#loc55 = loc(callsite(#loc22 at #loc53)) +#loc57 = loc(callsite(#loc24 at #loc55)) diff --git a/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7c23b6862e6f9974cc6a5ac9efa3c882e9db6f61 --- /dev/null +++ b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json"}} \ No newline at end of file diff --git a/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..d5cc87e55e0fff582530c02def97753686a3694b Binary files /dev/null and b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin differ diff --git a/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9696e3636d7e53ec7dfac85e4e4291c7dd2fc6e7 --- /dev/null +++ b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json @@ -0,0 +1 @@ +{"hash": "c4d2c743a45695d953ff668c5df14e4d1263c6479b49b1fd4aa8f2c88a3f17f7", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 2048, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0"} \ No newline at end of file diff --git a/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..df248f684e0ede3c641c5469aa4e3e660c212bcd --- /dev/null +++ b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir @@ -0,0 +1,666 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %13 = shl i32 %12, 3, !dbg !9 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %15 = and i32 %14, 224, !dbg !10 + %16 = lshr exact i32 %15, 5, !dbg !10 + %17 = or disjoint i32 %16, %13, !dbg !11 + %18 = shl nuw nsw i32 %14, 1, !dbg !12 + %19 = and i32 %18, 62, !dbg !12 + %20 = sdiv i32 %17, 32, !dbg !13 + %21 = shl i32 %17, 7 + %22 = shl i32 %20, 15 + %23 = add i32 %22, %21 + %24 = add i32 %23, 4096 + %25 = zext nneg i32 %19 to i64, !dbg !14 + %26 = or disjoint i32 %24, %19, !dbg !15 + %27 = sext i32 %26 to i64, !dbg !16 + %28 = getelementptr bfloat, ptr addrspace(1) %2, i64 %27, !dbg !16 + %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17 + %30 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %28, i64 %29, i1 true) #6, !dbg !17 + %31 = bitcast i32 %30 to <2 x bfloat>, !dbg !17 + %32 = extractelement <2 x bfloat> %31, i64 0, !dbg !17 + %33 = extractelement <2 x bfloat> %31, i64 1, !dbg !17 + %34 = fpext bfloat %32 to float, !dbg !18 + %35 = fpext bfloat %33 to float, !dbg !18 + %36 = or disjoint i32 %23, %19, !dbg !19 + %37 = sext i32 %36 to i64, !dbg !20 + %38 = getelementptr bfloat, ptr addrspace(1) %2, i64 %37, !dbg !20 + %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !21 + %40 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %38, i64 %39, i1 true) #6, !dbg !21 + %41 = bitcast i32 %40 to <2 x bfloat>, !dbg !21 + %42 = extractelement <2 x bfloat> %41, i64 0, !dbg !21 + %43 = extractelement <2 x bfloat> %41, i64 1, !dbg !21 + %44 = fpext bfloat %42 to float, !dbg !22 + %45 = fpext bfloat %43 to float, !dbg !22 + %46 = fmul float %34, %34, !dbg !23 + %47 = fmul float %35, %35, !dbg !23 + %48 = fmul float %44, %44, !dbg !24 + %49 = fmul float %45, %45, !dbg !24 + %50 = or disjoint i32 %19, 64, !dbg !25 + %51 = or disjoint i32 %24, %50, !dbg !15 + %52 = sext i32 %51 to i64, !dbg !16 + %53 = getelementptr bfloat, ptr addrspace(1) %2, i64 %52, !dbg !16 + %54 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17 + %55 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %53, i64 %54, i1 true) #6, !dbg !17 + %56 = bitcast i32 %55 to <2 x bfloat>, !dbg !17 + %57 = extractelement <2 x bfloat> %56, i64 0, !dbg !17 + %58 = extractelement <2 x bfloat> %56, i64 1, !dbg !17 + %59 = fpext bfloat %57 to float, !dbg !18 + %60 = fpext bfloat %58 to float, !dbg !18 + %61 = or disjoint i32 %23, %50, !dbg !19 + %62 = sext i32 %61 to i64, !dbg !20 + %63 = getelementptr bfloat, ptr addrspace(1) %2, i64 %62, !dbg !20 + %64 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !21 + %65 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %63, i64 %64, i1 true) #6, !dbg !21 + %66 = bitcast i32 %65 to <2 x bfloat>, !dbg !21 + %67 = extractelement <2 x bfloat> %66, i64 0, !dbg !21 + %68 = extractelement <2 x bfloat> %66, i64 1, !dbg !21 + %69 = fpext bfloat %67 to float, !dbg !22 + %70 = fpext bfloat %68 to float, !dbg !22 + %71 = fmul float %59, %59, !dbg !23 + %72 = fmul float %60, %60, !dbg !23 + %73 = fadd float %46, %71, !dbg !26 + %74 = fadd float %47, %72, !dbg !26 + %75 = fmul float %69, %69, !dbg !24 + %76 = fmul float %70, %70, !dbg !24 + %77 = fadd float %48, %75, !dbg !27 + %78 = fadd float %49, %76, !dbg !27 + %79 = and i32 %14, 7, !dbg !10 + %80 = or disjoint i32 %13, %79, !dbg !11 + %81 = and i32 %14, 248, !dbg !12 + %82 = lshr exact i32 %81, 3, !dbg !12 + %83 = sdiv i32 %80, 32, !dbg !13 + %84 = fadd float %73, %74, !dbg !28 + %85 = bitcast float %84 to i32, !dbg !31 + %86 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %85, i32 16, i32 31), !dbg !31 + %87 = bitcast i32 %86 to float, !dbg !31 + %88 = fadd float %84, %87, !dbg !28 + %89 = bitcast float %88 to i32, !dbg !31 + %90 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %89, i32 8, i32 31), !dbg !31 + %91 = bitcast i32 %90 to float, !dbg !31 + %92 = fadd float %88, %91, !dbg !28 + %93 = bitcast float %92 to i32, !dbg !31 + %94 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %93, i32 4, i32 31), !dbg !31 + %95 = bitcast i32 %94 to float, !dbg !31 + %96 = fadd float %92, %95, !dbg !28 + %97 = bitcast float %96 to i32, !dbg !31 + %98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %97, i32 2, i32 31), !dbg !31 + %99 = bitcast i32 %98 to float, !dbg !31 + %100 = fadd float %96, %99, !dbg !28 + %101 = bitcast float %100 to i32, !dbg !31 + %102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 1, i32 31), !dbg !31 + %103 = bitcast i32 %102 to float, !dbg !31 + %104 = fadd float %100, %103, !dbg !28 + %105 = fadd float %77, %78, !dbg !34 + %106 = bitcast float %105 to i32, !dbg !35 + %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 16, i32 31), !dbg !35 + %108 = bitcast i32 %107 to float, !dbg !35 + %109 = fadd float %105, %108, !dbg !34 + %110 = bitcast float %109 to i32, !dbg !35 + %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 8, i32 31), !dbg !35 + %112 = bitcast i32 %111 to float, !dbg !35 + %113 = fadd float %109, %112, !dbg !34 + %114 = bitcast float %113 to i32, !dbg !35 + %115 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %114, i32 4, i32 31), !dbg !35 + %116 = bitcast i32 %115 to float, !dbg !35 + %117 = fadd float %113, %116, !dbg !34 + %118 = bitcast float %117 to i32, !dbg !35 + %119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %118, i32 2, i32 31), !dbg !35 + %120 = bitcast i32 %119 to float, !dbg !35 + %121 = fadd float %117, %120, !dbg !34 + %122 = bitcast float %121 to i32, !dbg !35 + %123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 1, i32 31), !dbg !35 + %124 = bitcast i32 %123 to float, !dbg !35 + %125 = fadd float %121, %124, !dbg !34 + %126 = shl i32 %20, 7, !dbg !37 + %127 = tail call float @llvm.nvvm.div.full(float %125, float 1.280000e+02), !dbg !38 + %128 = fadd float %127, 0x3EB0C6F7A0000000, !dbg !39 + %129 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40 + %.not.i = icmp eq i32 %129, 0, !dbg !40 + br i1 %.not.i, label %132, label %130, !dbg !40 + +130: ; preds = %11 + %131 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %128), !dbg !40 + br label %__nv_rsqrtf.exit, !dbg !40 + +132: ; preds = %11 + %133 = tail call float @llvm.nvvm.rsqrt.approx.f(float %128), !dbg !40 + br label %__nv_rsqrtf.exit, !dbg !40 + +__nv_rsqrtf.exit: ; preds = %130, %132 + %.0.i = phi float [ %131, %130 ], [ %133, %132 ], !dbg !40 + %134 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40 + %.not.i3 = icmp eq i32 %134, 0, !dbg !40 + br i1 %.not.i3, label %137, label %135, !dbg !40 + +135: ; preds = %__nv_rsqrtf.exit + %136 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %128), !dbg !40 + br label %__nv_rsqrtf.exit5, !dbg !40 + +137: ; preds = %__nv_rsqrtf.exit + %138 = tail call float @llvm.nvvm.rsqrt.approx.f(float %128), !dbg !40 + br label %__nv_rsqrtf.exit5, !dbg !40 + +__nv_rsqrtf.exit5: ; preds = %135, %137 + %.0.i4 = phi float [ %136, %135 ], [ %138, %137 ], !dbg !40 + %139 = lshr exact i32 %15, 3, !dbg !41 + %140 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %139, !dbg !41 + store float %.0.i, ptr addrspace(3) %140, align 4, !dbg !41 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !41 + %141 = shl nuw nsw i32 %79, 2, !dbg !41 + %142 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %141, !dbg !41 + %143 = load float, ptr addrspace(3) %142, align 4, !dbg !41 + %144 = tail call float @llvm.nvvm.div.full(float %104, float 1.280000e+02), !dbg !42 + %145 = fadd float %144, 0x3EB0C6F7A0000000, !dbg !43 + %146 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i6 = icmp eq i32 %146, 0, !dbg !44 + br i1 %.not.i6, label %149, label %147, !dbg !44 + +147: ; preds = %__nv_rsqrtf.exit5 + %148 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %145), !dbg !44 + br label %__nv_rsqrtf.exit8, !dbg !44 + +149: ; preds = %__nv_rsqrtf.exit5 + %150 = tail call float @llvm.nvvm.rsqrt.approx.f(float %145), !dbg !44 + br label %__nv_rsqrtf.exit8, !dbg !44 + +__nv_rsqrtf.exit8: ; preds = %147, %149 + %.0.i7 = phi float [ %148, %147 ], [ %150, %149 ], !dbg !44 + %151 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i9 = icmp eq i32 %151, 0, !dbg !44 + br i1 %.not.i9, label %154, label %152, !dbg !44 + +152: ; preds = %__nv_rsqrtf.exit8 + %153 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %145), !dbg !44 + br label %__nv_rsqrtf.exit11, !dbg !44 + +154: ; preds = %__nv_rsqrtf.exit8 + %155 = tail call float @llvm.nvvm.rsqrt.approx.f(float %145), !dbg !44 + br label %__nv_rsqrtf.exit11, !dbg !44 + +__nv_rsqrtf.exit11: ; preds = %152, %154 + %.0.i10 = phi float [ %153, %152 ], [ %155, %154 ], !dbg !44 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45 + store float %.0.i7, ptr addrspace(3) %140, align 4, !dbg !45 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45 + %156 = load float, ptr addrspace(3) %142, align 4, !dbg !45 + %157 = shl i32 %17, 7, !dbg !46 + %158 = and i32 %82, 1 + %.masked = and i32 %82, 30 + %159 = and i32 %14, 15 + %160 = shl nuw nsw i32 %159, 3 + %161 = shl nuw nsw i32 %15, 2 + %162 = lshr exact i32 %15, 1 + %163 = lshr i32 %14, 2 + %164 = and i32 %163, 4 + %165 = or disjoint i32 %160, %161 + %166 = xor i32 %165, %162 + %167 = or disjoint i32 %166, %164 + %168 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %167 + %169 = xor i32 %167, 1028 + %170 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %169 + %171 = shl nuw nsw i32 %159, 7 + %172 = shl nuw nsw i32 %79, 4 + %173 = lshr exact i32 %81, 1 + %174 = xor i32 %172, %173 + %175 = or disjoint i32 %174, %171 + %176 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %175 + %177 = xor i32 %175, 4 + %178 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %177 + %179 = icmp eq i32 %158, 0 + %180 = shl i32 %80, 7 + %181 = shl i32 %83, 15 + %182 = add i32 %181, %180 + %183 = icmp ne i32 %158, 0 + %184 = add i32 %182, 4097 + %185 = add i32 %182, 4096 + %186 = shl nuw nsw i32 %79, 7 + %187 = lshr i32 %14, 1 + %188 = and i32 %187, 12 + %189 = and i32 %163, 48 + %190 = lshr i32 %14, 4 + %191 = and i32 %190, 2 + %192 = or disjoint i32 %186, %191 + %193 = or disjoint i32 %172, %188 + %194 = xor i32 %193, %189 + %195 = or disjoint i32 %194, %192 + %196 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %195 + %197 = xor i32 %195, 64 + %198 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %197 + %199 = shl nuw nsw i32 %14, 2 + %200 = and i32 %199, 1008 + %201 = shl nuw nsw i32 %14, 3 + %202 = and i32 %201, 8 + %203 = and i32 %14, 2 + %204 = xor i32 %200, %162 + %205 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %202 + %206 = getelementptr inbounds nuw i8, ptr addrspace(3) %205, i32 %203 + %207 = getelementptr inbounds nuw i8, ptr addrspace(3) %206, i32 %204 + %208 = getelementptr inbounds nuw i8, ptr addrspace(3) %207, i32 4 + %209 = zext nneg i32 %.masked to i64, !dbg !47 + %210 = sext i32 %126 to i64, !dbg !47 + %211 = sext i32 %157 to i64, !dbg !47 + br label %212, !dbg !47 + +212: ; preds = %__nv_rsqrtf.exit11, %212 + %213 = phi i1 [ true, %__nv_rsqrtf.exit11 ], [ false, %212 ] + %indvars.iv = phi i64 [ 0, %__nv_rsqrtf.exit11 ], [ 64, %212 ] + %214 = or disjoint i64 %indvars.iv, %25, !dbg !48 + %215 = or disjoint i64 %indvars.iv, %209, !dbg !49 + %216 = or disjoint i64 %215, 32, !dbg !49 + %217 = trunc nuw nsw i64 %214 to i32, !dbg !50 + %218 = or disjoint i32 %23, %217, !dbg !50 + %219 = sext i32 %218 to i64, !dbg !51 + %220 = getelementptr bfloat, ptr addrspace(1) %2, i64 %219, !dbg !51 + %221 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !52 + %222 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %220, i64 %221, i1 true) #6, !dbg !52 + %223 = bitcast i32 %222 to <2 x bfloat>, !dbg !52 + %224 = extractelement <2 x bfloat> %223, i64 0, !dbg !52 + %225 = extractelement <2 x bfloat> %223, i64 1, !dbg !52 + %226 = fpext bfloat %224 to float, !dbg !53 + %227 = fpext bfloat %225 to float, !dbg !53 + %228 = getelementptr bfloat, ptr addrspace(1) %3, i64 %214, !dbg !54 + %229 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !55 + %230 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %228, i64 %229, i1 true) #6, !dbg !55 + %231 = bitcast i32 %230 to <2 x bfloat>, !dbg !55 + %232 = extractelement <2 x bfloat> %231, i64 0, !dbg !55 + %233 = extractelement <2 x bfloat> %231, i64 1, !dbg !55 + %234 = fpext bfloat %232 to float, !dbg !56 + %235 = fpext bfloat %233 to float, !dbg !56 + %236 = or disjoint i64 %214, %210, !dbg !57 + %237 = getelementptr float, ptr addrspace(1) %4, i64 %236, !dbg !58 + %238 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !59 + %239 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %237, i64 %238, i1 true) #6, !dbg !59 + %240 = extractvalue { i32, i32 } %239, 0, !dbg !59 + %241 = extractvalue { i32, i32 } %239, 1, !dbg !59 + %242 = bitcast i32 %240 to float, !dbg !59 + %243 = bitcast i32 %241 to float, !dbg !59 + %244 = getelementptr float, ptr addrspace(1) %5, i64 %236, !dbg !60 + %245 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !61 + %246 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %244, i64 %245, i1 true) #6, !dbg !61 + %247 = extractvalue { i32, i32 } %246, 0, !dbg !61 + %248 = extractvalue { i32, i32 } %246, 1, !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %249 = insertelement <1 x i32> poison, i32 %247, i64 0, !dbg !61 + store <1 x i32> %249, ptr addrspace(3) %168, align 4, !dbg !61 + %250 = insertelement <1 x i32> poison, i32 %248, i64 0, !dbg !61 + store <1 x i32> %250, ptr addrspace(3) %170, align 4, !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %251 = load float, ptr addrspace(3) %176, align 4, !dbg !61 + %252 = load float, ptr addrspace(3) %178, align 4, !dbg !61 + %253 = or disjoint i32 %24, %217, !dbg !62 + %254 = sext i32 %253 to i64, !dbg !63 + %255 = getelementptr bfloat, ptr addrspace(1) %2, i64 %254, !dbg !63 + %256 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !64 + %257 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %255, i64 %256, i1 true) #6, !dbg !64 + %258 = bitcast i32 %257 to <2 x bfloat>, !dbg !64 + %259 = extractelement <2 x bfloat> %258, i64 0, !dbg !64 + %260 = extractelement <2 x bfloat> %258, i64 1, !dbg !64 + %261 = fpext bfloat %259 to float, !dbg !65 + %262 = fpext bfloat %260 to float, !dbg !65 + %263 = getelementptr bfloat, ptr addrspace(1) %6, i64 %214, !dbg !66 + %264 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !67 + %265 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %263, i64 %264, i1 true) #6, !dbg !67 + %266 = bitcast i32 %265 to <2 x bfloat>, !dbg !67 + %267 = extractelement <2 x bfloat> %266, i64 0, !dbg !67 + %268 = extractelement <2 x bfloat> %266, i64 1, !dbg !67 + %269 = fpext bfloat %267 to float, !dbg !68 + %270 = fpext bfloat %268 to float, !dbg !68 + %271 = or disjoint i64 %215, 1, !dbg !69 + %272 = or disjoint i64 %215, 33, !dbg !69 + %273 = trunc nuw nsw i64 %271 to i32, !dbg !70 + %274 = or disjoint i32 %182, %273, !dbg !70 + %275 = trunc nuw nsw i64 %272 to i32, !dbg !70 + %276 = or disjoint i32 %182, %275, !dbg !70 + %277 = sext i32 %274 to i64, !dbg !71 + %278 = getelementptr bfloat, ptr addrspace(1) %2, i64 %277, !dbg !71 + %279 = sext i32 %276 to i64, !dbg !71 + %280 = getelementptr bfloat, ptr addrspace(1) %2, i64 %279, !dbg !71 + %281 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72 + %282 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %278, i64 %281, i1 %179) #6, !dbg !72 + %283 = bitcast i16 %282 to bfloat, !dbg !72 + %284 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72 + %285 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %280, i64 %284, i1 %179) #6, !dbg !72 + %286 = bitcast i16 %285 to bfloat, !dbg !72 + %287 = fpext bfloat %283 to float, !dbg !73 + %288 = fpext bfloat %286 to float, !dbg !73 + %289 = fmul float %143, %287, !dbg !41 + %290 = fmul float %143, %288, !dbg !41 + %291 = getelementptr bfloat, ptr addrspace(1) %3, i64 %271, !dbg !74 + %292 = getelementptr bfloat, ptr addrspace(1) %3, i64 %272, !dbg !74 + %293 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75 + %294 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %291, i64 %293, i1 %179) #6, !dbg !75 + %295 = bitcast i16 %294 to bfloat, !dbg !75 + %296 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75 + %297 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %292, i64 %296, i1 %179) #6, !dbg !75 + %298 = bitcast i16 %297 to bfloat, !dbg !75 + %299 = fpext bfloat %295 to float, !dbg !76 + %300 = fpext bfloat %298 to float, !dbg !76 + %301 = fmul float %289, %299, !dbg !77 + %302 = fmul float %290, %300, !dbg !77 + %303 = fsub float 0.000000e+00, %301, !dbg !78 + %304 = fsub float 0.000000e+00, %302, !dbg !78 + %305 = trunc nuw nsw i64 %215 to i32, !dbg !79 + %306 = or disjoint i32 %182, %305, !dbg !79 + %307 = trunc nuw nsw i64 %216 to i32, !dbg !79 + %308 = or disjoint i32 %182, %307, !dbg !79 + %309 = sext i32 %306 to i64, !dbg !80 + %310 = getelementptr bfloat, ptr addrspace(1) %2, i64 %309, !dbg !80 + %311 = sext i32 %308 to i64, !dbg !80 + %312 = getelementptr bfloat, ptr addrspace(1) %2, i64 %311, !dbg !80 + %313 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81 + %314 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %310, i64 %313, i1 %183) #6, !dbg !81 + %315 = bitcast i16 %314 to bfloat, !dbg !81 + %316 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81 + %317 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %312, i64 %316, i1 %183) #6, !dbg !81 + %318 = bitcast i16 %317 to bfloat, !dbg !81 + %319 = fpext bfloat %315 to float, !dbg !82 + %320 = fpext bfloat %318 to float, !dbg !82 + %321 = fmul float %143, %319, !dbg !83 + %322 = fmul float %143, %320, !dbg !83 + %323 = getelementptr bfloat, ptr addrspace(1) %3, i64 %215, !dbg !84 + %324 = getelementptr bfloat, ptr addrspace(1) %3, i64 %216, !dbg !84 + %325 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85 + %326 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %323, i64 %325, i1 %183) #6, !dbg !85 + %327 = bitcast i16 %326 to bfloat, !dbg !85 + %328 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85 + %329 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %324, i64 %328, i1 %183) #6, !dbg !85 + %330 = bitcast i16 %329 to bfloat, !dbg !85 + %331 = fpext bfloat %327 to float, !dbg !86 + %332 = fpext bfloat %330 to float, !dbg !86 + %333 = fmul float %321, %331, !dbg !87 + %334 = fmul float %322, %332, !dbg !87 + %335 = select i1 %179, float %303, float %333, !dbg !88 + %336 = select i1 %179, float %304, float %334, !dbg !88 + %337 = fmul float %.0.i4, %226, !dbg !89 + %338 = fmul float %.0.i4, %227, !dbg !89 + %339 = fmul float %337, %234, !dbg !90 + %340 = fmul float %338, %235, !dbg !90 + %341 = fmul float %339, %242, !dbg !91 + %342 = fmul float %340, %243, !dbg !91 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !91 + store float %341, ptr addrspace(3) %168, align 4, !dbg !91 + store float %342, ptr addrspace(3) %170, align 4, !dbg !91 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !91 + %343 = load float, ptr addrspace(3) %176, align 4, !dbg !91 + %344 = load float, ptr addrspace(3) %178, align 4, !dbg !91 + %345 = fmul float %251, %335, !dbg !92 + %346 = fmul float %252, %336, !dbg !92 + %347 = fadd float %345, %343, !dbg !93 + %348 = fadd float %346, %344, !dbg !93 + %349 = or disjoint i32 %184, %305, !dbg !94 + %350 = or disjoint i32 %184, %307, !dbg !94 + %351 = sext i32 %349 to i64, !dbg !95 + %352 = getelementptr bfloat, ptr addrspace(1) %2, i64 %351, !dbg !95 + %353 = sext i32 %350 to i64, !dbg !95 + %354 = getelementptr bfloat, ptr addrspace(1) %2, i64 %353, !dbg !95 + %355 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96 + %356 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %352, i64 %355, i1 %179) #6, !dbg !96 + %357 = bitcast i16 %356 to bfloat, !dbg !96 + %358 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96 + %359 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %354, i64 %358, i1 %179) #6, !dbg !96 + %360 = bitcast i16 %359 to bfloat, !dbg !96 + %361 = fpext bfloat %357 to float, !dbg !97 + %362 = fpext bfloat %360 to float, !dbg !97 + %363 = fmul float %156, %361, !dbg !45 + %364 = fmul float %156, %362, !dbg !45 + %365 = getelementptr bfloat, ptr addrspace(1) %6, i64 %271, !dbg !98 + %366 = getelementptr bfloat, ptr addrspace(1) %6, i64 %272, !dbg !98 + %367 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99 + %368 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %365, i64 %367, i1 %179) #6, !dbg !99 + %369 = bitcast i16 %368 to bfloat, !dbg !99 + %370 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99 + %371 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %366, i64 %370, i1 %179) #6, !dbg !99 + %372 = bitcast i16 %371 to bfloat, !dbg !99 + %373 = fpext bfloat %369 to float, !dbg !100 + %374 = fpext bfloat %372 to float, !dbg !100 + %375 = fmul float %363, %373, !dbg !101 + %376 = fmul float %364, %374, !dbg !101 + %377 = fsub float 0.000000e+00, %375, !dbg !102 + %378 = fsub float 0.000000e+00, %376, !dbg !102 + %379 = or disjoint i32 %185, %305, !dbg !103 + %380 = or disjoint i32 %185, %307, !dbg !103 + %381 = sext i32 %379 to i64, !dbg !104 + %382 = getelementptr bfloat, ptr addrspace(1) %2, i64 %381, !dbg !104 + %383 = sext i32 %380 to i64, !dbg !104 + %384 = getelementptr bfloat, ptr addrspace(1) %2, i64 %383, !dbg !104 + %385 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105 + %386 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %382, i64 %385, i1 %183) #6, !dbg !105 + %387 = bitcast i16 %386 to bfloat, !dbg !105 + %388 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105 + %389 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %384, i64 %388, i1 %183) #6, !dbg !105 + %390 = bitcast i16 %389 to bfloat, !dbg !105 + %391 = fpext bfloat %387 to float, !dbg !106 + %392 = fpext bfloat %390 to float, !dbg !106 + %393 = fmul float %156, %391, !dbg !107 + %394 = fmul float %156, %392, !dbg !107 + %395 = getelementptr bfloat, ptr addrspace(1) %6, i64 %215, !dbg !108 + %396 = getelementptr bfloat, ptr addrspace(1) %6, i64 %216, !dbg !108 + %397 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109 + %398 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %395, i64 %397, i1 %183) #6, !dbg !109 + %399 = bitcast i16 %398 to bfloat, !dbg !109 + %400 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109 + %401 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %396, i64 %400, i1 %183) #6, !dbg !109 + %402 = bitcast i16 %401 to bfloat, !dbg !109 + %403 = fpext bfloat %399 to float, !dbg !110 + %404 = fpext bfloat %402 to float, !dbg !110 + %405 = fmul float %393, %403, !dbg !111 + %406 = fmul float %394, %404, !dbg !111 + %407 = select i1 %179, float %377, float %405, !dbg !88 + %408 = select i1 %179, float %378, float %406, !dbg !88 + %409 = fmul float %.0.i10, %261, !dbg !112 + %410 = fmul float %.0.i10, %262, !dbg !112 + %411 = fmul float %409, %269, !dbg !113 + %412 = fmul float %410, %270, !dbg !113 + %413 = fmul float %411, %242, !dbg !114 + %414 = fmul float %412, %243, !dbg !114 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !114 + store float %413, ptr addrspace(3) %168, align 4, !dbg !114 + store float %414, ptr addrspace(3) %170, align 4, !dbg !114 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !114 + %415 = load float, ptr addrspace(3) %176, align 4, !dbg !114 + %416 = load float, ptr addrspace(3) %178, align 4, !dbg !114 + %417 = fmul float %251, %407, !dbg !115 + %418 = fmul float %252, %408, !dbg !115 + %419 = fadd float %417, %415, !dbg !116 + %420 = fadd float %418, %416, !dbg !116 + %421 = or disjoint i64 %214, %211, !dbg !117 + %422 = getelementptr bfloat, ptr addrspace(1) %0, i64 %421, !dbg !118 + %423 = fptrunc float %347 to bfloat, !dbg !119 + %424 = fptrunc float %348 to bfloat, !dbg !119 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119 + store bfloat %423, ptr addrspace(3) %196, align 2, !dbg !119 + store bfloat %424, ptr addrspace(3) %198, align 2, !dbg !119 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119 + %425 = load bfloat, ptr addrspace(3) %207, align 2, !dbg !119 + %426 = load bfloat, ptr addrspace(3) %208, align 2, !dbg !119 + %427 = insertelement <2 x bfloat> poison, bfloat %425, i64 0, !dbg !119 + %428 = insertelement <2 x bfloat> %427, bfloat %426, i64 1, !dbg !119 + %429 = bitcast <2 x bfloat> %428 to i32, !dbg !119 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %429, ptr addrspace(1) %422, i1 true) #6, !dbg !119 + %430 = getelementptr bfloat, ptr addrspace(1) %1, i64 %421, !dbg !120 + %431 = fptrunc float %419 to bfloat, !dbg !121 + %432 = fptrunc float %420 to bfloat, !dbg !121 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !121 + store bfloat %431, ptr addrspace(3) %196, align 2, !dbg !121 + store bfloat %432, ptr addrspace(3) %198, align 2, !dbg !121 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !121 + %433 = load bfloat, ptr addrspace(3) %207, align 2, !dbg !121 + %434 = load bfloat, ptr addrspace(3) %208, align 2, !dbg !121 + %435 = insertelement <2 x bfloat> poison, bfloat %433, i64 0, !dbg !121 + %436 = insertelement <2 x bfloat> %435, bfloat %434, i64 1, !dbg !121 + %437 = bitcast <2 x bfloat> %436 to i32, !dbg !121 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %437, ptr addrspace(1) %430, i1 true) #6, !dbg !121 + br i1 %213, label %212, label %438, !dbg !47 + +438: ; preds = %212 + ret void, !dbg !122 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", linkageName: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 23, column: 33, scope: !5) +!10 = !DILocation(line: 24, column: 44, scope: !5) +!11 = !DILocation(line: 24, column: 23, scope: !5) +!12 = !DILocation(line: 26, column: 37, scope: !5) +!13 = !DILocation(line: 29, column: 19, scope: !5) +!14 = !DILocation(line: 33, column: 43, scope: !5) +!15 = !DILocation(line: 39, column: 57, scope: !5) +!16 = !DILocation(line: 39, column: 34, scope: !5) +!17 = !DILocation(line: 39, column: 68, scope: !5) +!18 = !DILocation(line: 39, column: 121, scope: !5) +!19 = !DILocation(line: 40, column: 50, scope: !5) +!20 = !DILocation(line: 40, column: 34, scope: !5) +!21 = !DILocation(line: 40, column: 61, scope: !5) +!22 = !DILocation(line: 40, column: 114, scope: !5) +!23 = !DILocation(line: 42, column: 22, scope: !5) +!24 = !DILocation(line: 47, column: 22, scope: !5) +!25 = !DILocation(line: 34, column: 31, scope: !5) +!26 = !DILocation(line: 44, column: 23, scope: !5) +!27 = !DILocation(line: 49, column: 25, scope: !5) +!28 = !DILocation(line: 263, column: 15, scope: !29, inlinedAt: !31) +!29 = distinct !DILexicalBlockFile(scope: !5, file: !30, discriminator: 0) +!30 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language") +!31 = !DILocation(line: 293, column: 36, scope: !29, inlinedAt: !32) +!32 = !DILocation(line: 51, column: 25, scope: !33) +!33 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!34 = !DILocation(line: 263, column: 15, scope: !29, inlinedAt: !35) +!35 = !DILocation(line: 293, column: 36, scope: !29, inlinedAt: !36) +!36 = !DILocation(line: 52, column: 27, scope: !33) +!37 = !DILocation(line: 63, column: 46, scope: !5) +!38 = !DILocation(line: 75, column: 25, scope: !5) +!39 = !DILocation(line: 77, column: 24, scope: !5) +!40 = !DILocation(line: 78, column: 32, scope: !5) +!41 = !DILocation(line: 79, column: 24, scope: !5) +!42 = !DILocation(line: 123, column: 24, scope: !5) +!43 = !DILocation(line: 124, column: 24, scope: !5) +!44 = !DILocation(line: 125, column: 32, scope: !5) +!45 = !DILocation(line: 126, column: 24, scope: !5) +!46 = !DILocation(line: 161, column: 43, scope: !5) +!47 = !DILocation(line: 53, column: 43, scope: !5) +!48 = !DILocation(line: 54, column: 31, scope: !5) +!49 = !DILocation(line: 72, column: 41, scope: !5) +!50 = !DILocation(line: 61, column: 51, scope: !5) +!51 = !DILocation(line: 61, column: 35, scope: !5) +!52 = !DILocation(line: 61, column: 62, scope: !5) +!53 = !DILocation(line: 61, column: 115, scope: !5) +!54 = !DILocation(line: 62, column: 35, scope: !5) +!55 = !DILocation(line: 62, column: 42, scope: !5) +!56 = !DILocation(line: 62, column: 95, scope: !5) +!57 = !DILocation(line: 63, column: 42, scope: !5) +!58 = !DILocation(line: 63, column: 35, scope: !5) +!59 = !DILocation(line: 63, column: 51, scope: !5) +!60 = !DILocation(line: 64, column: 35, scope: !5) +!61 = !DILocation(line: 64, column: 51, scope: !5) +!62 = !DILocation(line: 65, column: 58, scope: !5) +!63 = !DILocation(line: 65, column: 35, scope: !5) +!64 = !DILocation(line: 65, column: 69, scope: !5) +!65 = !DILocation(line: 65, column: 123, scope: !5) +!66 = !DILocation(line: 66, column: 36, scope: !5) +!67 = !DILocation(line: 66, column: 43, scope: !5) +!68 = !DILocation(line: 66, column: 96, scope: !5) +!69 = !DILocation(line: 72, column: 39, scope: !5) +!70 = !DILocation(line: 72, column: 57, scope: !5) +!71 = !DILocation(line: 72, column: 35, scope: !5) +!72 = !DILocation(line: 72, column: 68, scope: !5) +!73 = !DILocation(line: 72, column: 129, scope: !5) +!74 = !DILocation(line: 80, column: 35, scope: !5) +!75 = !DILocation(line: 80, column: 85, scope: !5) +!76 = !DILocation(line: 80, column: 146, scope: !5) +!77 = !DILocation(line: 82, column: 24, scope: !5) +!78 = !DILocation(line: 84, column: 17, scope: !5) +!79 = !DILocation(line: 90, column: 53, scope: !5) +!80 = !DILocation(line: 90, column: 35, scope: !5) +!81 = !DILocation(line: 90, column: 64, scope: !5) +!82 = !DILocation(line: 90, column: 125, scope: !5) +!83 = !DILocation(line: 97, column: 24, scope: !5) +!84 = !DILocation(line: 98, column: 35, scope: !5) +!85 = !DILocation(line: 98, column: 81, scope: !5) +!86 = !DILocation(line: 98, column: 142, scope: !5) +!87 = !DILocation(line: 100, column: 24, scope: !5) +!88 = !DILocation(line: 0, scope: !5) +!89 = !DILocation(line: 111, column: 24, scope: !5) +!90 = !DILocation(line: 113, column: 24, scope: !5) +!91 = !DILocation(line: 116, column: 24, scope: !5) +!92 = !DILocation(line: 118, column: 24, scope: !5) +!93 = !DILocation(line: 119, column: 24, scope: !5) +!94 = !DILocation(line: 121, column: 60, scope: !5) +!95 = !DILocation(line: 121, column: 35, scope: !5) +!96 = !DILocation(line: 121, column: 71, scope: !5) +!97 = !DILocation(line: 121, column: 132, scope: !5) +!98 = !DILocation(line: 127, column: 35, scope: !5) +!99 = !DILocation(line: 127, column: 85, scope: !5) +!100 = !DILocation(line: 127, column: 146, scope: !5) +!101 = !DILocation(line: 129, column: 24, scope: !5) +!102 = !DILocation(line: 131, column: 17, scope: !5) +!103 = !DILocation(line: 134, column: 60, scope: !5) +!104 = !DILocation(line: 134, column: 35, scope: !5) +!105 = !DILocation(line: 134, column: 71, scope: !5) +!106 = !DILocation(line: 134, column: 132, scope: !5) +!107 = !DILocation(line: 139, column: 24, scope: !5) +!108 = !DILocation(line: 140, column: 35, scope: !5) +!109 = !DILocation(line: 140, column: 81, scope: !5) +!110 = !DILocation(line: 140, column: 142, scope: !5) +!111 = !DILocation(line: 142, column: 24, scope: !5) +!112 = !DILocation(line: 151, column: 25, scope: !5) +!113 = !DILocation(line: 153, column: 26, scope: !5) +!114 = !DILocation(line: 156, column: 26, scope: !5) +!115 = !DILocation(line: 158, column: 26, scope: !5) +!116 = !DILocation(line: 159, column: 26, scope: !5) +!117 = !DILocation(line: 161, column: 39, scope: !5) +!118 = !DILocation(line: 161, column: 32, scope: !5) +!119 = !DILocation(line: 161, column: 55, scope: !5) +!120 = !DILocation(line: 162, column: 32, scope: !5) +!121 = !DILocation(line: 162, column: 56, scope: !5) +!122 = !DILocation(line: 53, column: 4, scope: !5) diff --git a/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..79eaf8265f795a788a1d5127bcb7420c106ebde6 --- /dev/null +++ b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx @@ -0,0 +1,1190 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 9.1 +.target sm_89 +.address_size 64 + + // .globl triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 // -- Begin function triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 +.visible .entry triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0( + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6, + .param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_7, + .param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_8, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_9, + .param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_10 +) +.reqntid 256 +{ + .reg .pred %p<6>; + .reg .b16 %rs<42>; + .reg .b32 %r<219>; + .reg .b64 %rd<96>; + .loc 1 18 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0 + +// %bb.0: // %__nv_rsqrtf.exit + ld.param.b64 %rd11, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6]; + ld.param.b64 %rd10, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5]; + ld.param.b64 %rd9, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4]; + ld.param.b64 %rd8, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3]; + ld.param.b64 %rd7, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2]; + ld.param.b64 %rd6, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1]; + ld.param.b64 %rd5, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0]; +$L__tmp0: + .loc 1 23 28 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:28 + mov.u32 %r23, %ctaid.x; + .loc 1 23 33 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:33 + shl.b32 %r24, %r23, 3; + .loc 1 24 44 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44 + mov.u32 %r25, %tid.x; + and.b32 %r26, %r25, 224; + bfe.u32 %r27, %r25, 5, 3; + .loc 1 24 23 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23 + or.b32 %r28, %r27, %r24; + .loc 1 26 37 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37 + shl.b32 %r29, %r25, 1; + and.b32 %r30, %r29, 62; + .loc 1 29 19 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19 + bfe.s32 %r31, %r23, 28, 1; + shr.u32 %r32, %r31, 27; + add.s32 %r33, %r28, %r32; + shr.s32 %r34, %r33, 5; + shl.b32 %r35, %r28, 7; + shl.b32 %r36, %r34, 15; + add.s32 %r1, %r36, %r35; + add.s32 %r2, %r1, 4096; + .loc 1 33 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:33:43 + cvt.u64.u32 %rd1, %r30; + .loc 1 39 57 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:57 + or.b32 %r37, %r2, %r30; + .loc 1 39 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34 + mad.wide.s32 %rd12, %r37, 2, %rd7; + .loc 1 39 68 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68 + // begin inline asm + mov.u64 %rd13, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd13, 1.0; + // end inline asm + mov.b32 %r19, 0; + mov.pred %p2, -1; + // begin inline asm + mov.u32 %r18, %r19; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r18 }, [ %rd12 + 0 ], %rd13; + // end inline asm + mov.b32 {%rs1, %rs2}, %r18; + .loc 1 39 121 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:121 + cvt.f32.bf16 %r38, %rs1; + cvt.f32.bf16 %r39, %rs2; + .loc 1 40 50 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:50 + or.b32 %r40, %r1, %r30; + .loc 1 40 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34 + mad.wide.s32 %rd14, %r40, 2, %rd7; + .loc 1 40 61 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61 + // begin inline asm + mov.u64 %rd15, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd15, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r20, %r19; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r20 }, [ %rd14 + 0 ], %rd15; + // end inline asm + mov.b32 {%rs3, %rs4}, %r20; + .loc 1 40 114 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114 + cvt.f32.bf16 %r41, %rs3; + cvt.f32.bf16 %r42, %rs4; + .loc 1 39 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34 + cvt.s64.s32 %rd20, %r2; + or.b64 %rd21, %rd20, %rd1; + shl.b64 %rd22, %rd21, 1; + add.s64 %rd23, %rd7, %rd22; + add.s64 %rd16, %rd23, 128; + .loc 1 39 68 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68 + // begin inline asm + mov.u64 %rd17, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r21, %r19; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r21 }, [ %rd16 + 0 ], %rd17; + // end inline asm + mov.b32 {%rs5, %rs6}, %r21; + .loc 1 39 121 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:121 + cvt.f32.bf16 %r43, %rs5; + cvt.f32.bf16 %r44, %rs6; + .loc 1 40 34 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34 + cvt.s64.s32 %rd24, %r1; + or.b64 %rd25, %rd24, %rd1; + shl.b64 %rd26, %rd25, 1; + add.s64 %rd27, %rd7, %rd26; + add.s64 %rd18, %rd27, 128; + .loc 1 40 61 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61 + // begin inline asm + mov.u64 %rd19, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd19, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r22, %r19; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r22 }, [ %rd18 + 0 ], %rd19; + // end inline asm + mov.b32 {%rs7, %rs8}, %r22; + .loc 1 40 114 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114 + cvt.f32.bf16 %r45, %rs7; + cvt.f32.bf16 %r46, %rs8; + .loc 1 42 22 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:42:22 + mul.f32 %r47, %r43, %r43; + mul.f32 %r48, %r44, %r44; + .loc 1 44 23 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:44:23 + fma.rn.f32 %r49, %r38, %r38, %r47; + fma.rn.f32 %r50, %r39, %r39, %r48; + .loc 1 47 22 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:47:22 + mul.f32 %r51, %r45, %r45; + mul.f32 %r52, %r46, %r46; + .loc 1 49 25 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:49:25 + fma.rn.f32 %r53, %r41, %r41, %r51; + fma.rn.f32 %r54, %r42, %r42, %r52; + .loc 1 24 44 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44 + and.b32 %r55, %r25, 7; + .loc 1 24 23 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23 + or.b32 %r56, %r24, %r55; + .loc 1 26 37 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37 + and.b32 %r57, %r25, 248; + bfe.u32 %r58, %r25, 3, 5; + .loc 1 29 19 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19 + add.s32 %r59, %r56, %r32; +$L__tmp1: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r60, %r49, %r50; +$L__tmp2: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r61, %r60, 16, 31, -1; +$L__tmp3: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r62, %r60, %r61; +$L__tmp4: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r63, %r62, 8, 31, -1; +$L__tmp5: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r64, %r62, %r63; +$L__tmp6: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r65, %r64, 4, 31, -1; +$L__tmp7: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r66, %r64, %r65; +$L__tmp8: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r67, %r66, 2, 31, -1; +$L__tmp9: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r68, %r66, %r67; +$L__tmp10: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] + shfl.sync.bfly.b32 %r69, %r68, 1, 31, -1; +$L__tmp11: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ] + add.f32 %r70, %r68, %r69; +$L__tmp12: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r71, %r53, %r54; +$L__tmp13: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r72, %r71, 16, 31, -1; +$L__tmp14: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r73, %r71, %r72; +$L__tmp15: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r74, %r73, 8, 31, -1; +$L__tmp16: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r75, %r73, %r74; +$L__tmp17: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r76, %r75, 4, 31, -1; +$L__tmp18: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r77, %r75, %r76; +$L__tmp19: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r78, %r77, 2, 31, -1; +$L__tmp20: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r79, %r77, %r78; +$L__tmp21: + .loc 2 293 36 // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] + shfl.sync.bfly.b32 %r80, %r79, 1, 31, -1; +$L__tmp22: + .loc 2 263 15 // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ] + add.f32 %r81, %r79, %r80; +$L__tmp23: + .loc 1 63 46 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:46 + shl.b32 %r82, %r34, 7; + mov.b32 %r83, 0f43000000; + .loc 1 75 25 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:75:25 + div.full.f32 %r84, %r81, %r83; + .loc 1 77 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:77:24 + add.f32 %r85, %r84, 0f358637BD; + .loc 1 78 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:78:32 + rsqrt.approx.ftz.f32 %r3, %r85; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + shr.u32 %r86, %r26, 3; + mov.b32 %r87, global_smem; + add.s32 %r88, %r87, %r86; + st.shared.b32 [%r88], %r3; + bar.sync 0; + shl.b32 %r89, %r55, 2; + add.s32 %r90, %r87, %r89; + ld.shared.b32 %r4, [%r90]; + .loc 1 123 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:123:24 + div.full.f32 %r91, %r70, %r83; + .loc 1 124 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:124:24 + add.f32 %r92, %r91, 0f358637BD; + .loc 1 125 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:125:32 + rsqrt.approx.ftz.f32 %r5, %r92; + .loc 1 126 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24 + bar.sync 0; + st.shared.b32 [%r88], %r5; + bar.sync 0; + ld.shared.b32 %r6, [%r90]; + bfe.u32 %r7, %r57, 3, 1; + and.b32 %r93, %r58, 30; + and.b32 %r94, %r25, 15; + shl.b32 %r95, %r94, 3; + shl.b32 %r96, %r26, 2; + shr.u32 %r97, %r26, 1; + shr.u32 %r98, %r25, 2; + and.b32 %r99, %r98, 4; + or.b32 %r100, %r95, %r96; + xor.b32 %r101, %r100, %r97; + or.b32 %r102, %r101, %r99; + add.s32 %r8, %r87, %r102; + xor.b32 %r103, %r102, 4; + add.s32 %r9, %r87, %r103; + shl.b32 %r104, %r94, 7; + shl.b32 %r105, %r55, 4; + shr.u32 %r106, %r57, 1; + xor.b32 %r107, %r105, %r106; + or.b32 %r108, %r107, %r104; + add.s32 %r10, %r87, %r108; + xor.b32 %r109, %r108, 4; + add.s32 %r11, %r87, %r109; + shl.b32 %r110, %r56, 7; + shl.b32 %r111, %r59, 10; + and.b32 %r112, %r111, -32768; + add.s32 %r12, %r112, %r110; + add.s32 %r13, %r12, 4097; + add.s32 %r14, %r12, 4096; + shl.b32 %r113, %r55, 7; + shr.u32 %r114, %r25, 1; + and.b32 %r115, %r114, 12; + and.b32 %r116, %r98, 48; + shr.u32 %r117, %r25, 4; + and.b32 %r118, %r117, 2; + or.b32 %r119, %r113, %r118; + or.b32 %r120, %r105, %r115; + xor.b32 %r121, %r120, %r116; + or.b32 %r122, %r121, %r119; + add.s32 %r15, %r87, %r122; + xor.b32 %r123, %r122, 64; + add.s32 %r16, %r87, %r123; + shl.b32 %r124, %r25, 2; + and.b32 %r125, %r124, 1008; + shl.b32 %r126, %r25, 3; + and.b32 %r127, %r126, 8; + and.b32 %r128, %r25, 2; + xor.b32 %r129, %r125, %r97; + add.s32 %r130, %r87, %r127; + add.s32 %r131, %r130, %r128; + add.s32 %r17, %r131, %r129; + .loc 1 53 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43 + cvt.u64.u32 %rd2, %r93; + cvt.s64.s32 %rd3, %r82; + cvt.s64.s32 %rd4, %r35; + mov.b64 %rd95, 0; + mov.pred %p5, %p2; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 0 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0:43 + mov.pred %p1, %p5; + setp.ne.b32 %p4, %r7, 0; + setp.eq.b32 %p3, %r7, 0; + .loc 1 54 31 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:54:31 + or.b64 %rd74, %rd95, %rd1; + .loc 1 72 41 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:41 + or.b64 %rd75, %rd95, %rd2; + .loc 1 61 51 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:51 + cvt.u32.u64 %r142, %rd74; + or.b32 %r143, %r1, %r142; + .loc 1 61 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:35 + mad.wide.s32 %rd29, %r143, 2, %rd7; + .loc 1 61 62 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:62 + // begin inline asm + mov.u64 %rd28, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd28, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r132, %r19; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r132 }, [ %rd29 + 0 ], %rd28; + // end inline asm + mov.b32 {%rs26, %rs27}, %r132; + .loc 1 61 115 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:115 + cvt.f32.bf16 %r144, %rs26; + cvt.f32.bf16 %r145, %rs27; + .loc 1 62 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:35 + shl.b64 %rd76, %rd74, 1; + add.s64 %rd31, %rd8, %rd76; + .loc 1 62 42 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:42 + // begin inline asm + mov.u64 %rd30, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd30, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r133, %r19; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r133 }, [ %rd31 + 0 ], %rd30; + // end inline asm + mov.b32 {%rs28, %rs29}, %r133; + .loc 1 62 95 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:95 + cvt.f32.bf16 %r146, %rs28; + cvt.f32.bf16 %r147, %rs29; + .loc 1 63 42 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:42 + or.b64 %rd77, %rd74, %rd3; + .loc 1 63 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:35 + shl.b64 %rd78, %rd77, 2; + add.s64 %rd33, %rd9, %rd78; + .loc 1 63 51 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:51 + // begin inline asm + mov.u64 %rd32, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd32, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r134, %r19; + mov.u32 %r135, %r19; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r134, %r135 }, [ %rd33 + 0 ], %rd32; + // end inline asm + .loc 1 64 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:35 + add.s64 %rd35, %rd10, %rd78; + .loc 1 64 51 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:51 + // begin inline asm + mov.u64 %rd34, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd34, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r136, %r19; + mov.u32 %r137, %r19; + @%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r136, %r137 }, [ %rd35 + 0 ], %rd34; + // end inline asm + bar.sync 0; + st.shared.b32 [%r8], %r136; + st.shared.b32 [%r9+1024], %r137; + bar.sync 0; + ld.shared.b32 %r148, [%r10]; + ld.shared.b32 %r149, [%r11]; + .loc 1 65 58 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:58 + or.b32 %r150, %r2, %r142; + .loc 1 65 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:35 + mad.wide.s32 %rd37, %r150, 2, %rd7; + .loc 1 65 69 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69 + // begin inline asm + mov.u64 %rd36, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd36, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r138, %r19; + @%p2 ld.global.L1::evict_first.L2::cache_hint.b32 { %r138 }, [ %rd37 + 0 ], %rd36; + // end inline asm + mov.b32 {%rs30, %rs31}, %r138; + .loc 1 65 123 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:123 + cvt.f32.bf16 %r151, %rs30; + cvt.f32.bf16 %r152, %rs31; + .loc 1 66 36 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:36 + add.s64 %rd39, %rd11, %rd76; + .loc 1 66 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:43 + // begin inline asm + mov.u64 %rd38, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd38, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r139, %r19; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r139 }, [ %rd39 + 0 ], %rd38; + // end inline asm + mov.b32 {%rs32, %rs33}, %r139; + .loc 1 66 96 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:96 + cvt.f32.bf16 %r153, %rs32; + cvt.f32.bf16 %r154, %rs33; + .loc 1 72 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35 + cvt.s64.s32 %rd79, %r12; + .loc 1 72 57 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:57 + cvt.u32.u64 %r155, %rd75; + .loc 1 72 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35 + cvt.s64.s32 %rd80, %rd75; + add.s64 %rd81, %rd79, %rd80; + shl.b64 %rd82, %rd81, 1; + add.s64 %rd83, %rd7, %rd82; + add.s64 %rd41, %rd83, 2; + add.s64 %rd43, %rd83, 66; + .loc 1 72 68 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:68 + // begin inline asm + mov.u64 %rd40, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd40, 1.0; + // end inline asm + mov.b16 %rs10, 0; + // begin inline asm + mov.u16 %rs9, %rs10; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd41 + 0 ], %rd40; + // end inline asm + // begin inline asm + mov.u64 %rd42, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd42, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs11, %rs10; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd43 + 0 ], %rd42; + // end inline asm + .loc 1 72 129 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129 + cvt.f32.bf16 %r156, %rs9; + cvt.f32.bf16 %r157, %rs11; + .loc 1 79 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24 + mul.f32 %r158, %r4, %r156; + mul.f32 %r159, %r4, %r157; + .loc 1 80 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:35 + shl.b64 %rd84, %rd75, 1; + add.s64 %rd53, %rd8, %rd84; + add.s64 %rd45, %rd53, 2; + add.s64 %rd47, %rd53, 66; + .loc 1 80 85 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:85 + // begin inline asm + mov.u64 %rd44, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd44, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs12, %rs10; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd45 + 0 ], %rd44; + // end inline asm + // begin inline asm + mov.u64 %rd46, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd46, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs13, %rs10; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd47 + 0 ], %rd46; + // end inline asm + .loc 1 80 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146 + cvt.f32.bf16 %r160, %rs12; + cvt.f32.bf16 %r161, %rs13; + .loc 1 84 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17 + neg.f32 %r162, %r158; + fma.rn.f32 %r163, %r162, %r160, 0f00000000; + neg.f32 %r164, %r159; + fma.rn.f32 %r165, %r164, %r161, 0f00000000; + .loc 1 90 53 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:53 + or.b32 %r166, %r12, %r155; + .loc 1 90 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:35 + mad.wide.s32 %rd49, %r166, 2, %rd7; + add.s64 %rd51, %rd83, 64; + .loc 1 90 64 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:64 + // begin inline asm + mov.u64 %rd48, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd48, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs14, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs14 }, [ %rd49 + 0 ], %rd48; + // end inline asm + // begin inline asm + mov.u64 %rd50, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd50, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs15, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd51 + 0 ], %rd50; + // end inline asm + .loc 1 90 125 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125 + cvt.f32.bf16 %r167, %rs14; + cvt.f32.bf16 %r168, %rs15; + .loc 1 97 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24 + mul.f32 %r169, %r4, %r167; + mul.f32 %r170, %r4, %r168; + .loc 1 98 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:35 + add.s64 %rd55, %rd53, 64; + .loc 1 98 81 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:81 + // begin inline asm + mov.u64 %rd52, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd52, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs16, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs16 }, [ %rd53 + 0 ], %rd52; + // end inline asm + // begin inline asm + mov.u64 %rd54, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd54, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs17, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd55 + 0 ], %rd54; + // end inline asm + .loc 1 98 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142 + cvt.f32.bf16 %r171, %rs16; + cvt.f32.bf16 %r172, %rs17; + .loc 1 100 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24 + mul.f32 %r173, %r169, %r171; + mul.f32 %r174, %r170, %r172; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r175, %r163, %r173, %p3; + selp.f32 %r176, %r165, %r174, %p3; + .loc 1 111 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:111:24 + mul.f32 %r177, %r3, %r144; + mul.f32 %r178, %r3, %r145; + .loc 1 113 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:113:24 + mul.f32 %r179, %r177, %r146; + mul.f32 %r180, %r178, %r147; + .loc 1 116 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:116:24 + mul.f32 %r181, %r179, %r134; + mul.f32 %r182, %r180, %r135; + bar.sync 0; + st.shared.b32 [%r8], %r181; + st.shared.b32 [%r9+1024], %r182; + bar.sync 0; + ld.shared.b32 %r183, [%r10]; + ld.shared.b32 %r184, [%r11]; + .loc 1 119 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24 + fma.rn.f32 %r185, %r148, %r175, %r183; + fma.rn.f32 %r186, %r149, %r176, %r184; + .loc 1 121 60 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:60 + or.b32 %r187, %r13, %r155; + .loc 1 121 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:35 + mad.wide.s32 %rd57, %r187, 2, %rd7; + cvt.s64.s32 %rd85, %r13; + add.s64 %rd86, %rd85, %rd80; + shl.b64 %rd87, %rd86, 1; + add.s64 %rd88, %rd7, %rd87; + add.s64 %rd59, %rd88, 64; + .loc 1 121 71 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:71 + // begin inline asm + mov.u64 %rd56, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd56, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs18, %rs10; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs18 }, [ %rd57 + 0 ], %rd56; + // end inline asm + // begin inline asm + mov.u64 %rd58, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd58, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs19, %rs10; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd59 + 0 ], %rd58; + // end inline asm + .loc 1 121 132 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:132 + cvt.f32.bf16 %r188, %rs18; + cvt.f32.bf16 %r189, %rs19; + .loc 1 126 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24 + mul.f32 %r190, %r6, %r188; + mul.f32 %r191, %r6, %r189; + .loc 1 127 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:35 + add.s64 %rd69, %rd11, %rd84; + add.s64 %rd61, %rd69, 2; + add.s64 %rd63, %rd69, 66; + .loc 1 127 85 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:85 + // begin inline asm + mov.u64 %rd60, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd60, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs20, %rs10; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs20 }, [ %rd61 + 0 ], %rd60; + // end inline asm + // begin inline asm + mov.u64 %rd62, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd62, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs21, %rs10; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs21 }, [ %rd63 + 0 ], %rd62; + // end inline asm + .loc 1 127 146 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:146 + cvt.f32.bf16 %r192, %rs20; + cvt.f32.bf16 %r193, %rs21; + .loc 1 131 17 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:131:17 + neg.f32 %r194, %r190; + fma.rn.f32 %r195, %r194, %r192, 0f00000000; + neg.f32 %r196, %r191; + fma.rn.f32 %r197, %r196, %r193, 0f00000000; + .loc 1 134 60 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:60 + or.b32 %r198, %r14, %r155; + .loc 1 134 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:35 + mad.wide.s32 %rd65, %r198, 2, %rd7; + cvt.s64.s32 %rd89, %r14; + add.s64 %rd90, %rd89, %rd80; + shl.b64 %rd91, %rd90, 1; + add.s64 %rd92, %rd7, %rd91; + add.s64 %rd67, %rd92, 64; + .loc 1 134 71 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:71 + // begin inline asm + mov.u64 %rd64, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd64, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs22, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs22 }, [ %rd65 + 0 ], %rd64; + // end inline asm + // begin inline asm + mov.u64 %rd66, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd66, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs23, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs23 }, [ %rd67 + 0 ], %rd66; + // end inline asm + .loc 1 134 132 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:132 + cvt.f32.bf16 %r199, %rs22; + cvt.f32.bf16 %r200, %rs23; + .loc 1 139 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:139:24 + mul.f32 %r201, %r6, %r199; + mul.f32 %r202, %r6, %r200; + .loc 1 140 35 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:35 + add.s64 %rd71, %rd69, 64; + .loc 1 140 81 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:81 + // begin inline asm + mov.u64 %rd68, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd68, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs24, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs24 }, [ %rd69 + 0 ], %rd68; + // end inline asm + // begin inline asm + mov.u64 %rd70, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd70, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs25, %rs10; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs25 }, [ %rd71 + 0 ], %rd70; + // end inline asm + .loc 1 140 142 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:142 + cvt.f32.bf16 %r203, %rs24; + cvt.f32.bf16 %r204, %rs25; + .loc 1 142 24 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:142:24 + mul.f32 %r205, %r201, %r203; + mul.f32 %r206, %r202, %r204; + .loc 1 0 0 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0 + selp.f32 %r207, %r195, %r205, %p3; + selp.f32 %r208, %r197, %r206, %p3; + .loc 1 151 25 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:151:25 + mul.f32 %r209, %r5, %r151; + mul.f32 %r210, %r5, %r152; + .loc 1 153 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:153:26 + mul.f32 %r211, %r209, %r153; + mul.f32 %r212, %r210, %r154; + .loc 1 156 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26 + mul.f32 %r213, %r211, %r134; + mul.f32 %r214, %r212, %r135; + bar.sync 0; + st.shared.b32 [%r8], %r213; + st.shared.b32 [%r9+1024], %r214; + bar.sync 0; + ld.shared.b32 %r215, [%r10]; + ld.shared.b32 %r216, [%r11]; + .loc 1 159 26 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:159:26 + fma.rn.f32 %r217, %r148, %r207, %r215; + fma.rn.f32 %r218, %r149, %r208, %r216; + .loc 1 161 39 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:39 + or.b64 %rd93, %rd74, %rd4; + .loc 1 161 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:32 + shl.b64 %rd94, %rd93, 1; + add.s64 %rd72, %rd5, %rd94; + .loc 1 161 55 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:55 + cvt.rn.bf16.f32 %rs34, %r185; + cvt.rn.bf16.f32 %rs35, %r186; + bar.sync 0; + st.shared.b16 [%r15], %rs34; + st.shared.b16 [%r16], %rs35; + bar.sync 0; + ld.shared.b16 %rs36, [%r17]; + ld.shared.b16 %rs37, [%r17+4]; + mov.b32 %r140, {%rs36, %rs37}; + // begin inline asm + @%p2 st.global.b32 [ %rd72 + 0 ], { %r140 }; + // end inline asm + .loc 1 162 32 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:32 + add.s64 %rd73, %rd6, %rd94; + .loc 1 162 56 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:56 + cvt.rn.bf16.f32 %rs38, %r217; + cvt.rn.bf16.f32 %rs39, %r218; + bar.sync 0; + st.shared.b16 [%r15], %rs38; + st.shared.b16 [%r16], %rs39; + bar.sync 0; + ld.shared.b16 %rs40, [%r17]; + ld.shared.b16 %rs41, [%r17+4]; + mov.b32 %r141, {%rs40, %rs41}; + // begin inline asm + @%p2 st.global.b32 [ %rd73 + 0 ], { %r141 }; + // end inline asm + mov.b64 %rd95, 64; + mov.pred %p5, 0; + .loc 1 53 43 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43 + @%p1 bra $L__BB0_1; +// %bb.2: + .loc 1 53 4 // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:4 + ret; +$L__tmp24: +$L__func_end0: + // -- End function +} + .file 1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py" + .file 2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 1 // DW_CHILDREN_yes +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 456 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x1c1 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 98 +.b8 118 +.b8 113 +.b8 104 +.b8 106 +.b8 116 +.b8 121 +.b8 103 +.b8 55 +.b8 102 +.b8 118 +.b8 120 +.b8 122 +.b8 119 +.b8 116 +.b8 98 +.b8 116 +.b8 116 +.b8 52 +.b8 118 +.b8 114 +.b8 100 +.b8 107 +.b8 98 +.b8 110 +.b8 98 +.b8 54 +.b8 110 +.b8 51 +.b8 50 +.b8 102 +.b8 110 +.b8 114 +.b8 105 +.b8 106 +.b8 106 +.b8 112 +.b8 108 +.b8 51 +.b8 118 +.b8 118 +.b8 52 +.b8 99 +.b8 102 +.b8 113 +.b8 100 +.b8 52 +.b8 109 +.b8 122 +.b8 110 +.b8 114 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 97 +.b8 112 +.b8 112 +.b8 47 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 114 +.b8 116 +.b8 95 +.b8 108 +.b8 108 +.b8 109 +.b8 47 +.b8 118 +.b8 105 +.b8 115 +.b8 117 +.b8 97 +.b8 108 +.b8 95 +.b8 103 +.b8 101 +.b8 110 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 102 +.b8 108 +.b8 117 +.b8 120 +.b8 50 +.b8 95 +.b8 107 +.b8 108 +.b8 101 +.b8 105 +.b8 110 +.b8 95 +.b8 57 +.b8 98 +.b8 95 +.b8 78 +.b8 86 +.b8 73 +.b8 68 +.b8 73 +.b8 65 +.b8 95 +.b8 71 +.b8 101 +.b8 70 +.b8 111 +.b8 114 +.b8 99 +.b8 101 +.b8 95 +.b8 82 +.b8 84 +.b8 88 +.b8 95 +.b8 52 +.b8 48 +.b8 57 +.b8 48 +.b8 95 +.b8 115 +.b8 109 +.b8 56 +.b8 57 +.b8 95 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 50 +.b8 46 +.b8 49 +.b8 48 +.b8 46 +.b8 48 +.b8 97 +.b8 48 +.b8 95 +.b8 98 +.b8 52 +.b8 101 +.b8 52 +.b8 101 +.b8 101 +.b8 56 +.b8 49 +.b8 100 +.b8 51 +.b8 46 +.b8 110 +.b8 118 +.b8 50 +.b8 53 +.b8 46 +.b8 49 +.b8 50 +.b8 95 +.b8 99 +.b8 117 +.b8 100 +.b8 97 +.b8 49 +.b8 51 +.b8 95 +.b8 49 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 47 +.b8 98 +.b8 118 +.b8 0 +.b8 2 // Abbrev [2] 0xe4:0x6d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 114 +.b8 109 +.b8 115 +.b8 95 +.b8 110 +.b8 111 +.b8 114 +.b8 109 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 100 +.b8 100 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 110 +.b8 101 +.b8 103 +.b8 95 +.b8 115 +.b8 112 +.b8 108 +.b8 105 +.b8 116 +.b8 95 +.b8 115 +.b8 112 +.b8 108 +.b8 105 +.b8 116 +.b8 95 +.b8 119 +.b8 105 +.b8 116 +.b8 104 +.b8 95 +.b8 115 +.b8 105 +.b8 122 +.b8 101 +.b8 115 +.b8 95 +.b8 115 +.b8 116 +.b8 97 +.b8 99 +.b8 107 +.b8 95 +.b8 117 +.b8 110 +.b8 98 +.b8 105 +.b8 110 +.b8 100 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x151:0x7a DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 228 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x166:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 51 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x17e:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 4 // Abbrev [4] 0x198:0x32 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp12 // DW_AT_low_pc +.b64 $L__tmp23 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 52 // DW_AT_call_line +.b8 27 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x1b0:0x19 DW_TAG_inlined_subroutine +.b32 228 // DW_AT_abstract_origin +.b64 $L__tmp12 // DW_AT_low_pc +.b64 $L__tmp23 // DW_AT_high_pc +.b8 2 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 1 +.b8 36 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source new file mode 100644 index 0000000000000000000000000000000000000000..50320303cc25b97f14e5ff354baaa2182359b780 --- /dev/null +++ b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source @@ -0,0 +1,972 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc213 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0) +#loc215 = loc(unknown) +#loc218 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0) +#loc222 = loc("in_out_ptr0"(#loc)) +#loc223 = loc("in_out_ptr1"(#loc)) +#loc224 = loc("in_ptr0"(#loc)) +#loc225 = loc("in_ptr1"(#loc)) +#loc226 = loc("in_ptr2"(#loc)) +#loc227 = loc("in_ptr3"(#loc)) +#loc228 = loc("in_ptr4"(#loc)) +#loc229 = loc("xnumel"(#loc)) +#loc230 = loc("r0_numel"(#loc)) +#loc432 = loc("input"(#loc213)) +#loc433 = loc("a"(#loc218)) +#loc434 = loc("b"(#loc218)) +module { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 73728 : i32 loc(#loc231) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc232) + %xoffset = tt.get_program_id x : i32 loc(#loc233) + %xoffset_2 = arith.constant 8 : i32 loc(#loc234) + %xoffset_3 = arith.constant 8 : i32 loc(#loc234) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc234) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc235) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc236) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<8x1xi32> loc(#loc237) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<8x1xi32> loc(#loc237) + %xmask = arith.constant true loc(#loc238) + %xmask_8 = arith.constant dense : tensor<8x64xi1> loc(#loc238) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc239) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc240) + %x0 = arith.constant 32 : i32 loc(#loc241) + %x0_10 = arith.constant 32 : i32 loc(#loc241) + %x0_11 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc241) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<8x1xi32> loc(#loc241) + %x1 = arith.constant 32 : i32 loc(#loc242) + %x1_13 = arith.constant 32 : i32 loc(#loc242) + %x1_14 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc242) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<8x1xi32> loc(#loc242) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc243) + %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc243) + %_tmp10 = arith.constant 0.000000e+00 : f32 loc(#loc244) + %_tmp10_17 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc244) + %c0_i32 = arith.constant 0 : i32 loc(#loc15) + %c64_i32 = arith.constant 64 : i32 loc(#loc15) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15) + %2 = arith.bitcast %c64_i32 : i32 to i32 loc(#loc15) + %3 = ub.poison : i32 loc(#loc15) + %_tmp10_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_16, %_tmp10_24 = %_tmp10_17) -> (tensor<8x64xf32>, tensor<8x64xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc246) + %r0_index_25 = arith.addi %r0_index, %r0_base_9 : tensor<1x64xi32> loc(#loc246) + %r0_mask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc247) + %r0_mask_26 = arith.cmpi slt, %r0_index_25, %r0_mask : tensor<1x64xi32> loc(#loc247) + %tmp0 = arith.constant 4096 : i32 loc(#loc248) + %tmp0_27 = arith.constant 4096 : i32 loc(#loc248) + %tmp0_28 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc248) + %tmp0_29 = arith.addi %tmp0_28, %r0_index_25 : tensor<1x64xi32> loc(#loc248) + %tmp0_30 = arith.constant 128 : i32 loc(#loc249) + %tmp0_31 = arith.constant 128 : i32 loc(#loc249) + %tmp0_32 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc249) + %tmp0_33 = arith.muli %tmp0_32, %x0_12 : tensor<8x1xi32> loc(#loc249) + %tmp0_34 = tt.broadcast %tmp0_29 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc250) + %tmp0_35 = tt.broadcast %tmp0_33 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc250) + %tmp0_36 = arith.addi %tmp0_34, %tmp0_35 : tensor<8x64xi32> loc(#loc250) + %tmp0_37 = arith.constant 36864 : i32 loc(#loc251) + %tmp0_38 = arith.constant 36864 : i32 loc(#loc251) + %tmp0_39 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc251) + %tmp0_40 = arith.muli %tmp0_39, %x1_15 : tensor<8x1xi32> loc(#loc251) + %tmp0_41 = tt.broadcast %tmp0_40 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc252) + %tmp0_42 = arith.addi %tmp0_36, %tmp0_41 : tensor<8x64xi32> loc(#loc252) + %tmp0_43 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x64x!tt.ptr> loc(#loc253) + %tmp0_44 = tt.addptr %tmp0_43, %tmp0_42 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc253) + %tmp0_45 = arith.constant 0.000000e+00 : f32 loc(#loc254) + %tmp0_46 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc254) + %tmp0_47 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc254) + %tmp0_48 = arith.truncf %tmp0_47 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc254) + %tmp0_49 = tt.load %tmp0_44, %tmp0_46, %tmp0_48 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc254) + %tmp0_50 = arith.extf %tmp0_49 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc255) + %tmp6 = arith.constant 128 : i32 loc(#loc256) + %tmp6_51 = arith.constant 128 : i32 loc(#loc256) + %tmp6_52 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc256) + %tmp6_53 = arith.muli %tmp6_52, %x0_12 : tensor<8x1xi32> loc(#loc256) + %tmp6_54 = tt.broadcast %r0_index_25 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc257) + %tmp6_55 = tt.broadcast %tmp6_53 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc257) + %tmp6_56 = arith.addi %tmp6_54, %tmp6_55 : tensor<8x64xi32> loc(#loc257) + %tmp6_57 = arith.constant 36864 : i32 loc(#loc258) + %tmp6_58 = arith.constant 36864 : i32 loc(#loc258) + %tmp6_59 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc258) + %tmp6_60 = arith.muli %tmp6_59, %x1_15 : tensor<8x1xi32> loc(#loc258) + %tmp6_61 = tt.broadcast %tmp6_60 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc259) + %tmp6_62 = arith.addi %tmp6_56, %tmp6_61 : tensor<8x64xi32> loc(#loc259) + %tmp6_63 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x64x!tt.ptr> loc(#loc260) + %tmp6_64 = tt.addptr %tmp6_63, %tmp6_62 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc260) + %tmp6_65 = arith.constant 0.000000e+00 : f32 loc(#loc261) + %tmp6_66 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc261) + %tmp6_67 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc261) + %tmp6_68 = arith.truncf %tmp6_67 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc261) + %tmp6_69 = tt.load %tmp6_64, %tmp6_66, %tmp6_68 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc261) + %tmp6_70 = arith.extf %tmp6_69 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc262) + %tmp2 = arith.mulf %tmp0_50, %tmp0_50 : tensor<8x64xf32> loc(#loc263) + %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<8x64xf32> loc(#loc264) + %_tmp4_71 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc265) + %_tmp4_72 = arith.select %_tmp4_71, %tmp5, %_tmp4_23 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc265) + %tmp8 = arith.mulf %tmp6_70, %tmp6_70 : tensor<8x64xf32> loc(#loc266) + %tmp11 = arith.addf %_tmp10_24, %tmp8 : tensor<8x64xf32> loc(#loc267) + %_tmp10_73 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc268) + %_tmp10_74 = arith.select %_tmp10_73, %tmp11, %_tmp10_24 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc268) + scf.yield %_tmp4_72, %_tmp10_74 : tensor<8x64xf32>, tensor<8x64xf32> loc(#loc39) + } loc(#loc435) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S8_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#0) : (tensor<8x64xf32>) -> tensor<8xf32> loc(#loc269) + %tmp4_19 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc270) + %tmp10 = tt.call @"triton.language.standard.sum__fp32S8_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#1) : (tensor<8x64xf32>) -> tensor<8xf32> loc(#loc271) + %tmp10_20 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc272) + %c0_i32_21 = arith.constant 0 : i32 loc(#loc44) + %c64_i32_22 = arith.constant 64 : i32 loc(#loc44) + %4 = arith.bitcast %c0_i32_21 : i32 to i32 loc(#loc44) + %5 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc44) + %6 = arith.bitcast %c64_i32_22 : i32 to i32 loc(#loc44) + %7 = ub.poison : i32 loc(#loc44) + scf.for %r0_offset = %4 to %5 step %6 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc273) + %r0_index_23 = arith.addi %r0_index, %r0_base_9 : tensor<1x64xi32> loc(#loc273) + %r0_mask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc274) + %r0_mask_24 = arith.cmpi slt, %r0_index_23, %r0_mask : tensor<1x64xi32> loc(#loc274) + %r0_3 = arith.constant 2 : i32 loc(#loc275) + %r0_3_25 = arith.constant 2 : i32 loc(#loc275) + %r0_3_26 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc275) + %r0_3_27 = arith.remsi %r0_index_23, %r0_3_26 : tensor<1x64xi32> loc(#loc275) + %r0_4 = arith.constant 2 : i32 loc(#loc276) + %r0_4_28 = arith.constant 2 : i32 loc(#loc276) + %r0_4_29 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc276) + %r0_4_30 = arith.divsi %r0_index_23, %r0_4_29 : tensor<1x64xi32> loc(#loc276) + %tmp50 = arith.constant 128 : i32 loc(#loc277) + %tmp50_31 = arith.constant 128 : i32 loc(#loc277) + %tmp50_32 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc277) + %tmp50_33 = arith.muli %tmp50_32, %x0_12 : tensor<8x1xi32> loc(#loc277) + %tmp50_34 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc278) + %tmp50_35 = tt.broadcast %tmp50_33 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc278) + %tmp50_36 = arith.addi %tmp50_34, %tmp50_35 : tensor<8x64xi32> loc(#loc278) + %tmp50_37 = arith.constant 36864 : i32 loc(#loc279) + %tmp50_38 = arith.constant 36864 : i32 loc(#loc279) + %tmp50_39 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc279) + %tmp50_40 = arith.muli %tmp50_39, %x1_15 : tensor<8x1xi32> loc(#loc279) + %tmp50_41 = tt.broadcast %tmp50_40 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc280) + %tmp50_42 = arith.addi %tmp50_36, %tmp50_41 : tensor<8x64xi32> loc(#loc280) + %tmp50_43 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x64x!tt.ptr> loc(#loc281) + %tmp50_44 = tt.addptr %tmp50_43, %tmp50_42 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc281) + %tmp50_45 = arith.constant 0.000000e+00 : f32 loc(#loc282) + %tmp50_46 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc282) + %tmp50_47 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc282) + %tmp50_48 = arith.truncf %tmp50_47 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc282) + %tmp50_49 = tt.load %tmp50_44, %tmp50_46, %tmp50_48 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc282) + %tmp50_50 = arith.extf %tmp50_49 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc283) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc284) + %tmp58_51 = tt.addptr %tmp58, %r0_index_23 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc284) + %tmp58_52 = arith.constant 0.000000e+00 : f32 loc(#loc285) + %tmp58_53 = arith.constant dense<0.000000e+00> : tensor<1x64xf32> loc(#loc285) + %tmp58_54 = arith.truncf %tmp58_53 : tensor<1x64xf32> to tensor<1x64xbf16> loc(#loc285) + %tmp58_55 = tt.load %tmp58_51, %r0_mask_24, %tmp58_54 evictionPolicy = evict_last : tensor<1x64x!tt.ptr> loc(#loc285) + %tmp58_56 = arith.extf %tmp58_55 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc286) + %tmp63 = arith.constant 128 : i32 loc(#loc287) + %tmp63_57 = arith.constant 128 : i32 loc(#loc287) + %tmp63_58 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc287) + %tmp63_59 = arith.muli %tmp63_58, %x1_15 : tensor<8x1xi32> loc(#loc287) + %tmp63_60 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc288) + %tmp63_61 = tt.broadcast %tmp63_59 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc288) + %tmp63_62 = arith.addi %tmp63_60, %tmp63_61 : tensor<8x64xi32> loc(#loc288) + %tmp63_63 = tt.splat %in_ptr2 : !tt.ptr -> tensor<8x64x!tt.ptr> loc(#loc289) + %tmp63_64 = tt.addptr %tmp63_63, %tmp63_62 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc289) + %tmp63_65 = arith.constant 0.000000e+00 : f32 loc(#loc290) + %tmp63_66 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc290) + %tmp63_67 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc290) + %tmp63_68 = tt.load %tmp63_64, %tmp63_66, %tmp63_67 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc290) + %tmp66 = arith.constant 128 : i32 loc(#loc291) + %tmp66_69 = arith.constant 128 : i32 loc(#loc291) + %tmp66_70 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc291) + %tmp66_71 = arith.muli %tmp66_70, %x1_15 : tensor<8x1xi32> loc(#loc291) + %tmp66_72 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc292) + %tmp66_73 = tt.broadcast %tmp66_71 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc292) + %tmp66_74 = arith.addi %tmp66_72, %tmp66_73 : tensor<8x64xi32> loc(#loc292) + %tmp66_75 = tt.splat %in_ptr3 : !tt.ptr -> tensor<8x64x!tt.ptr> loc(#loc293) + %tmp66_76 = tt.addptr %tmp66_75, %tmp66_74 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc293) + %tmp66_77 = arith.constant 0.000000e+00 : f32 loc(#loc294) + %tmp66_78 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc294) + %tmp66_79 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc294) + %tmp66_80 = tt.load %tmp66_76, %tmp66_78, %tmp66_79 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc294) + %tmp96 = arith.constant 4096 : i32 loc(#loc295) + %tmp96_81 = arith.constant 4096 : i32 loc(#loc295) + %tmp96_82 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc295) + %tmp96_83 = arith.addi %tmp96_82, %r0_index_23 : tensor<1x64xi32> loc(#loc295) + %tmp96_84 = arith.constant 128 : i32 loc(#loc296) + %tmp96_85 = arith.constant 128 : i32 loc(#loc296) + %tmp96_86 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc296) + %tmp96_87 = arith.muli %tmp96_86, %x0_12 : tensor<8x1xi32> loc(#loc296) + %tmp96_88 = tt.broadcast %tmp96_83 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc297) + %tmp96_89 = tt.broadcast %tmp96_87 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc297) + %tmp96_90 = arith.addi %tmp96_88, %tmp96_89 : tensor<8x64xi32> loc(#loc297) + %tmp96_91 = arith.constant 36864 : i32 loc(#loc298) + %tmp96_92 = arith.constant 36864 : i32 loc(#loc298) + %tmp96_93 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc298) + %tmp96_94 = arith.muli %tmp96_93, %x1_15 : tensor<8x1xi32> loc(#loc298) + %tmp96_95 = tt.broadcast %tmp96_94 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc299) + %tmp96_96 = arith.addi %tmp96_90, %tmp96_95 : tensor<8x64xi32> loc(#loc299) + %tmp96_97 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x64x!tt.ptr> loc(#loc300) + %tmp96_98 = tt.addptr %tmp96_97, %tmp96_96 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc300) + %tmp96_99 = arith.constant 0.000000e+00 : f32 loc(#loc301) + %tmp96_100 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc301) + %tmp96_101 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc301) + %tmp96_102 = arith.truncf %tmp96_101 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc301) + %tmp96_103 = tt.load %tmp96_98, %tmp96_100, %tmp96_102 evictionPolicy = evict_first : tensor<8x64x!tt.ptr> loc(#loc301) + %tmp96_104 = arith.extf %tmp96_103 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc302) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc303) + %tmp102_105 = tt.addptr %tmp102, %r0_index_23 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc303) + %tmp102_106 = arith.constant 0.000000e+00 : f32 loc(#loc304) + %tmp102_107 = arith.constant dense<0.000000e+00> : tensor<1x64xf32> loc(#loc304) + %tmp102_108 = arith.truncf %tmp102_107 : tensor<1x64xf32> to tensor<1x64xbf16> loc(#loc304) + %tmp102_109 = tt.load %tmp102_105, %r0_mask_24, %tmp102_108 evictionPolicy = evict_last : tensor<1x64x!tt.ptr> loc(#loc304) + %tmp102_110 = arith.extf %tmp102_109 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc305) + %tmp13 = arith.constant 0 : i64 loc(#loc306) + %tmp13_111 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc306) + %tmp14 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc307) + %tmp14_112 = arith.constant dense<0> : tensor<1x64xi64> loc(#loc307) + %tmp14_113 = arith.cmpi sge, %tmp14, %tmp14_112 : tensor<1x64xi64> loc(#loc307) + %tmp15 = arith.constant 1 : i64 loc(#loc308) + %tmp15_114 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc308) + %tmp16 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc309) + %tmp16_115 = arith.constant dense<1> : tensor<1x64xi64> loc(#loc309) + %tmp16_116 = arith.cmpi slt, %tmp16, %tmp16_115 : tensor<1x64xi64> loc(#loc309) + %tmp17 = arith.constant 2 : i32 loc(#loc310) + %tmp17_117 = arith.constant 2 : i32 loc(#loc310) + %tmp17_118 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc310) + %tmp17_119 = arith.muli %tmp17_118, %r0_4_30 : tensor<1x64xi32> loc(#loc310) + %tmp17_120 = arith.constant 1 : i32 loc(#loc311) + %tmp17_121 = arith.constant 1 : i32 loc(#loc311) + %tmp17_122 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc311) + %tmp17_123 = arith.addi %tmp17_122, %tmp17_119 : tensor<1x64xi32> loc(#loc311) + %tmp17_124 = arith.constant 128 : i32 loc(#loc312) + %tmp17_125 = arith.constant 128 : i32 loc(#loc312) + %tmp17_126 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc312) + %tmp17_127 = arith.muli %tmp17_126, %x0_12 : tensor<8x1xi32> loc(#loc312) + %tmp17_128 = tt.broadcast %tmp17_123 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc313) + %tmp17_129 = tt.broadcast %tmp17_127 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc313) + %tmp17_130 = arith.addi %tmp17_128, %tmp17_129 : tensor<8x64xi32> loc(#loc313) + %tmp17_131 = arith.constant 36864 : i32 loc(#loc314) + %tmp17_132 = arith.constant 36864 : i32 loc(#loc314) + %tmp17_133 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc314) + %tmp17_134 = arith.muli %tmp17_133, %x1_15 : tensor<8x1xi32> loc(#loc314) + %tmp17_135 = tt.broadcast %tmp17_134 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc315) + %tmp17_136 = arith.addi %tmp17_130, %tmp17_135 : tensor<8x64xi32> loc(#loc315) + %tmp17_137 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x64x!tt.ptr> loc(#loc316) + %tmp17_138 = tt.addptr %tmp17_137, %tmp17_136 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc316) + %tmp17_139 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc317) + %tmp17_140 = arith.constant 0.000000e+00 : f32 loc(#loc318) + %tmp17_141 = tt.broadcast %tmp17_139 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc318) + %tmp17_142 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc318) + %tmp17_143 = arith.truncf %tmp17_142 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc318) + %tmp17_144 = tt.load %tmp17_138, %tmp17_141, %tmp17_143 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc318) + %tmp17_145 = arith.extf %tmp17_144 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc319) + %tmp19 = arith.constant 1.280000e+02 : f32 loc(#loc320) + %tmp20 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc321) + %tmp20_146 = arith.divf %tmp10_20, %tmp20 : tensor<8x1xf32> loc(#loc321) + %tmp21 = arith.constant 9.99999997E-7 : f32 loc(#loc322) + %tmp22 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc323) + %tmp22_147 = arith.addf %tmp20_146, %tmp22 : tensor<8x1xf32> loc(#loc323) + %tmp23 = tt.extern_elementwise %tmp22_147 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc324) + %tmp24 = tt.broadcast %tmp23 : tensor<8x1xf32> -> tensor<8x64xf32> loc(#loc325) + %tmp24_148 = arith.mulf %tmp17_145, %tmp24 : tensor<8x64xf32> loc(#loc325) + %tmp25 = arith.constant 2 : i32 loc(#loc326) + %tmp25_149 = arith.constant 2 : i32 loc(#loc326) + %tmp25_150 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc326) + %tmp25_151 = arith.muli %tmp25_150, %r0_4_30 : tensor<1x64xi32> loc(#loc326) + %tmp25_152 = arith.constant 1 : i32 loc(#loc327) + %tmp25_153 = arith.constant 1 : i32 loc(#loc327) + %tmp25_154 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc327) + %tmp25_155 = arith.addi %tmp25_154, %tmp25_151 : tensor<1x64xi32> loc(#loc327) + %tmp25_156 = tt.broadcast %tmp25_155 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc328) + %tmp25_157 = tt.splat %in_ptr1 : !tt.ptr -> tensor<8x64x!tt.ptr> loc(#loc329) + %tmp25_158 = tt.addptr %tmp25_157, %tmp25_156 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc329) + %tmp25_159 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc330) + %tmp25_160 = arith.constant 0.000000e+00 : f32 loc(#loc331) + %tmp25_161 = tt.broadcast %tmp25_159 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc331) + %tmp25_162 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc331) + %tmp25_163 = arith.truncf %tmp25_162 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc331) + %tmp25_164 = tt.load %tmp25_158, %tmp25_161, %tmp25_163 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc331) + %tmp25_165 = arith.extf %tmp25_164 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc332) + %tmp27 = arith.mulf %tmp24_148, %tmp25_165 : tensor<8x64xf32> loc(#loc333) + %tmp29 = arith.constant 0.000000e+00 : f32 loc(#loc334) + %tmp29_166 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc334) + %tmp29_167 = arith.subf %tmp29_166, %tmp27 : tensor<8x64xf32> loc(#loc334) + %tmp30 = arith.constant 0.000000e+00 : f32 loc(#loc335) + %tmp30_168 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc335) + %tmp31 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc336) + %tmp31_169 = arith.select %tmp31, %tmp29_167, %tmp30_168 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc336) + %tmp32 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc337) + %tmp32_170 = arith.constant dense<1> : tensor<1x64xi64> loc(#loc337) + %tmp32_171 = arith.cmpi sge, %tmp32, %tmp32_170 : tensor<1x64xi64> loc(#loc337) + %tmp33 = arith.constant 2 : i64 loc(#loc338) + %tmp33_172 = arith.constant dense<2> : tensor<1x1xi64> loc(#loc338) + %tmp34 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc339) + %tmp34_173 = arith.constant dense<2> : tensor<1x64xi64> loc(#loc339) + %tmp34_174 = arith.cmpi slt, %tmp34, %tmp34_173 : tensor<1x64xi64> loc(#loc339) + %tmp35 = arith.constant 2 : i32 loc(#loc340) + %tmp35_175 = arith.constant 2 : i32 loc(#loc340) + %tmp35_176 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc340) + %tmp35_177 = arith.muli %tmp35_176, %r0_4_30 : tensor<1x64xi32> loc(#loc340) + %tmp35_178 = arith.constant 128 : i32 loc(#loc341) + %tmp35_179 = arith.constant 128 : i32 loc(#loc341) + %tmp35_180 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc341) + %tmp35_181 = arith.muli %tmp35_180, %x0_12 : tensor<8x1xi32> loc(#loc341) + %tmp35_182 = tt.broadcast %tmp35_177 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc342) + %tmp35_183 = tt.broadcast %tmp35_181 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc342) + %tmp35_184 = arith.addi %tmp35_182, %tmp35_183 : tensor<8x64xi32> loc(#loc342) + %tmp35_185 = arith.constant 36864 : i32 loc(#loc343) + %tmp35_186 = arith.constant 36864 : i32 loc(#loc343) + %tmp35_187 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc343) + %tmp35_188 = arith.muli %tmp35_187, %x1_15 : tensor<8x1xi32> loc(#loc343) + %tmp35_189 = tt.broadcast %tmp35_188 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc344) + %tmp35_190 = arith.addi %tmp35_184, %tmp35_189 : tensor<8x64xi32> loc(#loc344) + %tmp35_191 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x64x!tt.ptr> loc(#loc345) + %tmp35_192 = tt.addptr %tmp35_191, %tmp35_190 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc345) + %tmp35_193 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc346) + %tmp35_194 = arith.constant 0.000000e+00 : f32 loc(#loc347) + %tmp35_195 = tt.broadcast %tmp35_193 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc347) + %tmp35_196 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc347) + %tmp35_197 = arith.truncf %tmp35_196 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc347) + %tmp35_198 = tt.load %tmp35_192, %tmp35_195, %tmp35_197 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc347) + %tmp35_199 = arith.extf %tmp35_198 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc348) + %tmp37 = arith.constant 1.280000e+02 : f32 loc(#loc349) + %tmp38 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc350) + %tmp38_200 = arith.divf %tmp10_20, %tmp38 : tensor<8x1xf32> loc(#loc350) + %tmp39 = arith.constant 9.99999997E-7 : f32 loc(#loc351) + %tmp40 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc352) + %tmp40_201 = arith.addf %tmp38_200, %tmp40 : tensor<8x1xf32> loc(#loc352) + %tmp41 = tt.extern_elementwise %tmp40_201 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc353) + %tmp42 = tt.broadcast %tmp41 : tensor<8x1xf32> -> tensor<8x64xf32> loc(#loc354) + %tmp42_202 = arith.mulf %tmp35_199, %tmp42 : tensor<8x64xf32> loc(#loc354) + %tmp43 = arith.constant 2 : i32 loc(#loc355) + %tmp43_203 = arith.constant 2 : i32 loc(#loc355) + %tmp43_204 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc355) + %tmp43_205 = arith.muli %tmp43_204, %r0_4_30 : tensor<1x64xi32> loc(#loc355) + %tmp43_206 = tt.broadcast %tmp43_205 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc356) + %tmp43_207 = tt.splat %in_ptr1 : !tt.ptr -> tensor<8x64x!tt.ptr> loc(#loc357) + %tmp43_208 = tt.addptr %tmp43_207, %tmp43_206 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc357) + %tmp43_209 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc358) + %tmp43_210 = arith.constant 0.000000e+00 : f32 loc(#loc359) + %tmp43_211 = tt.broadcast %tmp43_209 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc359) + %tmp43_212 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc359) + %tmp43_213 = arith.truncf %tmp43_212 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc359) + %tmp43_214 = tt.load %tmp43_208, %tmp43_211, %tmp43_213 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc359) + %tmp43_215 = arith.extf %tmp43_214 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc360) + %tmp45 = arith.mulf %tmp42_202, %tmp43_215 : tensor<8x64xf32> loc(#loc361) + %tmp47 = arith.constant 0.000000e+00 : f32 loc(#loc362) + %tmp47_216 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc362) + %tmp48 = tt.broadcast %tmp32_171 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc363) + %tmp48_217 = arith.select %tmp48, %tmp45, %tmp47_216 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc363) + %tmp49 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc364) + %tmp49_218 = arith.select %tmp49, %tmp31_169, %tmp48_217 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc364) + %tmp52 = arith.constant 1.280000e+02 : f32 loc(#loc365) + %tmp53 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc366) + %tmp53_219 = arith.divf %tmp10_20, %tmp53 : tensor<8x1xf32> loc(#loc366) + %tmp54 = arith.constant 9.99999997E-7 : f32 loc(#loc367) + %tmp55 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc368) + %tmp55_220 = arith.addf %tmp53_219, %tmp55 : tensor<8x1xf32> loc(#loc368) + %tmp56 = tt.extern_elementwise %tmp55_220 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc369) + %tmp57 = tt.broadcast %tmp56 : tensor<8x1xf32> -> tensor<8x64xf32> loc(#loc370) + %tmp57_221 = arith.mulf %tmp50_50, %tmp57 : tensor<8x64xf32> loc(#loc370) + %tmp60 = tt.broadcast %tmp58_56 : tensor<1x64xf32> -> tensor<8x64xf32> loc(#loc371) + %tmp60_222 = arith.mulf %tmp57_221, %tmp60 : tensor<8x64xf32> loc(#loc371) + %tmp64 = arith.mulf %tmp60_222, %tmp63_68 : tensor<8x64xf32> loc(#loc372) + %tmp67 = arith.mulf %tmp49_218, %tmp66_80 : tensor<8x64xf32> loc(#loc373) + %tmp68 = arith.addf %tmp64, %tmp67 : tensor<8x64xf32> loc(#loc374) + %tmp70 = arith.constant 2 : i32 loc(#loc375) + %tmp70_223 = arith.constant 2 : i32 loc(#loc375) + %tmp70_224 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc375) + %tmp70_225 = arith.muli %tmp70_224, %r0_4_30 : tensor<1x64xi32> loc(#loc375) + %tmp70_226 = arith.constant 4097 : i32 loc(#loc376) + %tmp70_227 = arith.constant 4097 : i32 loc(#loc376) + %tmp70_228 = arith.constant dense<4097> : tensor<1x64xi32> loc(#loc376) + %tmp70_229 = arith.addi %tmp70_228, %tmp70_225 : tensor<1x64xi32> loc(#loc376) + %tmp70_230 = arith.constant 128 : i32 loc(#loc377) + %tmp70_231 = arith.constant 128 : i32 loc(#loc377) + %tmp70_232 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc377) + %tmp70_233 = arith.muli %tmp70_232, %x0_12 : tensor<8x1xi32> loc(#loc377) + %tmp70_234 = tt.broadcast %tmp70_229 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc378) + %tmp70_235 = tt.broadcast %tmp70_233 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc378) + %tmp70_236 = arith.addi %tmp70_234, %tmp70_235 : tensor<8x64xi32> loc(#loc378) + %tmp70_237 = arith.constant 36864 : i32 loc(#loc379) + %tmp70_238 = arith.constant 36864 : i32 loc(#loc379) + %tmp70_239 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc379) + %tmp70_240 = arith.muli %tmp70_239, %x1_15 : tensor<8x1xi32> loc(#loc379) + %tmp70_241 = tt.broadcast %tmp70_240 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc380) + %tmp70_242 = arith.addi %tmp70_236, %tmp70_241 : tensor<8x64xi32> loc(#loc380) + %tmp70_243 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x64x!tt.ptr> loc(#loc381) + %tmp70_244 = tt.addptr %tmp70_243, %tmp70_242 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc381) + %tmp70_245 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc382) + %tmp70_246 = arith.constant 0.000000e+00 : f32 loc(#loc383) + %tmp70_247 = tt.broadcast %tmp70_245 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc383) + %tmp70_248 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc383) + %tmp70_249 = arith.truncf %tmp70_248 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc383) + %tmp70_250 = tt.load %tmp70_244, %tmp70_247, %tmp70_249 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc383) + %tmp70_251 = arith.extf %tmp70_250 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc384) + %tmp72 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc385) + %tmp72_252 = arith.divf %tmp4_19, %tmp72 : tensor<8x1xf32> loc(#loc385) + %tmp73 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc386) + %tmp73_253 = arith.addf %tmp72_252, %tmp73 : tensor<8x1xf32> loc(#loc386) + %tmp74 = tt.extern_elementwise %tmp73_253 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc387) + %tmp75 = tt.broadcast %tmp74 : tensor<8x1xf32> -> tensor<8x64xf32> loc(#loc388) + %tmp75_254 = arith.mulf %tmp70_251, %tmp75 : tensor<8x64xf32> loc(#loc388) + %tmp76 = arith.constant 2 : i32 loc(#loc389) + %tmp76_255 = arith.constant 2 : i32 loc(#loc389) + %tmp76_256 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc389) + %tmp76_257 = arith.muli %tmp76_256, %r0_4_30 : tensor<1x64xi32> loc(#loc389) + %tmp76_258 = arith.constant 1 : i32 loc(#loc390) + %tmp76_259 = arith.constant 1 : i32 loc(#loc390) + %tmp76_260 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc390) + %tmp76_261 = arith.addi %tmp76_260, %tmp76_257 : tensor<1x64xi32> loc(#loc390) + %tmp76_262 = tt.broadcast %tmp76_261 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc391) + %tmp76_263 = tt.splat %in_ptr4 : !tt.ptr -> tensor<8x64x!tt.ptr> loc(#loc392) + %tmp76_264 = tt.addptr %tmp76_263, %tmp76_262 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc392) + %tmp76_265 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc393) + %tmp76_266 = arith.constant 0.000000e+00 : f32 loc(#loc394) + %tmp76_267 = tt.broadcast %tmp76_265 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc394) + %tmp76_268 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc394) + %tmp76_269 = arith.truncf %tmp76_268 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc394) + %tmp76_270 = tt.load %tmp76_264, %tmp76_267, %tmp76_269 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc394) + %tmp76_271 = arith.extf %tmp76_270 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc395) + %tmp78 = arith.mulf %tmp75_254, %tmp76_271 : tensor<8x64xf32> loc(#loc396) + %tmp80 = arith.constant 0.000000e+00 : f32 loc(#loc397) + %tmp80_272 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc397) + %tmp80_273 = arith.subf %tmp80_272, %tmp78 : tensor<8x64xf32> loc(#loc397) + %tmp81 = arith.constant 0.000000e+00 : f32 loc(#loc398) + %tmp81_274 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc398) + %tmp82 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc399) + %tmp82_275 = arith.select %tmp82, %tmp80_273, %tmp81_274 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc399) + %tmp83 = arith.constant 2 : i32 loc(#loc400) + %tmp83_276 = arith.constant 2 : i32 loc(#loc400) + %tmp83_277 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc400) + %tmp83_278 = arith.muli %tmp83_277, %r0_4_30 : tensor<1x64xi32> loc(#loc400) + %tmp83_279 = arith.constant 4096 : i32 loc(#loc401) + %tmp83_280 = arith.constant 4096 : i32 loc(#loc401) + %tmp83_281 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc401) + %tmp83_282 = arith.addi %tmp83_281, %tmp83_278 : tensor<1x64xi32> loc(#loc401) + %tmp83_283 = arith.constant 128 : i32 loc(#loc402) + %tmp83_284 = arith.constant 128 : i32 loc(#loc402) + %tmp83_285 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc402) + %tmp83_286 = arith.muli %tmp83_285, %x0_12 : tensor<8x1xi32> loc(#loc402) + %tmp83_287 = tt.broadcast %tmp83_282 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc403) + %tmp83_288 = tt.broadcast %tmp83_286 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc403) + %tmp83_289 = arith.addi %tmp83_287, %tmp83_288 : tensor<8x64xi32> loc(#loc403) + %tmp83_290 = arith.constant 36864 : i32 loc(#loc404) + %tmp83_291 = arith.constant 36864 : i32 loc(#loc404) + %tmp83_292 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc404) + %tmp83_293 = arith.muli %tmp83_292, %x1_15 : tensor<8x1xi32> loc(#loc404) + %tmp83_294 = tt.broadcast %tmp83_293 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc405) + %tmp83_295 = arith.addi %tmp83_289, %tmp83_294 : tensor<8x64xi32> loc(#loc405) + %tmp83_296 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x64x!tt.ptr> loc(#loc406) + %tmp83_297 = tt.addptr %tmp83_296, %tmp83_295 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc406) + %tmp83_298 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc407) + %tmp83_299 = arith.constant 0.000000e+00 : f32 loc(#loc408) + %tmp83_300 = tt.broadcast %tmp83_298 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc408) + %tmp83_301 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc408) + %tmp83_302 = arith.truncf %tmp83_301 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc408) + %tmp83_303 = tt.load %tmp83_297, %tmp83_300, %tmp83_302 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc408) + %tmp83_304 = arith.extf %tmp83_303 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc409) + %tmp85 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc410) + %tmp85_305 = arith.divf %tmp4_19, %tmp85 : tensor<8x1xf32> loc(#loc410) + %tmp86 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc411) + %tmp86_306 = arith.addf %tmp85_305, %tmp86 : tensor<8x1xf32> loc(#loc411) + %tmp87 = tt.extern_elementwise %tmp86_306 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc412) + %tmp88 = tt.broadcast %tmp87 : tensor<8x1xf32> -> tensor<8x64xf32> loc(#loc413) + %tmp88_307 = arith.mulf %tmp83_304, %tmp88 : tensor<8x64xf32> loc(#loc413) + %tmp89 = arith.constant 2 : i32 loc(#loc414) + %tmp89_308 = arith.constant 2 : i32 loc(#loc414) + %tmp89_309 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc414) + %tmp89_310 = arith.muli %tmp89_309, %r0_4_30 : tensor<1x64xi32> loc(#loc414) + %tmp89_311 = tt.broadcast %tmp89_310 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc415) + %tmp89_312 = tt.splat %in_ptr4 : !tt.ptr -> tensor<8x64x!tt.ptr> loc(#loc416) + %tmp89_313 = tt.addptr %tmp89_312, %tmp89_311 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc416) + %tmp89_314 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc417) + %tmp89_315 = arith.constant 0.000000e+00 : f32 loc(#loc418) + %tmp89_316 = tt.broadcast %tmp89_314 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc418) + %tmp89_317 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc418) + %tmp89_318 = arith.truncf %tmp89_317 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc418) + %tmp89_319 = tt.load %tmp89_313, %tmp89_316, %tmp89_318 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc418) + %tmp89_320 = arith.extf %tmp89_319 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc419) + %tmp91 = arith.mulf %tmp88_307, %tmp89_320 : tensor<8x64xf32> loc(#loc420) + %tmp93 = arith.constant 0.000000e+00 : f32 loc(#loc421) + %tmp93_321 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc421) + %tmp94 = tt.broadcast %tmp32_171 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc422) + %tmp94_322 = arith.select %tmp94, %tmp91, %tmp93_321 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc422) + %tmp95 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc423) + %tmp95_323 = arith.select %tmp95, %tmp82_275, %tmp94_322 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc423) + %tmp98 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc424) + %tmp98_324 = arith.divf %tmp4_19, %tmp98 : tensor<8x1xf32> loc(#loc424) + %tmp99 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc425) + %tmp99_325 = arith.addf %tmp98_324, %tmp99 : tensor<8x1xf32> loc(#loc425) + %tmp100 = tt.extern_elementwise %tmp99_325 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc426) + %tmp101 = tt.broadcast %tmp100 : tensor<8x1xf32> -> tensor<8x64xf32> loc(#loc427) + %tmp101_326 = arith.mulf %tmp96_104, %tmp101 : tensor<8x64xf32> loc(#loc427) + %tmp104 = tt.broadcast %tmp102_110 : tensor<1x64xf32> -> tensor<8x64xf32> loc(#loc428) + %tmp104_327 = arith.mulf %tmp101_326, %tmp104 : tensor<8x64xf32> loc(#loc428) + %tmp107 = arith.mulf %tmp104_327, %tmp63_68 : tensor<8x64xf32> loc(#loc429) + %tmp109 = arith.mulf %tmp95_323, %tmp66_80 : tensor<8x64xf32> loc(#loc430) + %tmp110 = arith.addf %tmp107, %tmp109 : tensor<8x64xf32> loc(#loc431) + %c128_i32 = arith.constant 128 : i32 loc(#loc204) + %c128_i32_328 = arith.constant 128 : i32 loc(#loc204) + %cst = arith.constant dense<128> : tensor<8x1xi32> loc(#loc204) + %8 = arith.muli %cst, %xindex_7 : tensor<8x1xi32> loc(#loc204) + %9 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc205) + %10 = tt.broadcast %8 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc205) + %11 = arith.addi %9, %10 : tensor<8x64xi32> loc(#loc205) + %12 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<8x64x!tt.ptr> loc(#loc206) + %13 = tt.addptr %12, %11 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc206) + %14 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc207) + %15 = arith.truncf %tmp68 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc207) + tt.store %13, %15, %14 : tensor<8x64x!tt.ptr> loc(#loc207) + %c128_i32_329 = arith.constant 128 : i32 loc(#loc208) + %c128_i32_330 = arith.constant 128 : i32 loc(#loc208) + %cst_331 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc208) + %16 = arith.muli %cst_331, %xindex_7 : tensor<8x1xi32> loc(#loc208) + %17 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc209) + %18 = tt.broadcast %16 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc209) + %19 = arith.addi %17, %18 : tensor<8x64xi32> loc(#loc209) + %20 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<8x64x!tt.ptr> loc(#loc210) + %21 = tt.addptr %20, %19 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc210) + %22 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc211) + %23 = arith.truncf %tmp110 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc211) + tt.store %21, %23, %22 : tensor<8x64x!tt.ptr> loc(#loc211) + } loc(#loc44) + tt.return loc(#loc212) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S8_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x64xf32> loc("input"(#loc213))) -> tensor<8xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc214) + tt.reduce.return %2 : f32 loc(#loc214) + }) : (tensor<8x64xf32>) -> tensor<8xf32> loc(#loc214) + tt.return %0 : tensor<8xf32> loc(#loc216) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8xf32> loc(#loc217) + tt.return %1 : tensor<8xf32> loc(#loc217) + } loc(#loc213) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc218)), %b: f32 loc("b"(#loc218))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc219) + tt.return %0 : f32 loc(#loc220) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc221) + tt.return %1 : f32 loc(#loc221) + } loc(#loc218) +} loc(#loc) +#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":19:13) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":20:15) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":25:46) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":30:43) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":32:44) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:45) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:56) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:46) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:42) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:53) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:64) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":68:35) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":69:25) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":70:35) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:52) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:63) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":74:16) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":76:16) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:57) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:55) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:63) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:95) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":85:42) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":88:35) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":89:24) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:37) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:48) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:59) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":92:16) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":93:25) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":94:16) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":95:24) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":96:32) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:53) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:59) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:91) +#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":102:42) +#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":106:16) +#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":107:25) +#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":108:16) +#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":109:24) +#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":110:32) +#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:44) +#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc149 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:55) +#loc150 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc151 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:66) +#loc152 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc153 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc154 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:81) +#loc155 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc156 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc157 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc158 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc159 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc160 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc161 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:57) +#loc162 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:55) +#loc163 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:63) +#loc164 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc165 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:95) +#loc166 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc167 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc168 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc169 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc170 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":132:42) +#loc171 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc172 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:44) +#loc173 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc174 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:55) +#loc175 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc176 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:66) +#loc177 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc178 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc179 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:81) +#loc180 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc181 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc182 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":136:24) +#loc183 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":137:24) +#loc184 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":138:32) +#loc185 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc186 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:53) +#loc187 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:59) +#loc188 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc189 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:91) +#loc190 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc191 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc192 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc193 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":144:42) +#loc194 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc195 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc196 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":148:24) +#loc197 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":149:24) +#loc198 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":150:33) +#loc199 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc200 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc201 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc202 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc203 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc204 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc205 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc206 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc207 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc208 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:43) +#loc209 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:39) +#loc210 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc211 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc212 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc214 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc216 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11) +#loc217 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4) +#loc219 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc220 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11) +#loc221 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4) +#loc231 = loc("xnumel"(#loc1)) +#loc232 = loc("r0_numel"(#loc2)) +#loc233 = loc("xoffset"(#loc3)) +#loc234 = loc("xoffset"(#loc4)) +#loc235 = loc("xindex"(#loc5)) +#loc236 = loc("xindex"(#loc6)) +#loc237 = loc("xindex"(#loc7)) +#loc238 = loc("xmask"(#loc8)) +#loc239 = loc("r0_base"(#loc9)) +#loc240 = loc("r0_base"(#loc10)) +#loc241 = loc("x0"(#loc11)) +#loc242 = loc("x1"(#loc12)) +#loc243 = loc("_tmp4"(#loc13)) +#loc244 = loc("_tmp10"(#loc14)) +#loc245 = loc("_tmp4"(#loc15)) +#loc246 = loc("r0_index"(#loc16)) +#loc247 = loc("r0_mask"(#loc17)) +#loc248 = loc("tmp0"(#loc18)) +#loc249 = loc("tmp0"(#loc19)) +#loc250 = loc("tmp0"(#loc20)) +#loc251 = loc("tmp0"(#loc21)) +#loc252 = loc("tmp0"(#loc22)) +#loc253 = loc("tmp0"(#loc23)) +#loc254 = loc("tmp0"(#loc24)) +#loc255 = loc("tmp0"(#loc25)) +#loc256 = loc("tmp6"(#loc26)) +#loc257 = loc("tmp6"(#loc27)) +#loc258 = loc("tmp6"(#loc28)) +#loc259 = loc("tmp6"(#loc29)) +#loc260 = loc("tmp6"(#loc30)) +#loc261 = loc("tmp6"(#loc31)) +#loc262 = loc("tmp6"(#loc32)) +#loc263 = loc("tmp2"(#loc33)) +#loc264 = loc("tmp5"(#loc34)) +#loc265 = loc("_tmp4"(#loc35)) +#loc266 = loc("tmp8"(#loc36)) +#loc267 = loc("tmp11"(#loc37)) +#loc268 = loc("_tmp10"(#loc38)) +#loc269 = loc("tmp4"(#loc40)) +#loc270 = loc("tmp4"(#loc41)) +#loc271 = loc("tmp10"(#loc42)) +#loc272 = loc("tmp10"(#loc43)) +#loc273 = loc("r0_index"(#loc45)) +#loc274 = loc("r0_mask"(#loc46)) +#loc275 = loc("r0_3"(#loc47)) +#loc276 = loc("r0_4"(#loc48)) +#loc277 = loc("tmp50"(#loc49)) +#loc278 = loc("tmp50"(#loc50)) +#loc279 = loc("tmp50"(#loc51)) +#loc280 = loc("tmp50"(#loc52)) +#loc281 = loc("tmp50"(#loc53)) +#loc282 = loc("tmp50"(#loc54)) +#loc283 = loc("tmp50"(#loc55)) +#loc284 = loc("tmp58"(#loc56)) +#loc285 = loc("tmp58"(#loc57)) +#loc286 = loc("tmp58"(#loc58)) +#loc287 = loc("tmp63"(#loc59)) +#loc288 = loc("tmp63"(#loc60)) +#loc289 = loc("tmp63"(#loc61)) +#loc290 = loc("tmp63"(#loc62)) +#loc291 = loc("tmp66"(#loc63)) +#loc292 = loc("tmp66"(#loc64)) +#loc293 = loc("tmp66"(#loc65)) +#loc294 = loc("tmp66"(#loc66)) +#loc295 = loc("tmp96"(#loc67)) +#loc296 = loc("tmp96"(#loc68)) +#loc297 = loc("tmp96"(#loc69)) +#loc298 = loc("tmp96"(#loc70)) +#loc299 = loc("tmp96"(#loc71)) +#loc300 = loc("tmp96"(#loc72)) +#loc301 = loc("tmp96"(#loc73)) +#loc302 = loc("tmp96"(#loc74)) +#loc303 = loc("tmp102"(#loc75)) +#loc304 = loc("tmp102"(#loc76)) +#loc305 = loc("tmp102"(#loc77)) +#loc306 = loc("tmp13"(#loc78)) +#loc307 = loc("tmp14"(#loc79)) +#loc308 = loc("tmp15"(#loc80)) +#loc309 = loc("tmp16"(#loc81)) +#loc310 = loc("tmp17"(#loc82)) +#loc311 = loc("tmp17"(#loc83)) +#loc312 = loc("tmp17"(#loc84)) +#loc313 = loc("tmp17"(#loc85)) +#loc314 = loc("tmp17"(#loc86)) +#loc315 = loc("tmp17"(#loc87)) +#loc316 = loc("tmp17"(#loc88)) +#loc317 = loc("tmp17"(#loc89)) +#loc318 = loc("tmp17"(#loc90)) +#loc319 = loc("tmp17"(#loc91)) +#loc320 = loc("tmp19"(#loc92)) +#loc321 = loc("tmp20"(#loc93)) +#loc322 = loc("tmp21"(#loc94)) +#loc323 = loc("tmp22"(#loc95)) +#loc324 = loc("tmp23"(#loc96)) +#loc325 = loc("tmp24"(#loc97)) +#loc326 = loc("tmp25"(#loc98)) +#loc327 = loc("tmp25"(#loc99)) +#loc328 = loc("tmp25"(#loc100)) +#loc329 = loc("tmp25"(#loc101)) +#loc330 = loc("tmp25"(#loc102)) +#loc331 = loc("tmp25"(#loc103)) +#loc332 = loc("tmp25"(#loc104)) +#loc333 = loc("tmp27"(#loc105)) +#loc334 = loc("tmp29"(#loc106)) +#loc335 = loc("tmp30"(#loc107)) +#loc336 = loc("tmp31"(#loc108)) +#loc337 = loc("tmp32"(#loc109)) +#loc338 = loc("tmp33"(#loc110)) +#loc339 = loc("tmp34"(#loc111)) +#loc340 = loc("tmp35"(#loc112)) +#loc341 = loc("tmp35"(#loc113)) +#loc342 = loc("tmp35"(#loc114)) +#loc343 = loc("tmp35"(#loc115)) +#loc344 = loc("tmp35"(#loc116)) +#loc345 = loc("tmp35"(#loc117)) +#loc346 = loc("tmp35"(#loc118)) +#loc347 = loc("tmp35"(#loc119)) +#loc348 = loc("tmp35"(#loc120)) +#loc349 = loc("tmp37"(#loc121)) +#loc350 = loc("tmp38"(#loc122)) +#loc351 = loc("tmp39"(#loc123)) +#loc352 = loc("tmp40"(#loc124)) +#loc353 = loc("tmp41"(#loc125)) +#loc354 = loc("tmp42"(#loc126)) +#loc355 = loc("tmp43"(#loc127)) +#loc356 = loc("tmp43"(#loc128)) +#loc357 = loc("tmp43"(#loc129)) +#loc358 = loc("tmp43"(#loc130)) +#loc359 = loc("tmp43"(#loc131)) +#loc360 = loc("tmp43"(#loc132)) +#loc361 = loc("tmp45"(#loc133)) +#loc362 = loc("tmp47"(#loc134)) +#loc363 = loc("tmp48"(#loc135)) +#loc364 = loc("tmp49"(#loc136)) +#loc365 = loc("tmp52"(#loc137)) +#loc366 = loc("tmp53"(#loc138)) +#loc367 = loc("tmp54"(#loc139)) +#loc368 = loc("tmp55"(#loc140)) +#loc369 = loc("tmp56"(#loc141)) +#loc370 = loc("tmp57"(#loc142)) +#loc371 = loc("tmp60"(#loc143)) +#loc372 = loc("tmp64"(#loc144)) +#loc373 = loc("tmp67"(#loc145)) +#loc374 = loc("tmp68"(#loc146)) +#loc375 = loc("tmp70"(#loc147)) +#loc376 = loc("tmp70"(#loc148)) +#loc377 = loc("tmp70"(#loc149)) +#loc378 = loc("tmp70"(#loc150)) +#loc379 = loc("tmp70"(#loc151)) +#loc380 = loc("tmp70"(#loc152)) +#loc381 = loc("tmp70"(#loc153)) +#loc382 = loc("tmp70"(#loc154)) +#loc383 = loc("tmp70"(#loc155)) +#loc384 = loc("tmp70"(#loc156)) +#loc385 = loc("tmp72"(#loc157)) +#loc386 = loc("tmp73"(#loc158)) +#loc387 = loc("tmp74"(#loc159)) +#loc388 = loc("tmp75"(#loc160)) +#loc389 = loc("tmp76"(#loc161)) +#loc390 = loc("tmp76"(#loc162)) +#loc391 = loc("tmp76"(#loc163)) +#loc392 = loc("tmp76"(#loc164)) +#loc393 = loc("tmp76"(#loc165)) +#loc394 = loc("tmp76"(#loc166)) +#loc395 = loc("tmp76"(#loc167)) +#loc396 = loc("tmp78"(#loc168)) +#loc397 = loc("tmp80"(#loc169)) +#loc398 = loc("tmp81"(#loc170)) +#loc399 = loc("tmp82"(#loc171)) +#loc400 = loc("tmp83"(#loc172)) +#loc401 = loc("tmp83"(#loc173)) +#loc402 = loc("tmp83"(#loc174)) +#loc403 = loc("tmp83"(#loc175)) +#loc404 = loc("tmp83"(#loc176)) +#loc405 = loc("tmp83"(#loc177)) +#loc406 = loc("tmp83"(#loc178)) +#loc407 = loc("tmp83"(#loc179)) +#loc408 = loc("tmp83"(#loc180)) +#loc409 = loc("tmp83"(#loc181)) +#loc410 = loc("tmp85"(#loc182)) +#loc411 = loc("tmp86"(#loc183)) +#loc412 = loc("tmp87"(#loc184)) +#loc413 = loc("tmp88"(#loc185)) +#loc414 = loc("tmp89"(#loc186)) +#loc415 = loc("tmp89"(#loc187)) +#loc416 = loc("tmp89"(#loc188)) +#loc417 = loc("tmp89"(#loc189)) +#loc418 = loc("tmp89"(#loc190)) +#loc419 = loc("tmp89"(#loc191)) +#loc420 = loc("tmp91"(#loc192)) +#loc421 = loc("tmp93"(#loc193)) +#loc422 = loc("tmp94"(#loc194)) +#loc423 = loc("tmp95"(#loc195)) +#loc424 = loc("tmp98"(#loc196)) +#loc425 = loc("tmp99"(#loc197)) +#loc426 = loc("tmp100"(#loc198)) +#loc427 = loc("tmp101"(#loc199)) +#loc428 = loc("tmp104"(#loc200)) +#loc429 = loc("tmp107"(#loc201)) +#loc430 = loc("tmp109"(#loc202)) +#loc431 = loc("tmp110"(#loc203)) +#loc435 = loc("_tmp10"(#loc245)) diff --git a/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..66ef1eff47bfa03d4850a8b2681875bd307846e6 --- /dev/null +++ b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir @@ -0,0 +1,547 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 8], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}> +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc1 = loc(unknown) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc147 = loc("in_out_ptr0"(#loc)) +#loc148 = loc("in_out_ptr1"(#loc)) +#loc149 = loc("in_ptr0"(#loc)) +#loc150 = loc("in_ptr1"(#loc)) +#loc151 = loc("in_ptr2"(#loc)) +#loc152 = loc("in_ptr3"(#loc)) +#loc153 = loc("in_ptr4"(#loc)) +#loc154 = loc("xnumel"(#loc)) +#loc155 = loc("r0_numel"(#loc)) +#loc185 = loc("tmp4"(#loc33)) +#loc187 = loc("tmp10"(#loc36)) +#loc292 = loc(callsite(#loc1 at #loc185)) +#loc294 = loc(callsite(#loc1 at #loc187)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4097> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x64xbf16, #blocked1> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<1> : tensor<1x64xi64, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<2> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<36864> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<36864> : tensor<8x1xi32, #blocked1> loc(#loc1) + %cst_6 = arith.constant dense<128> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<128> : tensor<8x1xi32, #blocked1> loc(#loc1) + %cst_8 = arith.constant dense<4096> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_9 = arith.constant dense<4096> : tensor<1x64xi32, #blocked1> loc(#loc1) + %cst_10 = arith.constant dense<128> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_11 = arith.constant dense<128> : tensor<1x64xi32, #blocked1> loc(#loc1) + %cst_12 = arith.constant dense<32> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_13 = arith.constant dense<32> : tensor<8x1xi32, #blocked1> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %cst_14 = arith.constant dense<0.000000e+00> : tensor<8x64xbf16, #blocked1> loc(#loc1) + %cst_15 = arith.constant dense<0.000000e+00> : tensor<8x64xbf16, #blocked> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_16 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32, #blocked1> loc(#loc1) + %cst_17 = arith.constant dense<1.280000e+02> : tensor<8x1xf32, #blocked1> loc(#loc1) + %cst_18 = arith.constant dense<0.000000e+00> : tensor<8x64xf32, #blocked> loc(#loc1) + %cst_19 = arith.constant dense<0.000000e+00> : tensor<8x64xf32, #blocked1> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc156) + %xoffset_20 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc157) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc158) + %xindex_21 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc158) + %xindex_22 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xi32, #blocked1> loc(#loc158) + %xindex_23 = tt.expand_dims %xindex_21 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc158) + %xindex_24 = tt.splat %xoffset_20 : i32 -> tensor<8x1xi32, #blocked1> loc(#loc159) + %xindex_25 = tt.splat %xoffset_20 : i32 -> tensor<8x1xi32, #blocked> loc(#loc159) + %xindex_26 = arith.addi %xindex_24, %xindex_22 : tensor<8x1xi32, #blocked1> loc(#loc159) + %xindex_27 = arith.addi %xindex_25, %xindex_23 : tensor<8x1xi32, #blocked> loc(#loc159) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc160) + %r0_base_28 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc160) + %r0_base_29 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc160) + %r0_base_30 = tt.expand_dims %r0_base_28 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> loc(#loc160) + %x0 = arith.remsi %xindex_26, %cst_13 : tensor<8x1xi32, #blocked1> loc(#loc161) + %x0_31 = arith.remsi %xindex_27, %cst_12 : tensor<8x1xi32, #blocked> loc(#loc161) + %x1 = arith.divsi %xindex_26, %cst_13 : tensor<8x1xi32, #blocked1> loc(#loc162) + %x1_32 = arith.divsi %xindex_27, %cst_12 : tensor<8x1xi32, #blocked> loc(#loc162) + %tmp0 = arith.muli %x0, %cst_7 : tensor<8x1xi32, #blocked1> loc(#loc163) + %tmp0_33 = tt.broadcast %tmp0 : tensor<8x1xi32, #blocked1> -> tensor<8x64xi32, #blocked1> loc(#loc164) + %tmp0_34 = arith.muli %x1, %cst_5 : tensor<8x1xi32, #blocked1> loc(#loc165) + %tmp0_35 = tt.broadcast %tmp0_34 : tensor<8x1xi32, #blocked1> -> tensor<8x64xi32, #blocked1> loc(#loc166) + %tmp0_36 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x64x!tt.ptr, #blocked1> loc(#loc167) + %_tmp10:2 = scf.for %_tmp10_51 = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%arg10 = %cst_19, %arg11 = %cst_19) -> (tensor<8x64xf32, #blocked1>, tensor<8x64xf32, #blocked1>) : i32 { + %r0_index = tt.splat %_tmp10_51 : i32 -> tensor<1x64xi32, #blocked1> loc(#loc169) + %r0_index_52 = arith.addi %r0_index, %r0_base_29 : tensor<1x64xi32, #blocked1> loc(#loc169) + %r0_mask = arith.cmpi slt, %r0_index_52, %cst_11 : tensor<1x64xi32, #blocked1> loc(#loc170) + %tmp0_53 = arith.addi %r0_index_52, %cst_9 : tensor<1x64xi32, #blocked1> loc(#loc171) + %tmp0_54 = tt.broadcast %tmp0_53 : tensor<1x64xi32, #blocked1> -> tensor<8x64xi32, #blocked1> loc(#loc164) + %tmp0_55 = arith.addi %tmp0_54, %tmp0_33 : tensor<8x64xi32, #blocked1> loc(#loc164) + %tmp0_56 = arith.addi %tmp0_55, %tmp0_35 : tensor<8x64xi32, #blocked1> loc(#loc166) + %tmp0_57 = tt.addptr %tmp0_36, %tmp0_56 : tensor<8x64x!tt.ptr, #blocked1>, tensor<8x64xi32, #blocked1> loc(#loc167) + %tmp0_58 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked1> -> tensor<8x64xi1, #blocked1> loc(#loc172) + %tmp0_59 = tt.load %tmp0_57, %tmp0_58, %cst_14 evictionPolicy = evict_last : tensor<8x64x!tt.ptr, #blocked1> loc(#loc172) + %tmp0_60 = arith.extf %tmp0_59 : tensor<8x64xbf16, #blocked1> to tensor<8x64xf32, #blocked1> loc(#loc173) + %tmp6 = tt.broadcast %r0_index_52 : tensor<1x64xi32, #blocked1> -> tensor<8x64xi32, #blocked1> loc(#loc174) + %tmp6_61 = arith.addi %tmp6, %tmp0_33 : tensor<8x64xi32, #blocked1> loc(#loc174) + %tmp6_62 = arith.addi %tmp6_61, %tmp0_35 : tensor<8x64xi32, #blocked1> loc(#loc175) + %tmp6_63 = tt.addptr %tmp0_36, %tmp6_62 : tensor<8x64x!tt.ptr, #blocked1>, tensor<8x64xi32, #blocked1> loc(#loc176) + %tmp6_64 = tt.load %tmp6_63, %tmp0_58, %cst_14 evictionPolicy = evict_last : tensor<8x64x!tt.ptr, #blocked1> loc(#loc177) + %tmp6_65 = arith.extf %tmp6_64 : tensor<8x64xbf16, #blocked1> to tensor<8x64xf32, #blocked1> loc(#loc178) + %tmp2 = arith.mulf %tmp0_60, %tmp0_60 : tensor<8x64xf32, #blocked1> loc(#loc179) + %tmp5 = arith.addf %arg10, %tmp2 : tensor<8x64xf32, #blocked1> loc(#loc180) + %_tmp4 = arith.select %tmp0_58, %tmp5, %arg10 : tensor<8x64xi1, #blocked1>, tensor<8x64xf32, #blocked1> loc(#loc181) + %tmp8 = arith.mulf %tmp6_65, %tmp6_65 : tensor<8x64xf32, #blocked1> loc(#loc182) + %tmp11 = arith.addf %arg11, %tmp8 : tensor<8x64xf32, #blocked1> loc(#loc183) + %_tmp10_66 = arith.select %tmp0_58, %tmp11, %arg11 : tensor<8x64xi1, #blocked1>, tensor<8x64xf32, #blocked1> loc(#loc184) + scf.yield %_tmp4, %_tmp10_66 : tensor<8x64xf32, #blocked1>, tensor<8x64xf32, #blocked1> loc(#loc31) + } loc(#loc290) + %tmp4 = "tt.reduce"(%_tmp10#0) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_51: f32 loc(callsite(#loc1 at #loc185)), %tmp4_52: f32 loc(callsite(#loc1 at #loc185))): + %tmp4_53 = arith.addf %tmp4_51, %tmp4_52 : f32 loc(#loc297) + tt.reduce.return %tmp4_53 : f32 loc(#loc291) + }) : (tensor<8x64xf32, #blocked1>) -> tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc291) + %tmp4_37 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xf32, #blocked1> loc(#loc186) + %tmp10 = "tt.reduce"(%_tmp10#1) <{axis = 1 : i32}> ({ + ^bb0(%tmp10_51: f32 loc(callsite(#loc1 at #loc187)), %tmp10_52: f32 loc(callsite(#loc1 at #loc187))): + %tmp10_53 = arith.addf %tmp10_51, %tmp10_52 : f32 loc(#loc298) + tt.reduce.return %tmp10_53 : f32 loc(#loc293) + }) : (tensor<8x64xf32, #blocked1>) -> tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc293) + %tmp10_38 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xf32, #blocked1> loc(#loc188) + %tmp50 = arith.muli %x0_31, %cst_6 : tensor<8x1xi32, #blocked> loc(#loc189) + %tmp50_39 = tt.broadcast %tmp50 : tensor<8x1xi32, #blocked> -> tensor<8x64xi32, #blocked> loc(#loc190) + %tmp50_40 = arith.muli %x1_32, %cst_4 : tensor<8x1xi32, #blocked> loc(#loc191) + %tmp50_41 = tt.broadcast %tmp50_40 : tensor<8x1xi32, #blocked> -> tensor<8x64xi32, #blocked> loc(#loc192) + %tmp50_42 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x64x!tt.ptr, #blocked> loc(#loc193) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked> loc(#loc194) + %tmp58_43 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked1> loc(#loc194) + %tmp63 = arith.muli %x1, %cst_7 : tensor<8x1xi32, #blocked1> loc(#loc195) + %tmp63_44 = tt.broadcast %tmp63 : tensor<8x1xi32, #blocked1> -> tensor<8x64xi32, #blocked1> loc(#loc196) + %tmp63_45 = tt.splat %in_ptr2 : !tt.ptr -> tensor<8x64x!tt.ptr, #blocked1> loc(#loc197) + %tmp66 = tt.splat %in_ptr3 : !tt.ptr -> tensor<8x64x!tt.ptr, #blocked1> loc(#loc198) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked> loc(#loc199) + %tmp102_46 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked1> loc(#loc199) + %tmp20 = arith.divf %tmp10_38, %cst_17 : tensor<8x1xf32, #blocked1> loc(#loc200) + %tmp22 = arith.addf %tmp20, %cst_16 : tensor<8x1xf32, #blocked1> loc(#loc201) + %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32, #blocked1>) -> tensor<8x1xf32, #blocked1> loc(#loc202) + %tmp24 = ttg.convert_layout %tmp23 : tensor<8x1xf32, #blocked1> -> tensor<8x1xf32, #blocked> loc(#loc203) + %tmp24_47 = tt.broadcast %tmp24 : tensor<8x1xf32, #blocked> -> tensor<8x64xf32, #blocked> loc(#loc203) + %tmp24_48 = tt.broadcast %tmp23 : tensor<8x1xf32, #blocked1> -> tensor<8x64xf32, #blocked1> loc(#loc203) + %tmp72 = arith.divf %tmp4_37, %cst_17 : tensor<8x1xf32, #blocked1> loc(#loc204) + %tmp73 = arith.addf %tmp72, %cst_16 : tensor<8x1xf32, #blocked1> loc(#loc205) + %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32, #blocked1>) -> tensor<8x1xf32, #blocked1> loc(#loc206) + %tmp75 = ttg.convert_layout %tmp74 : tensor<8x1xf32, #blocked1> -> tensor<8x1xf32, #blocked> loc(#loc207) + %tmp75_49 = tt.broadcast %tmp75 : tensor<8x1xf32, #blocked> -> tensor<8x64xf32, #blocked> loc(#loc207) + %tmp75_50 = tt.broadcast %tmp74 : tensor<8x1xf32, #blocked1> -> tensor<8x64xf32, #blocked1> loc(#loc207) + %0 = arith.muli %xindex_26, %cst_7 : tensor<8x1xi32, #blocked1> loc(#loc57) + %1 = tt.broadcast %0 : tensor<8x1xi32, #blocked1> -> tensor<8x64xi32, #blocked1> loc(#loc58) + %2 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<8x64x!tt.ptr, #blocked1> loc(#loc59) + %3 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<8x64x!tt.ptr, #blocked1> loc(#loc60) + scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32, #blocked1> loc(#loc208) + %r0_index_51 = tt.splat %r0_offset : i32 -> tensor<1x64xi32, #blocked> loc(#loc208) + %r0_index_52 = arith.addi %r0_index, %r0_base_29 : tensor<1x64xi32, #blocked1> loc(#loc208) + %r0_index_53 = arith.addi %r0_index_51, %r0_base_30 : tensor<1x64xi32, #blocked> loc(#loc208) + %r0_mask = arith.cmpi slt, %r0_index_52, %cst_11 : tensor<1x64xi32, #blocked1> loc(#loc209) + %r0_mask_54 = arith.cmpi slt, %r0_index_53, %cst_10 : tensor<1x64xi32, #blocked> loc(#loc209) + %r0_3 = arith.remsi %r0_index_53, %cst_3 : tensor<1x64xi32, #blocked> loc(#loc210) + %r0_4 = arith.divsi %r0_index_53, %cst_3 : tensor<1x64xi32, #blocked> loc(#loc211) + %tmp50_55 = tt.broadcast %r0_index_52 : tensor<1x64xi32, #blocked1> -> tensor<8x64xi32, #blocked1> loc(#loc190) + %tmp50_56 = arith.addi %tmp50_55, %tmp0_33 : tensor<8x64xi32, #blocked1> loc(#loc190) + %tmp50_57 = arith.addi %tmp50_56, %tmp0_35 : tensor<8x64xi32, #blocked1> loc(#loc192) + %tmp50_58 = tt.addptr %tmp0_36, %tmp50_57 : tensor<8x64x!tt.ptr, #blocked1>, tensor<8x64xi32, #blocked1> loc(#loc193) + %tmp50_59 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked1> -> tensor<8x64xi1, #blocked1> loc(#loc212) + %tmp50_60 = tt.load %tmp50_58, %tmp50_59, %cst_14 evictionPolicy = evict_last : tensor<8x64x!tt.ptr, #blocked1> loc(#loc212) + %tmp50_61 = arith.extf %tmp50_60 : tensor<8x64xbf16, #blocked1> to tensor<8x64xf32, #blocked1> loc(#loc213) + %tmp58_62 = tt.addptr %tmp58_43, %r0_index_52 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc194) + %tmp58_63 = tt.load %tmp58_62, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x64x!tt.ptr, #blocked1> loc(#loc214) + %tmp58_64 = arith.extf %tmp58_63 : tensor<1x64xbf16, #blocked1> to tensor<1x64xf32, #blocked1> loc(#loc215) + %tmp63_65 = arith.addi %tmp50_55, %tmp63_44 : tensor<8x64xi32, #blocked1> loc(#loc196) + %tmp63_66 = tt.addptr %tmp63_45, %tmp63_65 : tensor<8x64x!tt.ptr, #blocked1>, tensor<8x64xi32, #blocked1> loc(#loc197) + %tmp63_67 = tt.load %tmp63_66, %tmp50_59, %cst_19 evictionPolicy = evict_last : tensor<8x64x!tt.ptr, #blocked1> loc(#loc216) + %tmp66_68 = tt.addptr %tmp66, %tmp63_65 : tensor<8x64x!tt.ptr, #blocked1>, tensor<8x64xi32, #blocked1> loc(#loc198) + %tmp66_69 = tt.load %tmp66_68, %tmp50_59, %cst_19 evictionPolicy = evict_last : tensor<8x64x!tt.ptr, #blocked1> loc(#loc217) + %tmp66_70 = ttg.convert_layout %tmp66_69 : tensor<8x64xf32, #blocked1> -> tensor<8x64xf32, #blocked> loc(#loc217) + %tmp96 = arith.addi %r0_index_52, %cst_9 : tensor<1x64xi32, #blocked1> loc(#loc218) + %tmp96_71 = tt.broadcast %tmp96 : tensor<1x64xi32, #blocked1> -> tensor<8x64xi32, #blocked1> loc(#loc219) + %tmp96_72 = arith.addi %tmp96_71, %tmp0_33 : tensor<8x64xi32, #blocked1> loc(#loc219) + %tmp96_73 = arith.addi %tmp96_72, %tmp0_35 : tensor<8x64xi32, #blocked1> loc(#loc220) + %tmp96_74 = tt.addptr %tmp0_36, %tmp96_73 : tensor<8x64x!tt.ptr, #blocked1>, tensor<8x64xi32, #blocked1> loc(#loc221) + %tmp96_75 = tt.load %tmp96_74, %tmp50_59, %cst_14 evictionPolicy = evict_first : tensor<8x64x!tt.ptr, #blocked1> loc(#loc222) + %tmp96_76 = arith.extf %tmp96_75 : tensor<8x64xbf16, #blocked1> to tensor<8x64xf32, #blocked1> loc(#loc223) + %tmp102_77 = tt.addptr %tmp102_46, %r0_index_52 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc199) + %tmp102_78 = tt.load %tmp102_77, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x64x!tt.ptr, #blocked1> loc(#loc224) + %tmp102_79 = arith.extf %tmp102_78 : tensor<1x64xbf16, #blocked1> to tensor<1x64xf32, #blocked1> loc(#loc225) + %tmp16 = arith.extsi %r0_3 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked> loc(#loc226) + %tmp16_80 = arith.cmpi slt, %tmp16, %cst_2 : tensor<1x64xi64, #blocked> loc(#loc226) + %tmp17 = arith.muli %r0_4, %cst_3 : tensor<1x64xi32, #blocked> loc(#loc227) + %tmp17_81 = arith.addi %tmp17, %cst_1 : tensor<1x64xi32, #blocked> loc(#loc228) + %tmp17_82 = tt.broadcast %tmp17_81 : tensor<1x64xi32, #blocked> -> tensor<8x64xi32, #blocked> loc(#loc229) + %tmp17_83 = arith.addi %tmp17_82, %tmp50_39 : tensor<8x64xi32, #blocked> loc(#loc229) + %tmp17_84 = arith.addi %tmp17_83, %tmp50_41 : tensor<8x64xi32, #blocked> loc(#loc230) + %tmp17_85 = tt.addptr %tmp50_42, %tmp17_84 : tensor<8x64x!tt.ptr, #blocked>, tensor<8x64xi32, #blocked> loc(#loc231) + %tmp17_86 = arith.andi %r0_mask_54, %tmp16_80 : tensor<1x64xi1, #blocked> loc(#loc232) + %tmp17_87 = tt.broadcast %tmp17_86 : tensor<1x64xi1, #blocked> -> tensor<8x64xi1, #blocked> loc(#loc233) + %tmp17_88 = tt.load %tmp17_85, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<8x64x!tt.ptr, #blocked> loc(#loc233) + %tmp17_89 = arith.extf %tmp17_88 : tensor<8x64xbf16, #blocked> to tensor<8x64xf32, #blocked> loc(#loc234) + %tmp24_90 = arith.mulf %tmp17_89, %tmp24_47 : tensor<8x64xf32, #blocked> loc(#loc203) + %tmp25 = tt.addptr %tmp58, %tmp17_81 : tensor<1x64x!tt.ptr, #blocked>, tensor<1x64xi32, #blocked> loc(#loc235) + %tmp25_91 = tt.broadcast %tmp25 : tensor<1x64x!tt.ptr, #blocked> -> tensor<8x64x!tt.ptr, #blocked> loc(#loc235) + %tmp25_92 = tt.load %tmp25_91, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<8x64x!tt.ptr, #blocked> loc(#loc236) + %tmp25_93 = arith.extf %tmp25_92 : tensor<8x64xbf16, #blocked> to tensor<8x64xf32, #blocked> loc(#loc237) + %tmp27 = arith.mulf %tmp24_90, %tmp25_93 : tensor<8x64xf32, #blocked> loc(#loc238) + %tmp29 = arith.subf %cst_18, %tmp27 : tensor<8x64xf32, #blocked> loc(#loc239) + %tmp31 = tt.broadcast %tmp16_80 : tensor<1x64xi1, #blocked> -> tensor<8x64xi1, #blocked> loc(#loc240) + %tmp32 = arith.cmpi sge, %tmp16, %cst_2 : tensor<1x64xi64, #blocked> loc(#loc241) + %tmp35 = tt.broadcast %tmp17 : tensor<1x64xi32, #blocked> -> tensor<8x64xi32, #blocked> loc(#loc242) + %tmp35_94 = arith.addi %tmp35, %tmp50_39 : tensor<8x64xi32, #blocked> loc(#loc242) + %tmp35_95 = arith.addi %tmp35_94, %tmp50_41 : tensor<8x64xi32, #blocked> loc(#loc243) + %tmp35_96 = tt.addptr %tmp50_42, %tmp35_95 : tensor<8x64x!tt.ptr, #blocked>, tensor<8x64xi32, #blocked> loc(#loc244) + %tmp35_97 = arith.andi %r0_mask_54, %tmp32 : tensor<1x64xi1, #blocked> loc(#loc245) + %tmp35_98 = tt.broadcast %tmp35_97 : tensor<1x64xi1, #blocked> -> tensor<8x64xi1, #blocked> loc(#loc246) + %tmp35_99 = tt.load %tmp35_96, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<8x64x!tt.ptr, #blocked> loc(#loc246) + %tmp35_100 = arith.extf %tmp35_99 : tensor<8x64xbf16, #blocked> to tensor<8x64xf32, #blocked> loc(#loc247) + %tmp42 = arith.mulf %tmp35_100, %tmp24_47 : tensor<8x64xf32, #blocked> loc(#loc248) + %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x64x!tt.ptr, #blocked>, tensor<1x64xi32, #blocked> loc(#loc249) + %tmp43_101 = tt.broadcast %tmp43 : tensor<1x64x!tt.ptr, #blocked> -> tensor<8x64x!tt.ptr, #blocked> loc(#loc249) + %tmp43_102 = tt.load %tmp43_101, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<8x64x!tt.ptr, #blocked> loc(#loc250) + %tmp43_103 = arith.extf %tmp43_102 : tensor<8x64xbf16, #blocked> to tensor<8x64xf32, #blocked> loc(#loc251) + %tmp45 = arith.mulf %tmp42, %tmp43_103 : tensor<8x64xf32, #blocked> loc(#loc252) + %tmp48 = tt.broadcast %tmp32 : tensor<1x64xi1, #blocked> -> tensor<8x64xi1, #blocked> loc(#loc253) + %tmp48_104 = arith.select %tmp48, %tmp45, %cst_18 : tensor<8x64xi1, #blocked>, tensor<8x64xf32, #blocked> loc(#loc253) + %tmp49 = arith.select %tmp31, %tmp29, %tmp48_104 : tensor<8x64xi1, #blocked>, tensor<8x64xf32, #blocked> loc(#loc295) + %tmp57 = arith.mulf %tmp50_61, %tmp24_48 : tensor<8x64xf32, #blocked1> loc(#loc255) + %tmp60 = tt.broadcast %tmp58_64 : tensor<1x64xf32, #blocked1> -> tensor<8x64xf32, #blocked1> loc(#loc256) + %tmp60_105 = arith.mulf %tmp57, %tmp60 : tensor<8x64xf32, #blocked1> loc(#loc256) + %tmp64 = arith.mulf %tmp60_105, %tmp63_67 : tensor<8x64xf32, #blocked1> loc(#loc257) + %tmp64_106 = ttg.convert_layout %tmp64 : tensor<8x64xf32, #blocked1> -> tensor<8x64xf32, #blocked> loc(#loc257) + %tmp67 = arith.mulf %tmp49, %tmp66_70 : tensor<8x64xf32, #blocked> loc(#loc258) + %tmp68 = arith.addf %tmp64_106, %tmp67 : tensor<8x64xf32, #blocked> loc(#loc259) + %tmp70 = arith.addi %tmp17, %cst : tensor<1x64xi32, #blocked> loc(#loc260) + %tmp70_107 = tt.broadcast %tmp70 : tensor<1x64xi32, #blocked> -> tensor<8x64xi32, #blocked> loc(#loc261) + %tmp70_108 = arith.addi %tmp70_107, %tmp50_39 : tensor<8x64xi32, #blocked> loc(#loc261) + %tmp70_109 = arith.addi %tmp70_108, %tmp50_41 : tensor<8x64xi32, #blocked> loc(#loc262) + %tmp70_110 = tt.addptr %tmp50_42, %tmp70_109 : tensor<8x64x!tt.ptr, #blocked>, tensor<8x64xi32, #blocked> loc(#loc263) + %tmp70_111 = tt.load %tmp70_110, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<8x64x!tt.ptr, #blocked> loc(#loc264) + %tmp70_112 = arith.extf %tmp70_111 : tensor<8x64xbf16, #blocked> to tensor<8x64xf32, #blocked> loc(#loc265) + %tmp75_113 = arith.mulf %tmp70_112, %tmp75_49 : tensor<8x64xf32, #blocked> loc(#loc207) + %tmp76 = tt.addptr %tmp102, %tmp17_81 : tensor<1x64x!tt.ptr, #blocked>, tensor<1x64xi32, #blocked> loc(#loc266) + %tmp76_114 = tt.broadcast %tmp76 : tensor<1x64x!tt.ptr, #blocked> -> tensor<8x64x!tt.ptr, #blocked> loc(#loc266) + %tmp76_115 = tt.load %tmp76_114, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<8x64x!tt.ptr, #blocked> loc(#loc267) + %tmp76_116 = arith.extf %tmp76_115 : tensor<8x64xbf16, #blocked> to tensor<8x64xf32, #blocked> loc(#loc268) + %tmp78 = arith.mulf %tmp75_113, %tmp76_116 : tensor<8x64xf32, #blocked> loc(#loc269) + %tmp80 = arith.subf %cst_18, %tmp78 : tensor<8x64xf32, #blocked> loc(#loc270) + %tmp83 = arith.addi %tmp17, %cst_8 : tensor<1x64xi32, #blocked> loc(#loc271) + %tmp83_117 = tt.broadcast %tmp83 : tensor<1x64xi32, #blocked> -> tensor<8x64xi32, #blocked> loc(#loc272) + %tmp83_118 = arith.addi %tmp83_117, %tmp50_39 : tensor<8x64xi32, #blocked> loc(#loc272) + %tmp83_119 = arith.addi %tmp83_118, %tmp50_41 : tensor<8x64xi32, #blocked> loc(#loc273) + %tmp83_120 = tt.addptr %tmp50_42, %tmp83_119 : tensor<8x64x!tt.ptr, #blocked>, tensor<8x64xi32, #blocked> loc(#loc274) + %tmp83_121 = tt.load %tmp83_120, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<8x64x!tt.ptr, #blocked> loc(#loc275) + %tmp83_122 = arith.extf %tmp83_121 : tensor<8x64xbf16, #blocked> to tensor<8x64xf32, #blocked> loc(#loc276) + %tmp88 = arith.mulf %tmp83_122, %tmp75_49 : tensor<8x64xf32, #blocked> loc(#loc277) + %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x64x!tt.ptr, #blocked>, tensor<1x64xi32, #blocked> loc(#loc278) + %tmp89_123 = tt.broadcast %tmp89 : tensor<1x64x!tt.ptr, #blocked> -> tensor<8x64x!tt.ptr, #blocked> loc(#loc278) + %tmp89_124 = tt.load %tmp89_123, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<8x64x!tt.ptr, #blocked> loc(#loc279) + %tmp89_125 = arith.extf %tmp89_124 : tensor<8x64xbf16, #blocked> to tensor<8x64xf32, #blocked> loc(#loc280) + %tmp91 = arith.mulf %tmp88, %tmp89_125 : tensor<8x64xf32, #blocked> loc(#loc281) + %tmp94 = arith.select %tmp48, %tmp91, %cst_18 : tensor<8x64xi1, #blocked>, tensor<8x64xf32, #blocked> loc(#loc282) + %tmp95 = arith.select %tmp31, %tmp80, %tmp94 : tensor<8x64xi1, #blocked>, tensor<8x64xf32, #blocked> loc(#loc296) + %tmp101 = arith.mulf %tmp96_76, %tmp75_50 : tensor<8x64xf32, #blocked1> loc(#loc285) + %tmp104 = tt.broadcast %tmp102_79 : tensor<1x64xf32, #blocked1> -> tensor<8x64xf32, #blocked1> loc(#loc286) + %tmp104_126 = arith.mulf %tmp101, %tmp104 : tensor<8x64xf32, #blocked1> loc(#loc286) + %tmp107 = arith.mulf %tmp104_126, %tmp63_67 : tensor<8x64xf32, #blocked1> loc(#loc287) + %tmp107_127 = ttg.convert_layout %tmp107 : tensor<8x64xf32, #blocked1> -> tensor<8x64xf32, #blocked> loc(#loc287) + %tmp109 = arith.mulf %tmp95, %tmp66_70 : tensor<8x64xf32, #blocked> loc(#loc288) + %tmp110 = arith.addf %tmp107_127, %tmp109 : tensor<8x64xf32, #blocked> loc(#loc289) + %4 = arith.addi %tmp50_55, %1 : tensor<8x64xi32, #blocked1> loc(#loc58) + %5 = tt.addptr %2, %4 : tensor<8x64x!tt.ptr, #blocked1>, tensor<8x64xi32, #blocked1> loc(#loc59) + %6 = arith.truncf %tmp68 : tensor<8x64xf32, #blocked> to tensor<8x64xbf16, #blocked> loc(#loc144) + %7 = ttg.convert_layout %6 : tensor<8x64xbf16, #blocked> -> tensor<8x64xbf16, #blocked1> loc(#loc144) + tt.store %5, %7, %tmp50_59 : tensor<8x64x!tt.ptr, #blocked1> loc(#loc144) + %8 = tt.addptr %3, %4 : tensor<8x64x!tt.ptr, #blocked1>, tensor<8x64xi32, #blocked1> loc(#loc60) + %9 = arith.truncf %tmp110 : tensor<8x64xf32, #blocked> to tensor<8x64xbf16, #blocked> loc(#loc145) + %10 = ttg.convert_layout %9 : tensor<8x64xbf16, #blocked> -> tensor<8x64xbf16, #blocked1> loc(#loc145) + tt.store %8, %10, %tmp50_59 : tensor<8x64x!tt.ptr, #blocked1> loc(#loc145) + } loc(#loc61) + tt.return loc(#loc146) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8) +#loc32 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc156 = loc("xoffset"(#loc2)) +#loc157 = loc("xoffset"(#loc3)) +#loc158 = loc("xindex"(#loc4)) +#loc159 = loc("xindex"(#loc5)) +#loc160 = loc("r0_base"(#loc6)) +#loc161 = loc("x0"(#loc7)) +#loc162 = loc("x1"(#loc8)) +#loc163 = loc("tmp0"(#loc9)) +#loc164 = loc("tmp0"(#loc10)) +#loc165 = loc("tmp0"(#loc11)) +#loc166 = loc("tmp0"(#loc12)) +#loc167 = loc("tmp0"(#loc13)) +#loc168 = loc("_tmp4"(#loc14)) +#loc169 = loc("r0_index"(#loc15)) +#loc170 = loc("r0_mask"(#loc16)) +#loc171 = loc("tmp0"(#loc17)) +#loc172 = loc("tmp0"(#loc18)) +#loc173 = loc("tmp0"(#loc19)) +#loc174 = loc("tmp6"(#loc20)) +#loc175 = loc("tmp6"(#loc21)) +#loc176 = loc("tmp6"(#loc22)) +#loc177 = loc("tmp6"(#loc23)) +#loc178 = loc("tmp6"(#loc24)) +#loc179 = loc("tmp2"(#loc25)) +#loc180 = loc("tmp5"(#loc26)) +#loc181 = loc("_tmp4"(#loc27)) +#loc182 = loc("tmp8"(#loc28)) +#loc183 = loc("tmp11"(#loc29)) +#loc184 = loc("_tmp10"(#loc30)) +#loc186 = loc("tmp4"(#loc35)) +#loc188 = loc("tmp10"(#loc37)) +#loc189 = loc("tmp50"(#loc38)) +#loc190 = loc("tmp50"(#loc39)) +#loc191 = loc("tmp50"(#loc40)) +#loc192 = loc("tmp50"(#loc41)) +#loc193 = loc("tmp50"(#loc42)) +#loc194 = loc("tmp58"(#loc43)) +#loc195 = loc("tmp63"(#loc44)) +#loc196 = loc("tmp63"(#loc45)) +#loc197 = loc("tmp63"(#loc46)) +#loc198 = loc("tmp66"(#loc47)) +#loc199 = loc("tmp102"(#loc48)) +#loc200 = loc("tmp20"(#loc49)) +#loc201 = loc("tmp22"(#loc50)) +#loc202 = loc("tmp23"(#loc51)) +#loc203 = loc("tmp24"(#loc52)) +#loc204 = loc("tmp72"(#loc53)) +#loc205 = loc("tmp73"(#loc54)) +#loc206 = loc("tmp74"(#loc55)) +#loc207 = loc("tmp75"(#loc56)) +#loc208 = loc("r0_index"(#loc62)) +#loc209 = loc("r0_mask"(#loc63)) +#loc210 = loc("r0_3"(#loc64)) +#loc211 = loc("r0_4"(#loc65)) +#loc212 = loc("tmp50"(#loc66)) +#loc213 = loc("tmp50"(#loc67)) +#loc214 = loc("tmp58"(#loc68)) +#loc215 = loc("tmp58"(#loc69)) +#loc216 = loc("tmp63"(#loc70)) +#loc217 = loc("tmp66"(#loc71)) +#loc218 = loc("tmp96"(#loc72)) +#loc219 = loc("tmp96"(#loc73)) +#loc220 = loc("tmp96"(#loc74)) +#loc221 = loc("tmp96"(#loc75)) +#loc222 = loc("tmp96"(#loc76)) +#loc223 = loc("tmp96"(#loc77)) +#loc224 = loc("tmp102"(#loc78)) +#loc225 = loc("tmp102"(#loc79)) +#loc226 = loc("tmp16"(#loc80)) +#loc227 = loc("tmp17"(#loc81)) +#loc228 = loc("tmp17"(#loc82)) +#loc229 = loc("tmp17"(#loc83)) +#loc230 = loc("tmp17"(#loc84)) +#loc231 = loc("tmp17"(#loc85)) +#loc232 = loc("tmp17"(#loc86)) +#loc233 = loc("tmp17"(#loc87)) +#loc234 = loc("tmp17"(#loc88)) +#loc235 = loc("tmp25"(#loc89)) +#loc236 = loc("tmp25"(#loc90)) +#loc237 = loc("tmp25"(#loc91)) +#loc238 = loc("tmp27"(#loc92)) +#loc239 = loc("tmp29"(#loc93)) +#loc240 = loc("tmp31"(#loc94)) +#loc241 = loc("tmp32"(#loc95)) +#loc242 = loc("tmp35"(#loc96)) +#loc243 = loc("tmp35"(#loc97)) +#loc244 = loc("tmp35"(#loc98)) +#loc245 = loc("tmp35"(#loc99)) +#loc246 = loc("tmp35"(#loc100)) +#loc247 = loc("tmp35"(#loc101)) +#loc248 = loc("tmp42"(#loc102)) +#loc249 = loc("tmp43"(#loc103)) +#loc250 = loc("tmp43"(#loc104)) +#loc251 = loc("tmp43"(#loc105)) +#loc252 = loc("tmp45"(#loc106)) +#loc253 = loc("tmp48"(#loc107)) +#loc254 = loc("tmp49"(#loc108)) +#loc255 = loc("tmp57"(#loc109)) +#loc256 = loc("tmp60"(#loc110)) +#loc257 = loc("tmp64"(#loc111)) +#loc258 = loc("tmp67"(#loc112)) +#loc259 = loc("tmp68"(#loc113)) +#loc260 = loc("tmp70"(#loc114)) +#loc261 = loc("tmp70"(#loc115)) +#loc262 = loc("tmp70"(#loc116)) +#loc263 = loc("tmp70"(#loc117)) +#loc264 = loc("tmp70"(#loc118)) +#loc265 = loc("tmp70"(#loc119)) +#loc266 = loc("tmp76"(#loc120)) +#loc267 = loc("tmp76"(#loc121)) +#loc268 = loc("tmp76"(#loc122)) +#loc269 = loc("tmp78"(#loc123)) +#loc270 = loc("tmp80"(#loc124)) +#loc271 = loc("tmp83"(#loc125)) +#loc272 = loc("tmp83"(#loc126)) +#loc273 = loc("tmp83"(#loc127)) +#loc274 = loc("tmp83"(#loc128)) +#loc275 = loc("tmp83"(#loc129)) +#loc276 = loc("tmp83"(#loc130)) +#loc277 = loc("tmp88"(#loc131)) +#loc278 = loc("tmp89"(#loc132)) +#loc279 = loc("tmp89"(#loc133)) +#loc280 = loc("tmp89"(#loc134)) +#loc281 = loc("tmp91"(#loc135)) +#loc282 = loc("tmp94"(#loc136)) +#loc283 = loc("tmp95"(#loc137)) +#loc284 = loc("tmp82"(#loc138)) +#loc285 = loc("tmp101"(#loc139)) +#loc286 = loc("tmp104"(#loc140)) +#loc287 = loc("tmp107"(#loc141)) +#loc288 = loc("tmp109"(#loc142)) +#loc289 = loc("tmp110"(#loc143)) +#loc290 = loc("_tmp10"(#loc168)) +#loc291 = loc(callsite(#loc32 at #loc185)) +#loc293 = loc(callsite(#loc32 at #loc187)) +#loc295 = loc(fused[#loc254, #loc240]) +#loc296 = loc(fused[#loc283, #loc284]) +#loc297 = loc(callsite(#loc34 at #loc291)) +#loc298 = loc(callsite(#loc34 at #loc293)) diff --git a/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..724739f9c25dc5e8b33633edf8ff03ad9b391bf3 --- /dev/null +++ b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir @@ -0,0 +1,520 @@ +#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0) +#loc1 = loc(unknown) +#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25) +#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27) +#loc149 = loc("in_out_ptr0"(#loc)) +#loc150 = loc("in_out_ptr1"(#loc)) +#loc151 = loc("in_ptr0"(#loc)) +#loc152 = loc("in_ptr1"(#loc)) +#loc153 = loc("in_ptr2"(#loc)) +#loc154 = loc("in_ptr3"(#loc)) +#loc155 = loc("in_ptr4"(#loc)) +#loc156 = loc("xnumel"(#loc)) +#loc157 = loc("r0_numel"(#loc)) +#loc189 = loc("tmp4"(#loc35)) +#loc191 = loc("tmp10"(#loc38)) +#loc296 = loc(callsite(#loc1 at #loc189)) +#loc298 = loc(callsite(#loc1 at #loc191)) +module { + tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<1x64xbf16> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<8x64xbf16> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_1 = arith.constant dense<4097> : tensor<1x64xi32> loc(#loc1) + %cst_2 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc1) + %cst_3 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc1) + %cst_5 = arith.constant dense<1> : tensor<1x64xi64> loc(#loc1) + %cst_6 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc1) + %cst_7 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc1) + %cst_8 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc1) + %cst_9 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc1) + %cst_10 = arith.constant dense<128> : tensor<1x64xi32> loc(#loc1) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc1) + %cst_12 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc158) + %xoffset_13 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc159) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc160) + %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc161) + %xindex_15 = tt.splat %xoffset_13 : i32 -> tensor<8x1xi32> loc(#loc162) + %xindex_16 = arith.addi %xindex_15, %xindex_14 : tensor<8x1xi32> loc(#loc162) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc163) + %r0_base_17 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc164) + %x0 = arith.remsi %xindex_16, %cst_12 : tensor<8x1xi32> loc(#loc165) + %x1 = arith.divsi %xindex_16, %cst_12 : tensor<8x1xi32> loc(#loc166) + %_tmp10:2 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%_tmp4 = %cst_11, %_tmp10_20 = %cst_11) -> (tensor<8x64xf32>, tensor<8x64xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc168) + %r0_index_21 = arith.addi %r0_index, %r0_base_17 : tensor<1x64xi32> loc(#loc168) + %r0_mask = arith.cmpi slt, %r0_index_21, %cst_10 : tensor<1x64xi32> loc(#loc169) + %tmp0 = arith.addi %r0_index_21, %cst_9 : tensor<1x64xi32> loc(#loc170) + %tmp0_22 = arith.muli %x0, %cst_8 : tensor<8x1xi32> loc(#loc171) + %tmp0_23 = tt.broadcast %tmp0 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc172) + %tmp0_24 = tt.broadcast %tmp0_22 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc172) + %tmp0_25 = arith.addi %tmp0_23, %tmp0_24 : tensor<8x64xi32> loc(#loc172) + %tmp0_26 = arith.muli %x1, %cst_7 : tensor<8x1xi32> loc(#loc173) + %tmp0_27 = tt.broadcast %tmp0_26 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc174) + %tmp0_28 = arith.addi %tmp0_25, %tmp0_27 : tensor<8x64xi32> loc(#loc174) + %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x64x!tt.ptr> loc(#loc175) + %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc175) + %tmp0_31 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc176) + %tmp0_32 = tt.load %tmp0_30, %tmp0_31, %cst_0 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc176) + %tmp0_33 = arith.extf %tmp0_32 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc177) + %tmp6 = tt.broadcast %r0_index_21 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc178) + %tmp6_34 = arith.addi %tmp6, %tmp0_24 : tensor<8x64xi32> loc(#loc178) + %tmp6_35 = arith.addi %tmp6_34, %tmp0_27 : tensor<8x64xi32> loc(#loc179) + %tmp6_36 = tt.addptr %tmp0_29, %tmp6_35 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc180) + %tmp6_37 = tt.load %tmp6_36, %tmp0_31, %cst_0 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc181) + %tmp6_38 = arith.extf %tmp6_37 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc182) + %tmp2 = arith.mulf %tmp0_33, %tmp0_33 : tensor<8x64xf32> loc(#loc183) + %tmp5 = arith.addf %_tmp4, %tmp2 : tensor<8x64xf32> loc(#loc184) + %_tmp4_39 = arith.select %tmp0_31, %tmp5, %_tmp4 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc185) + %tmp8 = arith.mulf %tmp6_38, %tmp6_38 : tensor<8x64xf32> loc(#loc186) + %tmp11 = arith.addf %_tmp10_20, %tmp8 : tensor<8x64xf32> loc(#loc187) + %_tmp10_40 = arith.select %tmp0_31, %tmp11, %_tmp10_20 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc188) + scf.yield %_tmp4_39, %_tmp10_40 : tensor<8x64xf32>, tensor<8x64xf32> loc(#loc33) + } loc(#loc294) + %tmp4 = "tt.reduce"(%_tmp10#0) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_20: f32 loc(callsite(#loc1 at #loc189)), %tmp4_21: f32 loc(callsite(#loc1 at #loc189))): + %tmp4_22 = arith.addf %tmp4_20, %tmp4_21 : f32 loc(#loc299) + tt.reduce.return %tmp4_22 : f32 loc(#loc295) + }) : (tensor<8x64xf32>) -> tensor<8xf32> loc(#loc295) + %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc190) + %tmp10 = "tt.reduce"(%_tmp10#1) <{axis = 1 : i32}> ({ + ^bb0(%tmp10_20: f32 loc(callsite(#loc1 at #loc191)), %tmp10_21: f32 loc(callsite(#loc1 at #loc191))): + %tmp10_22 = arith.addf %tmp10_20, %tmp10_21 : f32 loc(#loc300) + tt.reduce.return %tmp10_22 : f32 loc(#loc297) + }) : (tensor<8x64xf32>) -> tensor<8xf32> loc(#loc297) + %tmp10_19 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc192) + scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc193) + %r0_index_20 = arith.addi %r0_index, %r0_base_17 : tensor<1x64xi32> loc(#loc193) + %r0_mask = arith.cmpi slt, %r0_index_20, %cst_10 : tensor<1x64xi32> loc(#loc194) + %r0_3 = arith.remsi %r0_index_20, %cst_6 : tensor<1x64xi32> loc(#loc195) + %r0_4 = arith.divsi %r0_index_20, %cst_6 : tensor<1x64xi32> loc(#loc196) + %tmp50 = arith.muli %x0, %cst_8 : tensor<8x1xi32> loc(#loc197) + %tmp50_21 = tt.broadcast %r0_index_20 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc198) + %tmp50_22 = tt.broadcast %tmp50 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc198) + %tmp50_23 = arith.addi %tmp50_21, %tmp50_22 : tensor<8x64xi32> loc(#loc198) + %tmp50_24 = arith.muli %x1, %cst_7 : tensor<8x1xi32> loc(#loc199) + %tmp50_25 = tt.broadcast %tmp50_24 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc200) + %tmp50_26 = arith.addi %tmp50_23, %tmp50_25 : tensor<8x64xi32> loc(#loc200) + %tmp50_27 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x64x!tt.ptr> loc(#loc201) + %tmp50_28 = tt.addptr %tmp50_27, %tmp50_26 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc201) + %tmp50_29 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc202) + %tmp50_30 = tt.load %tmp50_28, %tmp50_29, %cst_0 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc202) + %tmp50_31 = arith.extf %tmp50_30 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc203) + %tmp58 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc204) + %tmp58_32 = tt.addptr %tmp58, %r0_index_20 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc204) + %tmp58_33 = tt.load %tmp58_32, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x64x!tt.ptr> loc(#loc205) + %tmp58_34 = arith.extf %tmp58_33 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc206) + %tmp63 = arith.muli %x1, %cst_8 : tensor<8x1xi32> loc(#loc207) + %tmp63_35 = tt.broadcast %tmp63 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc208) + %tmp63_36 = arith.addi %tmp50_21, %tmp63_35 : tensor<8x64xi32> loc(#loc208) + %tmp63_37 = tt.splat %in_ptr2 : !tt.ptr -> tensor<8x64x!tt.ptr> loc(#loc209) + %tmp63_38 = tt.addptr %tmp63_37, %tmp63_36 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc209) + %tmp63_39 = tt.load %tmp63_38, %tmp50_29, %cst_11 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc210) + %tmp66 = tt.splat %in_ptr3 : !tt.ptr -> tensor<8x64x!tt.ptr> loc(#loc211) + %tmp66_40 = tt.addptr %tmp66, %tmp63_36 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc211) + %tmp66_41 = tt.load %tmp66_40, %tmp50_29, %cst_11 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc212) + %tmp96 = arith.addi %r0_index_20, %cst_9 : tensor<1x64xi32> loc(#loc213) + %tmp96_42 = tt.broadcast %tmp96 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc214) + %tmp96_43 = arith.addi %tmp96_42, %tmp50_22 : tensor<8x64xi32> loc(#loc214) + %tmp96_44 = arith.addi %tmp96_43, %tmp50_25 : tensor<8x64xi32> loc(#loc215) + %tmp96_45 = tt.addptr %tmp50_27, %tmp96_44 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc216) + %tmp96_46 = tt.load %tmp96_45, %tmp50_29, %cst_0 evictionPolicy = evict_first : tensor<8x64x!tt.ptr> loc(#loc217) + %tmp96_47 = arith.extf %tmp96_46 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc218) + %tmp102 = tt.splat %in_ptr4 : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc219) + %tmp102_48 = tt.addptr %tmp102, %r0_index_20 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc219) + %tmp102_49 = tt.load %tmp102_48, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x64x!tt.ptr> loc(#loc220) + %tmp102_50 = arith.extf %tmp102_49 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc221) + %tmp16 = arith.extsi %r0_3 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc222) + %tmp16_51 = arith.cmpi slt, %tmp16, %cst_5 : tensor<1x64xi64> loc(#loc222) + %tmp17 = arith.muli %r0_4, %cst_6 : tensor<1x64xi32> loc(#loc223) + %tmp17_52 = arith.addi %tmp17, %cst_4 : tensor<1x64xi32> loc(#loc224) + %tmp17_53 = tt.broadcast %tmp17_52 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc225) + %tmp17_54 = arith.addi %tmp17_53, %tmp50_22 : tensor<8x64xi32> loc(#loc225) + %tmp17_55 = arith.addi %tmp17_54, %tmp50_25 : tensor<8x64xi32> loc(#loc226) + %tmp17_56 = tt.addptr %tmp50_27, %tmp17_55 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc227) + %tmp17_57 = arith.andi %r0_mask, %tmp16_51 : tensor<1x64xi1> loc(#loc228) + %tmp17_58 = tt.broadcast %tmp17_57 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc229) + %tmp17_59 = tt.load %tmp17_56, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc229) + %tmp17_60 = arith.extf %tmp17_59 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc230) + %tmp20 = arith.divf %tmp10_19, %cst_3 : tensor<8x1xf32> loc(#loc231) + %tmp22 = arith.addf %tmp20, %cst_2 : tensor<8x1xf32> loc(#loc232) + %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc233) + %tmp24 = tt.broadcast %tmp23 : tensor<8x1xf32> -> tensor<8x64xf32> loc(#loc234) + %tmp24_61 = arith.mulf %tmp17_60, %tmp24 : tensor<8x64xf32> loc(#loc234) + %tmp25 = tt.addptr %tmp58, %tmp17_52 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc235) + %tmp25_62 = tt.broadcast %tmp25 : tensor<1x64x!tt.ptr> -> tensor<8x64x!tt.ptr> loc(#loc235) + %tmp25_63 = tt.load %tmp25_62, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc236) + %tmp25_64 = arith.extf %tmp25_63 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc237) + %tmp27 = arith.mulf %tmp24_61, %tmp25_64 : tensor<8x64xf32> loc(#loc238) + %tmp29 = arith.subf %cst_11, %tmp27 : tensor<8x64xf32> loc(#loc239) + %tmp31 = tt.broadcast %tmp16_51 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc240) + %tmp31_65 = arith.select %tmp31, %tmp29, %cst_11 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc240) + %tmp32 = arith.cmpi sge, %tmp16, %cst_5 : tensor<1x64xi64> loc(#loc241) + %tmp35 = tt.broadcast %tmp17 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc242) + %tmp35_66 = arith.addi %tmp35, %tmp50_22 : tensor<8x64xi32> loc(#loc242) + %tmp35_67 = arith.addi %tmp35_66, %tmp50_25 : tensor<8x64xi32> loc(#loc243) + %tmp35_68 = tt.addptr %tmp50_27, %tmp35_67 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc244) + %tmp35_69 = arith.andi %r0_mask, %tmp32 : tensor<1x64xi1> loc(#loc245) + %tmp35_70 = tt.broadcast %tmp35_69 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc246) + %tmp35_71 = tt.load %tmp35_68, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc246) + %tmp35_72 = arith.extf %tmp35_71 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc247) + %tmp42 = arith.mulf %tmp35_72, %tmp24 : tensor<8x64xf32> loc(#loc248) + %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc249) + %tmp43_73 = tt.broadcast %tmp43 : tensor<1x64x!tt.ptr> -> tensor<8x64x!tt.ptr> loc(#loc249) + %tmp43_74 = tt.load %tmp43_73, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc250) + %tmp43_75 = arith.extf %tmp43_74 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc251) + %tmp45 = arith.mulf %tmp42, %tmp43_75 : tensor<8x64xf32> loc(#loc252) + %tmp48 = tt.broadcast %tmp32 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc253) + %tmp48_76 = arith.select %tmp48, %tmp45, %cst_11 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc253) + %tmp49 = arith.select %tmp31, %tmp31_65, %tmp48_76 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc254) + %tmp57 = arith.mulf %tmp50_31, %tmp24 : tensor<8x64xf32> loc(#loc255) + %tmp60 = tt.broadcast %tmp58_34 : tensor<1x64xf32> -> tensor<8x64xf32> loc(#loc256) + %tmp60_77 = arith.mulf %tmp57, %tmp60 : tensor<8x64xf32> loc(#loc256) + %tmp64 = arith.mulf %tmp60_77, %tmp63_39 : tensor<8x64xf32> loc(#loc257) + %tmp67 = arith.mulf %tmp49, %tmp66_41 : tensor<8x64xf32> loc(#loc258) + %tmp68 = arith.addf %tmp64, %tmp67 : tensor<8x64xf32> loc(#loc259) + %tmp70 = arith.addi %tmp17, %cst_1 : tensor<1x64xi32> loc(#loc260) + %tmp70_78 = tt.broadcast %tmp70 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc261) + %tmp70_79 = arith.addi %tmp70_78, %tmp50_22 : tensor<8x64xi32> loc(#loc261) + %tmp70_80 = arith.addi %tmp70_79, %tmp50_25 : tensor<8x64xi32> loc(#loc262) + %tmp70_81 = tt.addptr %tmp50_27, %tmp70_80 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc263) + %tmp70_82 = tt.load %tmp70_81, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc264) + %tmp70_83 = arith.extf %tmp70_82 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc265) + %tmp72 = arith.divf %tmp4_18, %cst_3 : tensor<8x1xf32> loc(#loc266) + %tmp73 = arith.addf %tmp72, %cst_2 : tensor<8x1xf32> loc(#loc267) + %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc268) + %tmp75 = tt.broadcast %tmp74 : tensor<8x1xf32> -> tensor<8x64xf32> loc(#loc269) + %tmp75_84 = arith.mulf %tmp70_83, %tmp75 : tensor<8x64xf32> loc(#loc269) + %tmp76 = tt.addptr %tmp102, %tmp17_52 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc270) + %tmp76_85 = tt.broadcast %tmp76 : tensor<1x64x!tt.ptr> -> tensor<8x64x!tt.ptr> loc(#loc270) + %tmp76_86 = tt.load %tmp76_85, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc271) + %tmp76_87 = arith.extf %tmp76_86 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc272) + %tmp78 = arith.mulf %tmp75_84, %tmp76_87 : tensor<8x64xf32> loc(#loc273) + %tmp80 = arith.subf %cst_11, %tmp78 : tensor<8x64xf32> loc(#loc274) + %tmp82 = arith.select %tmp31, %tmp80, %cst_11 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc275) + %tmp83 = arith.addi %tmp17, %cst_9 : tensor<1x64xi32> loc(#loc276) + %tmp83_88 = tt.broadcast %tmp83 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc277) + %tmp83_89 = arith.addi %tmp83_88, %tmp50_22 : tensor<8x64xi32> loc(#loc277) + %tmp83_90 = arith.addi %tmp83_89, %tmp50_25 : tensor<8x64xi32> loc(#loc278) + %tmp83_91 = tt.addptr %tmp50_27, %tmp83_90 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc279) + %tmp83_92 = tt.load %tmp83_91, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc280) + %tmp83_93 = arith.extf %tmp83_92 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc281) + %tmp88 = arith.mulf %tmp83_93, %tmp75 : tensor<8x64xf32> loc(#loc282) + %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc283) + %tmp89_94 = tt.broadcast %tmp89 : tensor<1x64x!tt.ptr> -> tensor<8x64x!tt.ptr> loc(#loc283) + %tmp89_95 = tt.load %tmp89_94, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<8x64x!tt.ptr> loc(#loc284) + %tmp89_96 = arith.extf %tmp89_95 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc285) + %tmp91 = arith.mulf %tmp88, %tmp89_96 : tensor<8x64xf32> loc(#loc286) + %tmp94 = arith.select %tmp48, %tmp91, %cst_11 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc287) + %tmp95 = arith.select %tmp31, %tmp82, %tmp94 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc288) + %tmp101 = arith.mulf %tmp96_47, %tmp75 : tensor<8x64xf32> loc(#loc289) + %tmp104 = tt.broadcast %tmp102_50 : tensor<1x64xf32> -> tensor<8x64xf32> loc(#loc290) + %tmp104_97 = arith.mulf %tmp101, %tmp104 : tensor<8x64xf32> loc(#loc290) + %tmp107 = arith.mulf %tmp104_97, %tmp63_39 : tensor<8x64xf32> loc(#loc291) + %tmp109 = arith.mulf %tmp95, %tmp66_41 : tensor<8x64xf32> loc(#loc292) + %tmp110 = arith.addf %tmp107, %tmp109 : tensor<8x64xf32> loc(#loc293) + %0 = arith.muli %xindex_16, %cst_8 : tensor<8x1xi32> loc(#loc142) + %1 = tt.broadcast %0 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc143) + %2 = arith.addi %tmp50_21, %1 : tensor<8x64xi32> loc(#loc143) + %3 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<8x64x!tt.ptr> loc(#loc144) + %4 = tt.addptr %3, %2 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc144) + %5 = arith.truncf %tmp68 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc145) + tt.store %4, %5, %tmp50_29 : tensor<8x64x!tt.ptr> loc(#loc145) + %6 = tt.splat %in_out_ptr1 : !tt.ptr -> tensor<8x64x!tt.ptr> loc(#loc146) + %7 = tt.addptr %6, %2 : tensor<8x64x!tt.ptr>, tensor<8x64xi32> loc(#loc146) + %8 = arith.truncf %tmp110 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc147) + tt.store %7, %8, %tmp50_29 : tensor<8x64x!tt.ptr> loc(#loc147) + } loc(#loc40) + tt.return loc(#loc148) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28) +#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33) +#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36) +#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44) +#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23) +#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27) +#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37) +#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19) +#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19) +#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43) +#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31) +#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29) +#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41) +#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52) +#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48) +#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63) +#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57) +#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34) +#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68) +#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121) +#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41) +#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50) +#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34) +#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61) +#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114) +#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22) +#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23) +#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40) +#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22) +#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25) +#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42) +#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8) +#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36) +#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15) +#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28) +#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30) +#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43) +#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31) +#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29) +#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27) +#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27) +#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46) +#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42) +#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57) +#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51) +#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35) +#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62) +#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115) +#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35) +#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42) +#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95) +#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46) +#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42) +#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35) +#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51) +#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35) +#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51) +#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42) +#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49) +#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58) +#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35) +#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69) +#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123) +#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36) +#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43) +#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96) +#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24) +#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41) +#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39) +#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48) +#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57) +#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35) +#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78) +#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68) +#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129) +#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25) +#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24) +#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32) +#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24) +#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35) +#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85) +#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146) +#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24) +#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17) +#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39) +#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25) +#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44) +#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53) +#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35) +#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74) +#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64) +#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125) +#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24) +#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35) +#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81) +#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142) +#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24) +#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39) +#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39) +#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24) +#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24) +#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24) +#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24) +#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24) +#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42) +#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51) +#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60) +#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35) +#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71) +#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132) +#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24) +#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24) +#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32) +#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24) +#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35) +#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85) +#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146) +#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24) +#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17) +#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39) +#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42) +#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51) +#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60) +#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35) +#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71) +#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132) +#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24) +#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35) +#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81) +#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142) +#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24) +#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39) +#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39) +#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25) +#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26) +#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26) +#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26) +#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26) +#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43) +#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39) +#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32) +#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55) +#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32) +#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56) +#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4) +#loc158 = loc("xoffset"(#loc2)) +#loc159 = loc("xoffset"(#loc3)) +#loc160 = loc("xindex"(#loc4)) +#loc161 = loc("xindex"(#loc5)) +#loc162 = loc("xindex"(#loc6)) +#loc163 = loc("r0_base"(#loc7)) +#loc164 = loc("r0_base"(#loc8)) +#loc165 = loc("x0"(#loc9)) +#loc166 = loc("x1"(#loc10)) +#loc167 = loc("_tmp4"(#loc11)) +#loc168 = loc("r0_index"(#loc12)) +#loc169 = loc("r0_mask"(#loc13)) +#loc170 = loc("tmp0"(#loc14)) +#loc171 = loc("tmp0"(#loc15)) +#loc172 = loc("tmp0"(#loc16)) +#loc173 = loc("tmp0"(#loc17)) +#loc174 = loc("tmp0"(#loc18)) +#loc175 = loc("tmp0"(#loc19)) +#loc176 = loc("tmp0"(#loc20)) +#loc177 = loc("tmp0"(#loc21)) +#loc178 = loc("tmp6"(#loc22)) +#loc179 = loc("tmp6"(#loc23)) +#loc180 = loc("tmp6"(#loc24)) +#loc181 = loc("tmp6"(#loc25)) +#loc182 = loc("tmp6"(#loc26)) +#loc183 = loc("tmp2"(#loc27)) +#loc184 = loc("tmp5"(#loc28)) +#loc185 = loc("_tmp4"(#loc29)) +#loc186 = loc("tmp8"(#loc30)) +#loc187 = loc("tmp11"(#loc31)) +#loc188 = loc("_tmp10"(#loc32)) +#loc190 = loc("tmp4"(#loc37)) +#loc192 = loc("tmp10"(#loc39)) +#loc193 = loc("r0_index"(#loc41)) +#loc194 = loc("r0_mask"(#loc42)) +#loc195 = loc("r0_3"(#loc43)) +#loc196 = loc("r0_4"(#loc44)) +#loc197 = loc("tmp50"(#loc45)) +#loc198 = loc("tmp50"(#loc46)) +#loc199 = loc("tmp50"(#loc47)) +#loc200 = loc("tmp50"(#loc48)) +#loc201 = loc("tmp50"(#loc49)) +#loc202 = loc("tmp50"(#loc50)) +#loc203 = loc("tmp50"(#loc51)) +#loc204 = loc("tmp58"(#loc52)) +#loc205 = loc("tmp58"(#loc53)) +#loc206 = loc("tmp58"(#loc54)) +#loc207 = loc("tmp63"(#loc55)) +#loc208 = loc("tmp63"(#loc56)) +#loc209 = loc("tmp63"(#loc57)) +#loc210 = loc("tmp63"(#loc58)) +#loc211 = loc("tmp66"(#loc59)) +#loc212 = loc("tmp66"(#loc60)) +#loc213 = loc("tmp96"(#loc61)) +#loc214 = loc("tmp96"(#loc62)) +#loc215 = loc("tmp96"(#loc63)) +#loc216 = loc("tmp96"(#loc64)) +#loc217 = loc("tmp96"(#loc65)) +#loc218 = loc("tmp96"(#loc66)) +#loc219 = loc("tmp102"(#loc67)) +#loc220 = loc("tmp102"(#loc68)) +#loc221 = loc("tmp102"(#loc69)) +#loc222 = loc("tmp16"(#loc70)) +#loc223 = loc("tmp17"(#loc71)) +#loc224 = loc("tmp17"(#loc72)) +#loc225 = loc("tmp17"(#loc73)) +#loc226 = loc("tmp17"(#loc74)) +#loc227 = loc("tmp17"(#loc75)) +#loc228 = loc("tmp17"(#loc76)) +#loc229 = loc("tmp17"(#loc77)) +#loc230 = loc("tmp17"(#loc78)) +#loc231 = loc("tmp20"(#loc79)) +#loc232 = loc("tmp22"(#loc80)) +#loc233 = loc("tmp23"(#loc81)) +#loc234 = loc("tmp24"(#loc82)) +#loc235 = loc("tmp25"(#loc83)) +#loc236 = loc("tmp25"(#loc84)) +#loc237 = loc("tmp25"(#loc85)) +#loc238 = loc("tmp27"(#loc86)) +#loc239 = loc("tmp29"(#loc87)) +#loc240 = loc("tmp31"(#loc88)) +#loc241 = loc("tmp32"(#loc89)) +#loc242 = loc("tmp35"(#loc90)) +#loc243 = loc("tmp35"(#loc91)) +#loc244 = loc("tmp35"(#loc92)) +#loc245 = loc("tmp35"(#loc93)) +#loc246 = loc("tmp35"(#loc94)) +#loc247 = loc("tmp35"(#loc95)) +#loc248 = loc("tmp42"(#loc96)) +#loc249 = loc("tmp43"(#loc97)) +#loc250 = loc("tmp43"(#loc98)) +#loc251 = loc("tmp43"(#loc99)) +#loc252 = loc("tmp45"(#loc100)) +#loc253 = loc("tmp48"(#loc101)) +#loc254 = loc("tmp49"(#loc102)) +#loc255 = loc("tmp57"(#loc103)) +#loc256 = loc("tmp60"(#loc104)) +#loc257 = loc("tmp64"(#loc105)) +#loc258 = loc("tmp67"(#loc106)) +#loc259 = loc("tmp68"(#loc107)) +#loc260 = loc("tmp70"(#loc108)) +#loc261 = loc("tmp70"(#loc109)) +#loc262 = loc("tmp70"(#loc110)) +#loc263 = loc("tmp70"(#loc111)) +#loc264 = loc("tmp70"(#loc112)) +#loc265 = loc("tmp70"(#loc113)) +#loc266 = loc("tmp72"(#loc114)) +#loc267 = loc("tmp73"(#loc115)) +#loc268 = loc("tmp74"(#loc116)) +#loc269 = loc("tmp75"(#loc117)) +#loc270 = loc("tmp76"(#loc118)) +#loc271 = loc("tmp76"(#loc119)) +#loc272 = loc("tmp76"(#loc120)) +#loc273 = loc("tmp78"(#loc121)) +#loc274 = loc("tmp80"(#loc122)) +#loc275 = loc("tmp82"(#loc123)) +#loc276 = loc("tmp83"(#loc124)) +#loc277 = loc("tmp83"(#loc125)) +#loc278 = loc("tmp83"(#loc126)) +#loc279 = loc("tmp83"(#loc127)) +#loc280 = loc("tmp83"(#loc128)) +#loc281 = loc("tmp83"(#loc129)) +#loc282 = loc("tmp88"(#loc130)) +#loc283 = loc("tmp89"(#loc131)) +#loc284 = loc("tmp89"(#loc132)) +#loc285 = loc("tmp89"(#loc133)) +#loc286 = loc("tmp91"(#loc134)) +#loc287 = loc("tmp94"(#loc135)) +#loc288 = loc("tmp95"(#loc136)) +#loc289 = loc("tmp101"(#loc137)) +#loc290 = loc("tmp104"(#loc138)) +#loc291 = loc("tmp107"(#loc139)) +#loc292 = loc("tmp109"(#loc140)) +#loc293 = loc("tmp110"(#loc141)) +#loc294 = loc("_tmp10"(#loc167)) +#loc295 = loc(callsite(#loc34 at #loc189)) +#loc297 = loc(callsite(#loc34 at #loc191)) +#loc299 = loc(callsite(#loc36 at #loc295)) +#loc300 = loc(callsite(#loc36 at #loc297))